From e60b53f93b0c4df3f910a535f9e09b359c98092d Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Fri, 6 Mar 2026 02:36:01 -0600 Subject: [PATCH 001/143] Revert "NVIDIA: VR: SAUCE: cxl: add support for cxl reset" This reverts commit f198764ea997285f369d115202a577f6dee55b0a. The CXL reset implementation is being reverted to allow "NVIDIA: VR: SAUCE: CXL/PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h" to apply cleanly. The reset functionality will be replaced by the version currently being pursued upstream. Signed-off-by: Jiandi An --- drivers/cxl/cxlpci.h | 40 ++++-------- drivers/pci/pci.c | 147 ------------------------------------------- include/linux/pci.h | 2 +- 3 files changed, 14 insertions(+), 175 deletions(-) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 67ad5b007498e..54e219b0049ea 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -16,33 +16,19 @@ /* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ #define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_CACHE_CAPABLE BIT(0) -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) -#define CXL_DVSEC_CACHE_WBI_CAPABLE BIT(6) -#define CXL_DVSEC_CXL_RST_CAPABLE BIT(7) -#define CXL_DVSEC_CXL_RST_TIMEOUT_MASK GENMASK(10, 8) -#define CXL_DVSEC_CXL_RST_MEM_CLR_CAPABLE BIT(11) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE BIT(2) -#define CXL_DVSEC_CTRL2_OFFSET 0x10 -#define CXL_DVSEC_DISABLE_CACHING BIT(0) -#define CXL_DVSEC_INIT_CACHE_WBI BIT(1) -#define CXL_DVSEC_INIT_CXL_RESET BIT(2) -#define CXL_DVSEC_CXL_RST_MEM_CLR_ENABLE BIT(3) -#define CXL_DVSEC_STATUS2_OFFSET 0x12 -#define CXL_DVSEC_CACHE_INVALID BIT(0) -#define CXL_DVSEC_CXL_RST_COMPLETE BIT(1) -#define CXL_DVSEC_CXL_RESET_ERR BIT(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + ((i) * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + ((i) * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID BIT(0) -#define CXL_DVSEC_MEM_ACTIVE BIT(1) -#define 
CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + ((i) * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + ((i) * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE BIT(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID BIT(0) +#define CXL_DVSEC_MEM_ACTIVE BIT(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) #define CXL_DVSEC_RANGE_MAX 2 diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 9a6943688e6db..372de7961d2a6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -30,7 +30,6 @@ #include #include #include -#include "../cxl/cxlpci.h" #include "pci.h" DEFINE_MUTEX(pci_slot_mutex); @@ -5134,151 +5133,6 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) return rc; } -static int cxl_reset_prepare(struct pci_dev *dev, u16 dvsec) -{ - u32 timeout_us = 100, timeout_tot_us = 10000; - u16 reg, cap; - int rc; - - if (!pci_wait_for_pending_transaction(dev)) - pci_err(dev, "timed out waiting for pending transaction; performing cxl reset anyway\n"); - - /* Check if the device is cache capable. */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, &cap); - if (rc) - return rc; - - if (!(cap & CXL_DVSEC_CACHE_CAPABLE)) - return 0; - - /* Disable cache. WB and invalidate cache if capability is advertised */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, &reg); - if (rc) - return rc; - reg |= CXL_DVSEC_DISABLE_CACHING; - /* - * DEVCTL2 bits are written only once.
So check WB+I capability while - * keeping disable caching set. - */ - if (cap & CXL_DVSEC_CACHE_WBI_CAPABLE) - reg |= CXL_DVSEC_INIT_CACHE_WBI; - pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); - - /* - * From Section 9.6: "Software may leverage the cache size reported in - * the DVSEC CXL Capability2 register to compute a suitable timeout - * value". - * Given there is no conversion factor for cache size -> timeout, - * setting timer for default 10ms. - */ - do { - if (timeout_tot_us == 0) - return -ETIMEDOUT; - usleep_range(timeout_us, timeout_us + 1); - timeout_tot_us -= timeout_us; - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, - &reg); - if (rc) - return rc; - } while (!(reg & CXL_DVSEC_CACHE_INVALID)); - - return 0; -} - -static int cxl_reset_init(struct pci_dev *dev, u16 dvsec) -{ - /* - * Timeout values ref CXL Spec v3.2 Ch 8 Control and Status Registers, - * under section 8.1.3.1 DVSEC CXL Capability. - */ - u32 reset_timeouts_ms[] = { 10, 100, 1000, 10000, 100000 }; - u16 reg; - u32 timeout_ms; - int rc, ind; - - /* Check if CXL Reset MEM CLR is supported. */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, &reg); - if (rc) - return rc; - - if (reg & CXL_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, - &reg); - if (rc) - return rc; - - reg |= CXL_DVSEC_CXL_RST_MEM_CLR_ENABLE; - pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); - } - - /* Read timeout value. */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, &reg); - if (rc) - return rc; - ind = FIELD_GET(CXL_DVSEC_CXL_RST_TIMEOUT_MASK, reg); - timeout_ms = reset_timeouts_ms[ind]; - - /* Write reset config. */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, &reg); - if (rc) - return rc; - - reg |= CXL_DVSEC_INIT_CXL_RESET; - pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); - - /* Wait till timeout and then check reset status is complete.
*/ - msleep(timeout_ms); - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_STATUS2_OFFSET, &reg); - if (rc) - return rc; - if (reg & CXL_DVSEC_CXL_RESET_ERR || - ~reg & CXL_DVSEC_CXL_RST_COMPLETE) - return -ETIMEDOUT; - - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, &reg); - if (rc) - return rc; - reg &= (~CXL_DVSEC_DISABLE_CACHING); - pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); - - return 0; -} - -/** - * cxl_reset - initiate a cxl reset - * @dev: device to reset - * @probe: if true, return 0 if device can be reset this way - * - * Initiate a cxl reset on @dev. - */ -static int cxl_reset(struct pci_dev *dev, bool probe) -{ - u16 dvsec, reg; - int rc; - - dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_PCIE_DEVICE); - if (!dvsec) - return -ENOTTY; - - /* Check if CXL Reset is supported. */ - rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, &reg); - if (rc) - return -ENOTTY; - - if ((reg & CXL_DVSEC_CXL_RST_CAPABLE) == 0) - return -ENOTTY; - - if (probe) - return 0; - - rc = cxl_reset_prepare(dev, dvsec); - if (rc) - return rc; - - return cxl_reset_init(dev, dvsec); -} - void pci_dev_lock(struct pci_dev *dev) { /* block PM suspend, driver probe, etc.
*/ @@ -5365,7 +5219,6 @@ const struct pci_reset_fn_method pci_reset_fn_methods[] = { { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, - { cxl_reset, .name = "cxl_reset" }, { pci_pm_reset, .name = "pm" }, { pci_reset_bus_function, .name = "bus" }, { cxl_reset_bus_function, .name = "cxl_bus" }, diff --git a/include/linux/pci.h b/include/linux/pci.h index a5837cd74faad..1bdfd152eb1f8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -51,7 +51,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 9 +#define PCI_NUM_RESET_METHODS 8 #define PCI_RESET_PROBE true #define PCI_RESET_DO_RESET false From 96cada1a358236846eb36504d786303787df59a4 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Mon, 11 Aug 2025 20:25:19 +0800 Subject: [PATCH 002/143] cxl/hdm: Use str_plural() to simplify the code Use the string choice helper function str_plural() to simplify the code. Signed-off-by: Xichao Zhao Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20250811122519.543554-1-zhao.xichao@vivo.com Signed-off-by: Dave Jiang (cherry picked from commit 22fb4ad898853323f4943de3e0dc555915547ccc) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index e930191057c04..777b8ac0c49c1 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -197,7 +197,7 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, */ if (should_emulate_decoders(info)) { dev_dbg(dev, "Fallback map %d range register%s\n", info->ranges, - info->ranges > 1 ? 
"s" : ""); + str_plural(info->ranges)); cxlhdm->decoder_count = info->ranges; } From 72491e4fb4c57977379ca27fb2b51f3a067a7257 Mon Sep 17 00:00:00 2001 From: Nai-Chen Cheng Date: Tue, 12 Aug 2025 00:49:46 +0800 Subject: [PATCH 003/143] cxl/region: use str_enabled_disabled() instead of ternary operator Replace ternary operator with str_enabled_disabled() helper to enhance code readability and consistency. [dj: Fix spelling in commit log and subject. ] Signed-off-by: Nai-Chen Cheng Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250812-cxl-region-string-choices-v1-1-50200b0bc782@gmail.com Signed-off-by: Dave Jiang (cherry picked from commit 733c4e9bcec9c481afee3891218277d9ecd06599) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index adebbb1db5078..5c581b175013c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -1468,9 +1469,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, dev_name(port->uport_dev), dev_name(&port->dev), __func__, cxld->interleave_ways, cxld->interleave_granularity, - (cxld->flags & CXL_DECODER_F_ENABLE) ? - "enabled" : - "disabled", + str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE), cxld->hpa_range.start, cxld->hpa_range.end); return -ENXIO; } From bcb76b20b4b26516cd3b90087687578a731950e3 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 4 Aug 2025 01:00:09 -0700 Subject: [PATCH 004/143] cxl: Move hpa_to_spa callback to a new root decoder ops structure The root decoder's HPA to SPA translation logic was implemented using a single function pointer. In preparation for additional per-decoder callbacks, convert this into a struct cxl_rd_ops and move the hpa_to_spa pointer into it. 
To avoid maintaining a static ops instance populated with mostly NULL pointers, allocate the ops structure dynamically only when a platform requires overrides (e.g. XOR interleave decoding). The setup can be extended as additional callbacks are added. Co-developed-by: Dave Jiang Signed-off-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/818530c82c351a9c0d3a204f593068dd2126a5a9.1754290144.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 524b2b76f365fb90a7f894ac17261ea760464e2c) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 10 +++++++--- drivers/cxl/core/port.c | 1 + drivers/cxl/core/region.c | 11 ++++++++--- drivers/cxl/cxl.h | 12 +++++++++--- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 87f0ed3f3f51f..de5f08122aa92 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -20,7 +20,6 @@ static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); - static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) { struct cxl_cxims_data *cximsd = cxlrd->platform_data; @@ -472,8 +471,13 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->qos_class = cfmws->qtg_id; - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) - cxlrd->hpa_to_spa = cxl_xor_hpa_to_spa; + if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { + cxlrd->ops = kzalloc(sizeof(*cxlrd->ops), GFP_KERNEL); + if (!cxlrd->ops) + return -ENOMEM; + + cxlrd->ops->hpa_to_spa = cxl_xor_hpa_to_spa; + } rc = cxl_decoder_add(cxld, target_map); if (rc) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 29197376b18e3..8f36ff413f5d5 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -450,6 +450,7 @@ static void cxl_root_decoder_release(struct device *dev) if (atomic_read(&cxlrd->region_id) >= 0) 
memregion_free(atomic_read(&cxlrd->region_id)); __cxl_decoder_release(&cxlrd->cxlsd.cxld); + kfree(cxlrd->ops); kfree(cxlrd); } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 5c581b175013c..ef1f69ba8899d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2917,6 +2917,11 @@ static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) return false; } +static bool has_hpa_to_spa(struct cxl_root_decoder *cxlrd) +{ + return cxlrd->ops && cxlrd->ops->hpa_to_spa; +} + u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { @@ -2971,8 +2976,8 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, hpa = hpa_offset + p->res->start + p->cache_size; /* Root decoder translation overrides typical modulo decode */ - if (cxlrd->hpa_to_spa) - hpa = cxlrd->hpa_to_spa(cxlrd, hpa); + if (has_hpa_to_spa(cxlrd)) + hpa = cxlrd->ops->hpa_to_spa(cxlrd, hpa); if (!cxl_resource_contains_addr(p->res, hpa)) { dev_dbg(&cxlr->dev, @@ -2981,7 +2986,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, } /* Simple chunk check, by pos & gran, only applies to modulo decodes */ - if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) + if (!has_hpa_to_spa(cxlrd) && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) return ULLONG_MAX; return hpa; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 847e37be42c47..4b247ab188833 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -419,27 +419,33 @@ struct cxl_switch_decoder { }; struct cxl_root_decoder; -typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); +/** + * struct cxl_rd_ops - CXL root decoder callback operations + * @hpa_to_spa: Convert host physical address to system physical address + */ +struct cxl_rd_ops { + u64 (*hpa_to_spa)(struct cxl_root_decoder *cxlrd, u64 hpa); +}; /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent 
resource for region allocations * @cache_size: extended linear cache size if exists, otherwise zero. * @region_id: region id for next region provisioning event - * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range * @qos_class: QoS performance class cookie + * @ops: CXL root decoder operations * @cxlsd: base cxl switch decoder */ struct cxl_root_decoder { struct resource *res; resource_size_t cache_size; atomic_t region_id; - cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; int qos_class; + struct cxl_rd_ops *ops; struct cxl_switch_decoder cxlsd; }; From 9adb9f38bc2e1952346c75c12a58db3eda0d3e39 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 4 Aug 2025 01:00:10 -0700 Subject: [PATCH 005/143] cxl: Define a SPA->CXL HPA root decoder callback for XOR Math When DPA->SPA translation was introduced, it included a helper that applied the XOR maps to do the CXL HPA -> SPA translation for XOR region interleaves. In preparation for adding SPA->DPA address translation, introduce the reverse callback. The root decoder callback is defined generically and not all usages may be self inverting like this XOR function. Add another root decoder callback that is the spa_to_hpa function. Update the existing cxl_xor_hpa_to_spa() with a name that reflects what it does without directionality: cxl_apply_xor_maps(), a generic parameter: addr replaces hpa, and code comments stating that the function supports the translation in either direction. 
Signed-off-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/79d9d72230c599cae94d7221781ead6392ae6d3f.1754290144.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit b83ee9614a3ec196111f0ae54335b99700f78b45) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 27 ++++++++++++++++----------- drivers/cxl/cxl.h | 2 ++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index de5f08122aa92..b9ba1c33e4d24 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -20,7 +20,7 @@ static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) +static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) { struct cxl_cxims_data *cximsd = cxlrd->platform_data; int hbiw = cxlrd->cxlsd.nr_targets; @@ -29,19 +29,23 @@ static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) /* No xormaps for host bridge interleave ways of 1 or 3 */ if (hbiw == 1 || hbiw == 3) - return hpa; + return addr; /* - * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) restore - * the position bit to its value before the xormap was applied at - * HPA->DPA translation. + * In regions using XOR interleave arithmetic the CXL HPA may not + * be the same as the SPA. This helper performs the SPA->CXL HPA + * or the CXL HPA->SPA translation. Since XOR is self-inverting, + * so is this function. + * + * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) applying the + * xormaps will toggle a position bit. * * pos is the lowest set bit in an XORMAP - * val is the XORALLBITS(HPA & XORMAP) + * val is the XORALLBITS(addr & XORMAP) * * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS * as an operation that outputs a single bit by XORing all the - * bits in the input (hpa & xormap). 
Implement XORALLBITS using + * bits in the input (addr & xormap). Implement XORALLBITS using * hweight64(). If the hamming weight is even the XOR of those * bits results in val==0, if odd the XOR result is val==1. */ @@ -50,11 +54,11 @@ static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) if (!cximsd->xormaps[i]) continue; pos = __ffs(cximsd->xormaps[i]); - val = (hweight64(hpa & cximsd->xormaps[i]) & 1); - hpa = (hpa & ~(1ULL << pos)) | (val << pos); + val = (hweight64(addr & cximsd->xormaps[i]) & 1); + addr = (addr & ~(1ULL << pos)) | (val << pos); } - return hpa; + return addr; } struct cxl_cxims_context { @@ -476,7 +480,8 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, if (!cxlrd->ops) return -ENOMEM; - cxlrd->ops->hpa_to_spa = cxl_xor_hpa_to_spa; + cxlrd->ops->hpa_to_spa = cxl_apply_xor_maps; + cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps; } rc = cxl_decoder_add(cxld, target_map); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 4b247ab188833..4fe3df06f57a3 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -422,9 +422,11 @@ struct cxl_root_decoder; /** * struct cxl_rd_ops - CXL root decoder callback operations * @hpa_to_spa: Convert host physical address to system physical address + * @spa_to_hpa: Convert system physical address to host physical address */ struct cxl_rd_ops { u64 (*hpa_to_spa)(struct cxl_root_decoder *cxlrd, u64 hpa); + u64 (*spa_to_hpa)(struct cxl_root_decoder *cxlrd, u64 spa); }; /** From 81a3bda34d786fc94199f9cdf2c742e6df096e9f Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 4 Aug 2025 01:00:11 -0700 Subject: [PATCH 006/143] cxl/region: Introduce SPA to DPA address translation Add infrastructure to translate System Physical Addresses (SPA) to Device Physical Addresses (DPA) within CXL regions. This capability will be used by follow-on patches that add poison inject and clear operations at the region level. The SPA-to-DPA translation process follows these steps: 1. 
Apply root decoder transformations (SPA to HPA) if configured. 2. Extract the position in region interleave from the HPA offset. 3. Extract the DPA offset from the HPA offset. 4. Use position to find endpoint decoder. 5. Use endpoint decoder to find memdev and calculate DPA from offset. 6. Return the result - a memdev and a DPA. It is Step 1 above that makes this a driver level operation and not work we can push to user space. Rather than exporting the XOR maps for root decoders configured with XOR interleave, the driver performs this complex calculation for the user. Steps 2 and 3 follow the CXL Spec 3.2 Section 8.2.4.20.13 Implementation Note: Device Decode Logic. These calculations mirror much of the logic introduced earlier in DPA to SPA translation, see cxl_dpa_to_hpa(), where the driver needed to reverse the spec defined 'Device Decode Logic'. Signed-off-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/422f0e27742c6ca9a11f7cd83e6ba9fa1a8d0c74.1754290144.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit dc181170491bda9944f95ca39017667fe7fd767d) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 101 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ef1f69ba8899d..5892de29b470b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2922,6 +2922,11 @@ static bool has_hpa_to_spa(struct cxl_root_decoder *cxlrd) return cxlrd->ops && cxlrd->ops->hpa_to_spa; } +static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd) +{ + return cxlrd->ops && cxlrd->ops->spa_to_hpa; +} + u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { @@ -2992,6 +2997,102 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, return hpa; } +struct dpa_result { + struct cxl_memdev *cxlmd; + u64 dpa; +}; + +static int __maybe_unused 
region_offset_to_dpa_result(struct cxl_region *cxlr, + u64 offset, + struct dpa_result *result) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_endpoint_decoder *cxled; + u64 hpa, hpa_offset, dpa_offset; + u64 bits_upper, bits_lower; + u64 shifted, rem, temp; + u16 eig = 0; + u8 eiw = 0; + int pos; + + lockdep_assert_held(&cxl_rwsem.region); + lockdep_assert_held(&cxl_rwsem.dpa); + + /* Input validation ensures valid ways and gran */ + granularity_to_eig(p->interleave_granularity, &eig); + ways_to_eiw(p->interleave_ways, &eiw); + + /* + * If the root decoder has SPA to CXL HPA callback, use it. Otherwise + * CXL HPA is assumed to equal SPA. + */ + if (has_spa_to_hpa(cxlrd)) { + hpa = cxlrd->ops->spa_to_hpa(cxlrd, p->res->start + offset); + hpa_offset = hpa - p->res->start; + } else { + hpa_offset = offset; + } + /* + * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 + * eiw < 8 + * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8]. + * Per spec "remove IW bits starting with bit position IG+8" + * eiw >= 8 + * Position is not explicitly stored in HPA_OFFSET bits. It is + * derived from the modulo operation of the upper bits using + * the total number of interleave ways. 
+ */ + if (eiw < 8) { + pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); + } else { + shifted = hpa_offset >> (eig + 8); + div64_u64_rem(shifted, p->interleave_ways, &rem); + pos = rem; + } + if (pos < 0 || pos >= p->nr_targets) { + dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", + pos, p->nr_targets); + return -ENXIO; + } + + /* + * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 + * Lower bits [IG+7:0] pass through unchanged + * (eiw < 8) + * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) + * Clear the position bits to isolate upper section, then + * reverse the left shift by eiw that occurred during DPA->HPA + * (eiw >= 8) + * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 + * Extract upper bits from the correct bit range and divide by 3 + * to recover the original DPA upper bits + */ + bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); + if (eiw < 8) { + temp = hpa_offset &= ~((u64)GENMASK(eig + eiw + 8 - 1, 0)); + dpa_offset = temp >> eiw; + } else { + bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); + dpa_offset = bits_upper << (eig + 8); + } + dpa_offset |= bits_lower; + + /* Look-up and return the result: a memdev and a DPA */ + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxled->pos != pos) + continue; + result->cxlmd = cxled_to_memdev(cxled); + result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset; + + return 0; + } + dev_err(&cxlr->dev, "No device found for position %d\n", pos); + + return -ENXIO; +} + static struct lock_class_key cxl_pmem_region_key; static int cxl_pmem_region_alloc(struct cxl_region *cxlr) From 39224db3084bf946efba3b9eece03e5253ce329c Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 4 Aug 2025 01:00:12 -0700 Subject: [PATCH 007/143] cxl/core: Add locked variants of the poison inject and clear funcs The core functions that validate and send inject and clear commands to the memdev devices require holding both the dpa_rwsem and the region_rwsem. 
In preparation for another caller of these functions that must hold the locks upon entry, split the work into a locked and unlocked pair. Consideration was given to moving the locking to both callers, however, the existing caller is not in the core (mem.c) and cannot access the locks. Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/1d601f586975195733984ca63d1b5789bbe8690f.1754290144.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 25a0207828bc52f1ebb6588f9417eb43ca4960a3) Signed-off-by: Jiandi An --- drivers/cxl/core/memdev.c | 52 +++++++++++++++++++++++++++------------ drivers/cxl/cxlmem.h | 2 ++ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index c569e00a511f4..90d3390d9c7c6 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -276,7 +276,7 @@ static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa) return 0; } -int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa) +int cxl_inject_poison_locked(struct cxl_memdev *cxlmd, u64 dpa) { struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; struct cxl_mbox_inject_poison inject; @@ -288,13 +288,8 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa) if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) - return rc; - - ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); - if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) - return rc; + lockdep_assert_held(&cxl_rwsem.dpa); + lockdep_assert_held(&cxl_rwsem.region); rc = cxl_validate_poison_dpa(cxlmd, dpa); if (rc) @@ -324,9 +319,24 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa) return 0; } + +int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa) +{ + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc =
ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + return cxl_inject_poison_locked(cxlmd, dpa); +} EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, "CXL"); -int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) +int cxl_clear_poison_locked(struct cxl_memdev *cxlmd, u64 dpa) { struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; struct cxl_mbox_clear_poison clear; @@ -338,13 +348,8 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) - return rc; - - ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); - if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) - return rc; + lockdep_assert_held(&cxl_rwsem.dpa); + lockdep_assert_held(&cxl_rwsem.region); rc = cxl_validate_poison_dpa(cxlmd, dpa); if (rc) @@ -383,6 +388,21 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) return 0; } + +int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) +{ + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + return cxl_clear_poison_locked(cxlmd, dpa); +} EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, "CXL"); static struct attribute *cxl_memdev_attributes[] = { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 751478dfc4106..434031a0c1f74 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -869,6 +869,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, int cxl_trigger_poison_list(struct cxl_memdev *cxlmd); int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa); int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa); +int
cxl_inject_poison_locked(struct cxl_memdev *cxlmd, u64 dpa); +int cxl_clear_poison_locked(struct cxl_memdev *cxlmd, u64 dpa); #ifdef CONFIG_CXL_EDAC_MEM_FEATURES int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd); From 1b68abfb430981aaa26a99b333e7e4894e6bedcc Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 4 Aug 2025 01:00:13 -0700 Subject: [PATCH 008/143] cxl/region: Add inject and clear poison by region offset Add CXL region debugfs attributes to inject and clear poison based on an offset into the region. These new interfaces allow users to operate on poison at the region level without needing to resolve Device Physical Addresses (DPA) or target individual memdevs. The implementation uses a new helper, region_offset_to_dpa_result() that applies decoder interleave logic, including XOR-based address decoding when applicable. Note that XOR decodes rely on driver internal xormaps which are not exposed to userspace. So, this support is not only a simplification of poison operations that could be done using existing per memdev operations, but also it enables this functionality for XOR interleaved regions for the first time. New debugfs attributes are added in /sys/kernel/debug/cxl/regionX/: inject_poison and clear_poison. These are only exposed if all memdevs participating in the region support both inject and clear commands, ensuring consistent and reliable behavior across multi-device regions. If tracing is enabled, these operations are logged as cxl_poison events in /sys/kernel/tracing/trace. The ABI documentation warns users of the significant risks that come with using these capabilities. A CXL Maturity Map update shows this user flow is now supported. 
Signed-off-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/f3fd8628ab57ea79704fb2d645902cd499c066af.1754290144.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit c3dd67681c70cc95cc2c889b1b58a1667bb1c48b) Signed-off-by: Jiandi An --- Documentation/ABI/testing/debugfs-cxl | 87 ++++++++++++ Documentation/driver-api/cxl/maturity-map.rst | 2 +- drivers/cxl/core/core.h | 4 + drivers/cxl/core/memdev.c | 8 ++ drivers/cxl/core/region.c | 131 +++++++++++++++++- 5 files changed, 228 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl index e95e21f131e96..2989d4da96c1b 100644 --- a/Documentation/ABI/testing/debugfs-cxl +++ b/Documentation/ABI/testing/debugfs-cxl @@ -19,6 +19,20 @@ Description: is returned to the user. The inject_poison attribute is only visible for devices supporting the capability. + TEST-ONLY INTERFACE: This interface is intended for testing + and validation purposes only. It is not a data repair mechanism + and should never be used on production systems or live data. + + DATA LOSS RISK: For CXL persistent memory (PMEM) devices, + poison injection can result in permanent data loss. Injected + poison may render data permanently inaccessible even after + clearing, as the clear operation writes zeros and does not + recover original data. + + SYSTEM STABILITY RISK: For volatile memory, poison injection + can cause kernel crashes, system instability, or unpredictable + behavior if the poisoned addresses are accessed by running code + or critical kernel structures. What: /sys/kernel/debug/cxl/memX/clear_poison Date: April, 2023 @@ -35,6 +49,79 @@ Description: The clear_poison attribute is only visible for devices supporting the capability. + TEST-ONLY INTERFACE: This interface is intended for testing + and validation purposes only. It is not a data repair mechanism + and should never be used on production systems or live data. 
+ + CLEAR IS NOT DATA RECOVERY: This operation writes zeros to the + specified address range and removes the address from the poison + list. It does NOT recover or restore original data that may have + been present before poison injection. Any original data at the + cleared address is permanently lost and replaced with zeros. + + CLEAR IS NOT A REPAIR MECHANISM: This interface is for testing + purposes only and should not be used as a data repair tool. + Clearing poison is fundamentally different from data recovery + or error correction. + +What: /sys/kernel/debug/cxl/regionX/inject_poison +Date: August, 2025 +Contact: linux-cxl@vger.kernel.org +Description: + (WO) When a Host Physical Address (HPA) is written to this + attribute, the region driver translates it to a Device + Physical Address (DPA) and identifies the corresponding + memdev. It then sends an inject poison command to that memdev + at the translated DPA. Refer to the memdev ABI entry at: + /sys/kernel/debug/cxl/memX/inject_poison for the detailed + behavior. This attribute is only visible if all memdevs + participating in the region support both inject and clear + poison commands. + + TEST-ONLY INTERFACE: This interface is intended for testing + and validation purposes only. It is not a data repair mechanism + and should never be used on production systems or live data. + + DATA LOSS RISK: For CXL persistent memory (PMEM) devices, + poison injection can result in permanent data loss. Injected + poison may render data permanently inaccessible even after + clearing, as the clear operation writes zeros and does not + recover original data. + + SYSTEM STABILITY RISK: For volatile memory, poison injection + can cause kernel crashes, system instability, or unpredictable + behavior if the poisoned addresses are accessed by running code + or critical kernel structures. 
+ +What: /sys/kernel/debug/cxl/regionX/clear_poison +Date: August, 2025 +Contact: linux-cxl@vger.kernel.org +Description: + (WO) When a Host Physical Address (HPA) is written to this + attribute, the region driver translates it to a Device + Physical Address (DPA) and identifies the corresponding + memdev. It then sends a clear poison command to that memdev + at the translated DPA. Refer to the memdev ABI entry at: + /sys/kernel/debug/cxl/memX/clear_poison for the detailed + behavior. This attribute is only visible if all memdevs + participating in the region support both inject and clear + poison commands. + + TEST-ONLY INTERFACE: This interface is intended for testing + and validation purposes only. It is not a data repair mechanism + and should never be used on production systems or live data. + + CLEAR IS NOT DATA RECOVERY: This operation writes zeros to the + specified address range and removes the address from the poison + list. It does NOT recover or restore original data that may have + been present before poison injection. Any original data at the + cleared address is permanently lost and replaced with zeros. + + CLEAR IS NOT A REPAIR MECHANISM: This interface is for testing + purposes only and should not be used as a data repair tool. + Clearing poison is fundamentally different from data recovery + or error correction. 
+ What: /sys/kernel/debug/cxl/einj_types Date: January, 2024 KernelVersion: v6.9 diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst index 1330f3f52129a..282c1102dd819 100644 --- a/Documentation/driver-api/cxl/maturity-map.rst +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -173,7 +173,7 @@ Accelerator User Flow Support ----------------- -* [0] Inject & clear poison by HPA +* [2] Inject & clear poison by region offset Details ======= diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 2669f251d6775..eac8cc1bdaa07 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -135,6 +135,10 @@ enum cxl_poison_trace_type { CXL_POISON_TRACE_CLEAR, }; +enum poison_cmd_enabled_bits; +bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, + enum poison_cmd_enabled_bits cmd); + long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 90d3390d9c7c6..e370d733e4400 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -200,6 +200,14 @@ static ssize_t security_erase_store(struct device *dev, static struct device_attribute dev_attr_security_erase = __ATTR(erase, 0200, NULL, security_erase_store); +bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, + enum poison_cmd_enabled_bits cmd) +{ + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + + return test_bit(cmd, mds->poison.enabled_cmds); +} + static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 5892de29b470b..04d326c274875 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2,6 +2,7 @@ /* Copyright(c) 2022 Intel Corporation. 
All rights reserved. */ #include #include +#include #include #include #include @@ -3002,9 +3003,8 @@ struct dpa_result { u64 dpa; }; -static int __maybe_unused region_offset_to_dpa_result(struct cxl_region *cxlr, - u64 offset, - struct dpa_result *result) +static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, + struct dpa_result *result) { struct cxl_region_params *p = &cxlr->params; struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); @@ -3652,6 +3652,105 @@ static void shutdown_notifiers(void *_cxlr) unregister_mt_adistance_algorithm(&cxlr->adist_notifier); } +static void remove_debugfs(void *dentry) +{ + debugfs_remove_recursive(dentry); +} + +static int validate_region_offset(struct cxl_region *cxlr, u64 offset) +{ + struct cxl_region_params *p = &cxlr->params; + resource_size_t region_size; + u64 hpa; + + if (offset < p->cache_size) { + dev_err(&cxlr->dev, + "Offset %#llx is within extended linear cache %#llx\n", + offset, p->cache_size); + return -EINVAL; + } + + region_size = resource_size(p->res); + if (offset >= region_size) { + dev_err(&cxlr->dev, "Offset %#llx exceeds region size %#llx\n", + offset, region_size); + return -EINVAL; + } + + hpa = p->res->start + offset; + if (hpa < p->res->start || hpa > p->res->end) { + dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa, + p->res); + return -EINVAL; + } + + return 0; +} + +static int cxl_region_debugfs_poison_inject(void *data, u64 offset) +{ + struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL }; + struct cxl_region *cxlr = data; + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + if (validate_region_offset(cxlr, offset)) + return -EINVAL; + + rc = region_offset_to_dpa_result(cxlr, offset, &result); + if (rc || !result.cxlmd ||
result.dpa == ULLONG_MAX) { + dev_dbg(&cxlr->dev, + "Failed to resolve DPA for region offset %#llx rc %d\n", + offset, rc); + + return rc ? rc : -EINVAL; + } + + return cxl_inject_poison_locked(result.cxlmd, result.dpa); +} + +DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL, + cxl_region_debugfs_poison_inject, "%llx\n"); + +static int cxl_region_debugfs_poison_clear(void *data, u64 offset) +{ + struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL }; + struct cxl_region *cxlr = data; + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + if (validate_region_offset(cxlr, offset)) + return -EINVAL; + + rc = region_offset_to_dpa_result(cxlr, offset, &result); + if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { + dev_dbg(&cxlr->dev, + "Failed to resolve DPA for region offset %#llx rc %d\n", + offset, rc); + + return rc ?
rc : -EINVAL; + } + + return cxl_clear_poison_locked(result.cxlmd, result.dpa); +} + +DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, + cxl_region_debugfs_poison_clear, "%llx\n"); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; @@ -3681,6 +3780,7 @@ static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_region_params *p = &cxlr->params; + bool poison_supported = true; int rc; rc = cxl_region_can_probe(cxlr); @@ -3704,6 +3804,31 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* Create poison attributes if all memdevs support the capabilities */ + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) || + !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) { + poison_supported = false; + break; + } + } + + if (poison_supported) { + struct dentry *dentry; + + dentry = cxl_debugfs_create_dir(dev_name(dev)); + debugfs_create_file("inject_poison", 0200, dentry, cxlr, + &cxl_poison_inject_fops); + debugfs_create_file("clear_poison", 0200, dentry, cxlr, + &cxl_poison_clear_fops); + rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); + if (rc) + return rc; + } + switch (cxlr->mode) { case CXL_PARTMODE_PMEM: rc = devm_cxl_region_edac_register(cxlr); From 803e7861416e492ce663cb32d61b70dfb816e458 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Aug 2025 08:39:53 -0700 Subject: [PATCH 009/143] cxl: Fix emit of type resource_size_t argument for validate_region_offset() 0day reported warnings of: drivers/cxl/core/region.c:3664:25: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 4 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] drivers/cxl/core/region.c:3671:37: warning: format '%llx' expects argument of 
type 'long long unsigned int', but argument 4 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] Replace %#llx with %pr to emit resource_size_t arguments. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508160513.NAZ9i9rQ-lkp@intel.com/ Cc: Alison Schofield Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250818153953.3658952-1-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit e6a9530b3ee7407b70b60e4df70688db0d239e1a) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 04d326c274875..d9d65229eb58a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3665,15 +3665,15 @@ static int validate_region_offset(struct cxl_region *cxlr, u64 offset) if (offset < p->cache_size) { dev_err(&cxlr->dev, - "Offset %#llx is within extended linear cache %#llx\n", - offset, p->cache_size); + "Offset %#llx is within extended linear cache %pr\n", + offset, &p->cache_size); return -EINVAL; } region_size = resource_size(p->res); if (offset >= region_size) { - dev_err(&cxlr->dev, "Offset %#llx exceeds region size %#llx\n", - offset, region_size); + dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pr\n", + offset, &region_size); return -EINVAL; } From a77661f34717e3ad46331341dd5f1c3f61367a7e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:04 -0700 Subject: [PATCH 010/143] mm/memory_hotplug: Update comment for hotplug memory callback priorities Add clarification to comment for memory hotplug callback ordering as the current comment does not provide clear language on which callback happens first.
Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-2-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 65128868bb3b0621d2d8e71f19852675a064b373) Signed-off-by: Jiandi An --- include/linux/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index 2a770e7c6ab1e..d231a2323331a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -115,8 +115,8 @@ struct notifier_block; struct mem_section; /* - * Priorities for the hotplug memory callback routines (stored in decreasing - * order in the callback chain) + * Priorities for the hotplug memory callback routines. Invoked from + * high to low. Higher priorities correspond to higher numbers. */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 From 7615f21661b24857c82bf91baf50789873879790 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:05 -0700 Subject: [PATCH 011/143] drivers/base/node: Add a helper function node_update_perf_attrs() Add helper function node_update_perf_attrs() to allow update of node access coordinates computed by an external agent such as CXL. The helper allows updating of coordinates after the attribute being created by HMAT. 
Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-3-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit b57fc652ca24ada3b0c888327f9944ed21559286) Signed-off-by: Jiandi An --- drivers/base/node.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/node.h | 8 ++++++++ 2 files changed, 46 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 67b01d5797377..3e2329ccb618d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -248,6 +248,44 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, } EXPORT_SYMBOL_GPL(node_set_perf_attrs); +/** + * node_update_perf_attrs - Update the performance values for given access class + * @nid: Node identifier to be updated + * @coord: Heterogeneous memory performance coordinates + * @access: The access class for the given attributes + */ +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access) +{ + struct node_access_nodes *access_node; + struct node *node; + int i; + + if (WARN_ON_ONCE(!node_online(nid))) + return; + + node = node_devices[nid]; + list_for_each_entry(access_node, &node->access_list, list_node) { + if (access_node->access != access) + continue; + + access_node->coord = *coord; + for (i = 0; access_attrs[i]; i++) { + sysfs_notify(&access_node->dev.kobj, + NULL, access_attrs[i]->name); + } + break; + } + + /* When setting CPU access coordinates, update mempolicy */ + if (access != ACCESS_COORDINATE_CPU) + return; + + if (mempolicy_set_node_perf(nid, coord)) + pr_info("failed to set mempolicy attrs for node %d\n", nid); +} +EXPORT_SYMBOL_GPL(node_update_perf_attrs); + /** * struct node_cache_info - Internal tracking for memory node caches * @dev: Device represeting the cache level diff --git a/include/linux/node.h b/include/linux/node.h index 2c7529335b21a..866e3323f1fdc 100644 --- a/include/linux/node.h +++ 
b/include/linux/node.h @@ -85,6 +85,8 @@ struct node_cache_attrs { void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, enum access_coordinate_class access); +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -96,6 +98,12 @@ static inline void node_set_perf_attrs(unsigned int nid, enum access_coordinate_class access) { } + +static inline void node_update_perf_attrs(unsigned int nid, + struct access_coordinate *coord, + enum access_coordinate_class access) +{ +} #endif struct node { From 184617310d5312f33d2e527647482af30018bce7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:06 -0700 Subject: [PATCH 012/143] cxl, acpi/hmat: Update CXL access coordinates directly instead of through HMAT The current implementation of CXL memory hotplug notifier gets called before the HMAT memory hotplug notifier. The CXL driver calculates the access coordinates (bandwidth and latency values) for the CXL end to end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL memory hotplug notifier writes the access coordinates to the HMAT target structs. Then the HMAT memory hotplug notifier is called and it creates the access coordinates for the node sysfs attributes. During testing on an Intel platform, it was found that although the newly calculated coordinates were pushed to sysfs, the sysfs attributes for the access coordinates showed up with the wrong initiator. The system has 4 nodes (0, 1, 2, 3) where node 0 and 1 are CPU nodes and node 2 and 3 are CXL nodes. 
The expectation is that node 2 would show up as a target to node 0: /sys/devices/system/node/node2/access0/initiators/node0 However it was observed that node 2 showed up as a target under node 1: /sys/devices/system/node/node2/access0/initiators/node1 The original intent of the 'ext_updated' flag in HMAT handling code was to stop HMAT memory hotplug callback from clobbering the access coordinates after CXL has injected its calculated coordinates and replaced the generic target access coordinates provided by the HMAT table in the HMAT target structs. However the flag is hacky at best and blocks the updates from other CXL regions that are onlined in the same node later on. Remove the 'ext_updated' flag usage and just update the access coordinates for the nodes directly without touching HMAT target data. The hotplug memory callback ordering is changed. Instead of changing CXL, move HMAT back so there's room for the levels rather than have CXL share the same level as SLAB_CALLBACK_PRI. The change will resulting in the CXL callback to be executed after the HMAT callback. With the change, the CXL hotplug memory notifier runs after the HMAT callback. The HMAT callback will create the node sysfs attributes for access coordinates. The CXL callback will write the access coordinates to the now created node sysfs attributes directly and will not pollute the HMAT target values. A nodemask is introduced to keep track if a node has been updated and prevents further updates. 
Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Cc: stable@vger.kernel.org Tested-by: Marc Herbert Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 2e454fb8056df6da4bba7d89a57bf60e217463c0) Signed-off-by: Jiandi An --- drivers/acpi/numa/hmat.c | 6 ------ drivers/cxl/core/cdat.c | 5 ----- drivers/cxl/core/core.h | 1 - drivers/cxl/core/region.c | 20 ++++++++++++-------- include/linux/memory.h | 2 +- 5 files changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 9085375830605..f153a2c18f1d9 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -74,7 +74,6 @@ struct memory_target { struct node_cache_attrs cache_attrs; u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE]; bool registered; - bool ext_updated; /* externally updated */ }; struct memory_initiator { @@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, coord->read_bandwidth, access); hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, coord->write_bandwidth, access); - target->ext_updated = true; return 0; } @@ -773,10 +771,6 @@ static void hmat_update_target_attrs(struct memory_target *target, u32 best = 0; int i; - /* Don't update if an external agent has changed the data. 
*/ - if (target->ext_updated) - return; - /* Don't update for generic port if there's no device handle */ if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL || access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) && diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c0af645425f4a..c891fd618cfda 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, { return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); } - -bool cxl_need_node_perf_attrs_update(int nid) -{ - return !acpi_node_backed_by_real_pxm(nid); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index eac8cc1bdaa07..e5157a328f30c 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -143,7 +143,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, enum access_coordinate_class access); -bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d9d65229eb58a..238b148768148 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -32,6 +32,12 @@ * 3. Decoder targets */ +/* + * nodemask that sets per node when the access_coordinates for the node has + * been updated by the CXL memory hotplug notifier. 
+ */ +static nodemask_t nodemask_region_seen = NODE_MASK_NONE; + static struct cxl_region *to_cxl_region(struct device *dev); #define __ACCESS_ATTR_RO(_level, _name) { \ @@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { if (cxlr->coord[i].read_bandwidth) { - rc = 0; - if (cxl_need_node_perf_attrs_update(nid)) - node_set_perf_attrs(nid, &cxlr->coord[i], i); - else - rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); - - if (rc == 0) - cset++; + node_update_perf_attrs(nid, &cxlr->coord[i], i); + cset++; } } @@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid != region_nid) return NOTIFY_DONE; + /* No action needed if node bit already set */ + if (node_test_and_set(nid, nodemask_region_seen)) + return NOTIFY_DONE; + if (!cxl_region_update_coordinates(cxlr, nid)) return NOTIFY_DONE; diff --git a/include/linux/memory.h b/include/linux/memory.h index d231a2323331a..55f0a47c85ebf 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -120,8 +120,8 @@ struct mem_section; */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -#define HMAT_CALLBACK_PRI 2 #define CXL_CALLBACK_PRI 5 +#define HMAT_CALLBACK_PRI 6 #define MM_COMPUTE_BATCH_PRI 10 #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 From dccc854c5b09521b1a8a2a3d80f87098c65c2ab3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:07 -0700 Subject: [PATCH 013/143] acpi/hmat: Remove now unused hmat_update_target_coordinates() Remove deadcode since CXL no longer calls hmat_update_target_coordinates(). 
Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-5-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit e99ecbc4c89adf551cccbbc00b5cb08c50969af6) Signed-off-by: Jiandi An --- drivers/acpi/numa/hmat.c | 28 ---------------------------- drivers/cxl/core/cdat.c | 6 ------ drivers/cxl/core/core.h | 2 -- include/linux/acpi.h | 12 ------------ 4 files changed, 48 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index f153a2c18f1d9..11e4483685c9c 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -367,34 +367,6 @@ static void hmat_update_target_access(struct memory_target *target, } } -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access) -{ - struct memory_target *target; - int pxm; - - if (nid == NUMA_NO_NODE) - return -EINVAL; - - pxm = node_to_pxm(nid); - guard(mutex)(&target_lock); - target = find_mem_target(pxm); - if (!target) - return -ENODEV; - - hmat_update_target_access(target, ACPI_HMAT_READ_LATENCY, - coord->read_latency, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_LATENCY, - coord->write_latency, access); - hmat_update_target_access(target, ACPI_HMAT_READ_BANDWIDTH, - coord->read_bandwidth, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, - coord->write_bandwidth, access); - - return 0; -} -EXPORT_SYMBOL_GPL(hmat_update_target_coordinates); - static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) { struct memory_locality *loc; diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c891fd618cfda..bca1ec279651d 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1075,9 +1075,3 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } - -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum 
access_coordinate_class access) -{ - return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index e5157a328f30c..5707cd60a8eb0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -141,8 +141,6 @@ bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum access_coordinate_class access); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 42cbeaba2a510..0c6087ea979b2 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1637,18 +1637,6 @@ static inline void acpi_use_parent_companion(struct device *dev) ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } -#ifdef CONFIG_ACPI_HMAT -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access); -#else -static inline int hmat_update_target_coordinates(int nid, - struct access_coordinate *coord, - enum access_coordinate_class access) -{ - return -EOPNOTSUPP; -} -#endif - #ifdef CONFIG_ACPI_NUMA bool acpi_node_backed_by_real_pxm(int nid); #else From c21080e76843e965ec8736b8e6dd77f0f6678939 Mon Sep 17 00:00:00 2001 From: Rakuram Eswaran Date: Mon, 18 Aug 2025 23:23:34 +0530 Subject: [PATCH 014/143] Documentation/driver-api: Fix typo error in cxl Fixed the following typo errors intersparsed ==> interspersed in Documentation/driver-api/cxl/platform/bios-and-efi.rst Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Gregory Price Link: https://patch.msgid.link/20250818175335.5312-1-rakuram.e96@gmail.com Signed-off-by: Dave Jiang (cherry picked from commit a414408126d13d6d5b2d2c4e537295771cc256cb) Signed-off-by: Jiandi An --- 
Documentation/driver-api/cxl/platform/bios-and-efi.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/cxl/platform/bios-and-efi.rst b/Documentation/driver-api/cxl/platform/bios-and-efi.rst index 645322632cc9b..a9aa0ccd92af7 100644 --- a/Documentation/driver-api/cxl/platform/bios-and-efi.rst +++ b/Documentation/driver-api/cxl/platform/bios-and-efi.rst @@ -202,7 +202,7 @@ future and such a configuration should be avoided. Memory Holes ------------ -If your platform includes memory holes intersparsed between your CXL memory, it +If your platform includes memory holes interspersed between your CXL memory, it is recommended to utilize multiple decoders to cover these regions of memory, rather than try to program the decoders to accept the entire range and expect Linux to manage the overlap. From feab42d8032e913a466f30c41c1a07b8db37aacc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Sep 2025 09:00:34 -0700 Subject: [PATCH 015/143] cxl/acpi: Rename CFMW coherency restrictions ACPICA commit 710745713ad3a2543dbfb70e84764f31f0e46bdc This has been renamed in more recent CXL specs, as type3 (memory expanders) can also use HDM-DB for device coherent memory. Link: https://github.com/acpica/acpica/commit/710745713ad3a2543dbfb70e84764f31f0e46bdc Acked-by: Rafael J. 
Wysocki (Intel) Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20250908160034.86471-1-dave@stgolabs.net Signed-off-by: Dave Jiang (cherry picked from commit c4272905c37930c19b54fa3549b22899122ce69e) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 4 ++-- include/acpi/actbl1.h | 4 ++-- tools/testing/cxl/test/cxl.c | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index b9ba1c33e4d24..b8f124685f1dc 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -116,9 +116,9 @@ static unsigned long cfmws_to_decoder_flags(int restrictions) { unsigned long flags = CXL_DECODER_F_ENABLE; - if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2) + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_DEVMEM) flags |= CXL_DECODER_F_TYPE2; - if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE3) + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM) flags |= CXL_DECODER_F_TYPE3; if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE) flags |= CXL_DECODER_F_RAM; diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 99fd1588ff382..eb787dfbd2fa0 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -560,8 +560,8 @@ struct acpi_cedt_cfmws_target_element { /* Values for Restrictions field above */ -#define ACPI_CEDT_CFMWS_RESTRICT_TYPE2 (1) -#define ACPI_CEDT_CFMWS_RESTRICT_TYPE3 (1<<1) +#define ACPI_CEDT_CFMWS_RESTRICT_DEVMEM (1) +#define ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM (1<<1) #define ACPI_CEDT_CFMWS_RESTRICT_VOLATILE (1<<2) #define ACPI_CEDT_CFMWS_RESTRICT_PMEM (1<<3) #define ACPI_CEDT_CFMWS_RESTRICT_FIXED (1<<4) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index f4dceecf7e335..8b5b8d17b8b84 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -210,7 +210,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = 
ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -225,7 +225,7 @@ static struct { }, .interleave_ways = 1, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -240,7 +240,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -255,7 +255,7 @@ static struct { }, .interleave_ways = 1, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -270,7 +270,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -285,7 +285,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M, @@ -302,7 +302,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -318,7 +318,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 1, .granularity = 0, - 
.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -334,7 +334,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 8, .granularity = 1, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_512M * 6UL, From 7c6f7d990111b793bacc0665a486305417fa69c2 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 15 Sep 2025 16:57:20 +0200 Subject: [PATCH 016/143] cxl: Documentation/driver-api/cxl: Describe the x86 Low Memory Hole solution Add documentation on how to resolve conflicts between CXL Fixed Memory Windows, Platform Low Memory Holes, intermediate Switch and Endpoint Decoders. [dj]: Fixed inconsistent spacing after '.' [dj]: Fixed subject line from Alison. [dj]: Removed '::' before table from Bagas. Reviewed-by: Gregory Price Signed-off-by: Fabio M. 
De Francesco Reviewed-by: Bagas Sanjaya Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Signed-off-by: Dave Jiang (cherry picked from commit c5dca38633daa1e240144bac453cf9065604a413) Signed-off-by: Jiandi An --- Documentation/driver-api/cxl/conventions.rst | 135 +++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/Documentation/driver-api/cxl/conventions.rst b/Documentation/driver-api/cxl/conventions.rst index da347a81a237a..e37336d7b116e 100644 --- a/Documentation/driver-api/cxl/conventions.rst +++ b/Documentation/driver-api/cxl/conventions.rst @@ -45,3 +45,138 @@ Detailed Description of the Change ---------------------------------- + + +Resolve conflict between CFMWS, Platform Memory Holes, and Endpoint Decoders +============================================================================ + +Document +-------- + +CXL Revision 3.2, Version 1.0 + +License +------- + +SPDX-License Identifier: CC-BY-4.0 + +Creator/Contributors +-------------------- + +- Fabio M. De Francesco, Intel +- Dan J. Williams, Intel +- Mahesh Natu, Intel + +Summary of the Change +--------------------- + +According to the current Compute Express Link (CXL) Specifications (Revision +3.2, Version 1.0), the CXL Fixed Memory Window Structure (CFMWS) describes zero +or more Host Physical Address (HPA) windows associated with each CXL Host +Bridge. Each window represents a contiguous HPA range that may be interleaved +across one or more targets, including CXL Host Bridges. Each window has a set +of restrictions that govern its usage. It is the Operating System-directed +configuration and Power Management (OSPM) responsibility to utilize each window +for the specified use. + +Table 9-22 of the current CXL Specifications states that the Window Size field +contains the total number of consecutive bytes of HPA this window describes. +This value must be a multiple of the Number of Interleave Ways (NIW) * 256 MB. 
+ +Platform Firmware (BIOS) might reserve physical addresses below 4 GB where a +memory gap such as the Low Memory Hole for PCIe MMIO may exist. In such cases, +the CFMWS Range Size may not adhere to the NIW * 256 MB rule. + +The HPA represents the actual physical memory address space that the CXL devices +can decode and respond to, while the System Physical Address (SPA), a related +but distinct concept, represents the system-visible address space that users can +direct transactions to and so it excludes reserved regions. + +BIOS publishes CFMWS to communicate the active SPA ranges that, on platforms +with LMHs, map to a strict subset of the HPA. The SPA range trims out the hole, +resulting in lost capacity in the Endpoints with no SPA to map to that part of +the HPA range that intersects the hole. + +E.g., an x86 platform with two CFMWS and an LMH starting at 2 GB: + + +--------+------------+-------------------+------------------+-------------------+------+ + | Window | CFMWS Base | CFMWS Size | HDM Decoder Base | HDM Decoder Size | Ways | + +========+============+===================+==================+===================+======+ + | 0 | 0 GB | 2 GB | 0 GB | 3 GB | 12 | + +--------+------------+-------------------+------------------+-------------------+------+ + | 1 | 4 GB | NIW*256MB Aligned | 4 GB | NIW*256MB Aligned | 12 | + +--------+------------+-------------------+------------------+-------------------+------+ + +HDM decoder base and HDM decoder size represent all the 12 Endpoint Decoders of +a 12-way region and all the intermediate Switch Decoders. They are configured +by the BIOS according to the NIW * 256MB rule, resulting in an HPA range size of +3GB. Instead, the CFMWS Base and CFMWS Size are used to configure the Root +Decoder HPA range that ends up smaller (2GB) than that of the Switch and +Endpoint Decoders in the hierarchy (3GB). 
+ +This creates 2 issues which lead to a failure to construct a region: + +1) A mismatch in region size between root and any HDM decoder. The root decoders + will always be smaller due to the trim. + +2) The trim causes the root decoder to violate the (NIW * 256MB) rule. + +This change allows a region with a base address of 0GB to bypass these checks to +allow for region creation with the trimmed root decoder address range. + +This change does not allow for any other arbitrary region to violate these +checks - it is intended exclusively to enable x86 platforms which map CXL memory +under 4GB. + +Despite the HDM decoders covering the PCIe hole HPA region, it is expected that +the platform will never route address accesses to the CXL complex because the +root decoder only covers the trimmed region (which excludes this). This is +outside the ability of Linux to enforce. + +On the example platform, only the first 2GB will be potentially usable, but +Linux, aiming to adhere to the current specifications, fails to construct +Regions and attach Endpoint and intermediate Switch Decoders to them. + +There are several points of failure due to the expectation that the Root +Decoder HPA size, which is equal to the CFMWS from which it is configured, has +to be greater than or equal to the matching Switch and Endpoint HDM Decoders. + +In order to succeed with construction and attachment, Linux must construct a +Region with Root Decoder HPA range size, and then attach to that all the +intermediate Switch Decoders and Endpoint Decoders that belong to the hierarchy +regardless of their range sizes. + +Benefits of the Change +---------------------- + +Without the change, the OSPM wouldn't match intermediate Switch and Endpoint +Decoders with Root Decoders configured with CFMWS HPA sizes that don't align +with the NIW * 256MB constraint, and so it leads to lost memdev capacity. 
+ +This change allows the OSPM to construct Regions and attach intermediate Switch +and Endpoint Decoders to them, so that the addressable part of the memory +devices' total capacity is made available to the users. + +References +---------- + +Compute Express Link Specification Revision 3.2, Version 1.0 + + +Detailed Description of the Change +---------------------------------- + +The description of the Window Size field in table 9-22 needs to account for +platforms with Low Memory Holes, where SPA ranges might be subsets of the +endpoints' HPA. Therefore, it has to be changed to the following: + +"The total number of consecutive bytes of HPA this window represents. This value +shall be a multiple of NIW * 256 MB. + +On platforms that reserve physical addresses below 4 GB, such as the Low Memory +Hole for PCIe MMIO on x86, an instance of CFMWS whose Base HPA range is 0 might +have a size that doesn't align with the NIW * 256 MB constraint. + +Note that the matching intermediate Switch Decoders and the Endpoint Decoders' +HPA range sizes must still align to the above-mentioned rule, but the memory +capacity that exceeds the CFMWS window size won't be accessible.". From cec28eff80d39cb43b14c94a176107cb13aea1b7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:19 -0700 Subject: [PATCH 017/143] cxl: Add helper to detect top of CXL device topology Add a helper to replace the open-coded detection of CXL device hierarchy root, or the host bridge. The helper will be used for delayed downstream port (dport) creation. 
Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Robert Richter Tested-by: Robert Richter Signed-off-by: Dave Jiang (cherry picked from commit 4fde89539a18d39169a511fda00db65eeba1a8e0) Signed-off-by: Jiandi An --- drivers/cxl/core/port.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 8f36ff413f5d5..66c0c849c4a0d 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -33,6 +33,15 @@ static DEFINE_IDA(cxl_port_ida); static DEFINE_XARRAY(cxl_root_buses); +/* + * The terminal device in PCI is NULL and @platform_bus + * for platform devices (for cxl_test) + */ +static bool is_cxl_host_bridge(struct device *dev) +{ + return (!dev || dev == &platform_bus); +} + int cxl_num_decoders_committed(struct cxl_port *port) { lockdep_assert_held(&cxl_rwsem.region); @@ -1542,7 +1551,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, resource_size_t component_reg_phys; int rc; - if (!dparent) { + if (is_cxl_host_bridge(dparent)) { /* * The iteration reached the topology root without finding the * CXL-root 'cxl_port' on a previous iteration, fail for now to @@ -1630,11 +1639,7 @@ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) struct device *uport_dev; struct cxl_dport *dport; - /* - * The terminal "grandparent" in PCI is NULL and @platform_bus - * for platform devices - */ - if (!dport_dev || dport_dev == &platform_bus) + if (is_cxl_host_bridge(dport_dev)) return 0; uport_dev = dport_dev->parent; From 05e634c07a82cc42ce2e96f4247a7225516e578c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:20 -0700 Subject: [PATCH 018/143] cxl: Add helper to delete dport Refactor the code in reap_dports() out to provide a helper function that reaps a single dport. This will be used later in the cleanup path for allocating a dport. 
Renaming to del_dport() and del_dports() to mirror devm_cxl_add_dport(). [dj] Fixed up subject per Robert Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Tested-by: Robert Richter Signed-off-by: Dave Jiang (cherry picked from commit 8330671c57c7056ef5e1e8dccfcdda7d5fe6d0b0) Signed-off-by: Jiandi An --- drivers/cxl/core/port.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 66c0c849c4a0d..dbea9feacdddf 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1433,7 +1433,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, "CXL"); * through ->remove(). This "bottom-up" removal selectively removes individual * child ports manually. This depends on devm_cxl_add_port() to not change is * devm action registration order, and for dports to have already been - * destroyed by reap_dports(). + * destroyed by del_dports(). */ static void delete_switch_port(struct cxl_port *port) { @@ -1442,18 +1442,24 @@ static void delete_switch_port(struct cxl_port *port) devm_release_action(port->dev.parent, unregister_port, port); } -static void reap_dports(struct cxl_port *port) +static void del_dport(struct cxl_dport *dport) +{ + struct cxl_port *port = dport->port; + + devm_release_action(&port->dev, cxl_dport_unlink, dport); + devm_release_action(&port->dev, cxl_dport_remove, dport); + devm_kfree(&port->dev, dport); +} + +static void del_dports(struct cxl_port *port) { struct cxl_dport *dport; unsigned long index; device_lock_assert(&port->dev); - xa_for_each(&port->dports, index, dport) { - devm_release_action(&port->dev, cxl_dport_unlink, dport); - devm_release_action(&port->dev, cxl_dport_remove, dport); - devm_kfree(&port->dev, dport); - } + xa_for_each(&port->dports, index, dport) + del_dport(dport); } struct detach_ctx { @@ -1511,7 +1517,7 @@ static void cxl_detach_ep(void *data) */ died = true; port->dead = true; 
- reap_dports(port); + del_dports(port); } device_unlock(&port->dev); From d56872375583099739bfc4d4ba8f59bcd1a1a76f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:21 -0700 Subject: [PATCH 019/143] cxl: Add a cached copy of target_map to cxl_decoder Add a cached copy of the hardware port-id list that is available at init before all @dport objects have been instantiated. Change is in preparation of delayed dport instantiation. Reviewed-by: Robert Richter Reviewed-by: Jonathan Cameron Tested-by: Robert Richter Reviewed-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit 02edab6ceefaaf8cb917e864d8c26dbac0ea9686) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 7 +++---- drivers/cxl/core/hdm.c | 20 ++++++++------------ drivers/cxl/core/port.c | 22 +++++++--------------- drivers/cxl/core/region.c | 4 +++- drivers/cxl/cxl.h | 8 ++++++-- tools/testing/cxl/test/cxl.c | 8 ++++---- 6 files changed, 31 insertions(+), 38 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index b8f124685f1dc..bd2e282ca93a0 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -401,7 +401,6 @@ DEFINE_FREE(del_cxl_resource, struct resource *, if (_T) del_cxl_resource(_T)) static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, struct cxl_cfmws_context *ctx) { - int target_map[CXL_DECODER_MAX_INTERLEAVE]; struct cxl_port *root_port = ctx->root_port; struct cxl_cxims_context cxims_ctx; struct device *dev = ctx->dev; @@ -419,8 +418,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, rc = eig_to_granularity(cfmws->granularity, &ig); if (rc) return rc; - for (i = 0; i < ways; i++) - target_map[i] = cfmws->interleave_targets[i]; struct resource *res __free(del_cxl_resource) = alloc_cxl_resource( cfmws->base_hpa, cfmws->window_size, ctx->id++); @@ -446,6 +443,8 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, .end = cfmws->base_hpa + cfmws->window_size - 1, }; cxld->interleave_ways = ways; + for (i = 0; i < 
ways; i++) + cxld->target_map[i] = cfmws->interleave_targets[i]; /* * Minimize the x1 granularity to advertise support for any * valid region granularity @@ -484,7 +483,7 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps; } - rc = cxl_decoder_add(cxld, target_map); + rc = cxl_decoder_add(cxld); if (rc) return rc; diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 777b8ac0c49c1..13c53b9c17d13 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -21,12 +21,11 @@ struct cxl_rwsem cxl_rwsem = { .dpa = __RWSEM_INITIALIZER(cxl_rwsem.dpa), }; -static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, - int *target_map) +static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld) { int rc; - rc = cxl_decoder_add_locked(cxld, target_map); + rc = cxl_decoder_add_locked(cxld); if (rc) { put_device(&cxld->dev); dev_err(&port->dev, "Failed to add decoder\n"); @@ -54,7 +53,6 @@ int devm_cxl_add_passthrough_decoder(struct cxl_port *port) { struct cxl_switch_decoder *cxlsd; struct cxl_dport *dport = NULL; - int single_port_map[1]; unsigned long index; struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); @@ -73,9 +71,9 @@ int devm_cxl_add_passthrough_decoder(struct cxl_port *port) xa_for_each(&port->dports, index, dport) break; - single_port_map[0] = dport->port_id; + cxlsd->cxld.target_map[0] = dport->port_id; - return add_hdm_decoder(port, &cxlsd->cxld, single_port_map); + return add_hdm_decoder(port, &cxlsd->cxld); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, "CXL"); @@ -984,7 +982,7 @@ static int cxl_setup_hdm_decoder_from_dvsec( } static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, - int *target_map, void __iomem *hdm, int which, + void __iomem *hdm, int which, u64 *dpa_base, struct cxl_endpoint_dvsec_info *info) { struct cxl_endpoint_decoder *cxled = NULL; @@ -1104,7 +1102,7 @@ static int init_hdm_decoder(struct 
cxl_port *port, struct cxl_decoder *cxld, hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(which)); target_list.value = (hi << 32) + lo; for (i = 0; i < cxld->interleave_ways; i++) - target_map[i] = target_list.target_id[i]; + cxld->target_map[i] = target_list.target_id[i]; return 0; } @@ -1180,7 +1178,6 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, cxl_settle_decoders(cxlhdm); for (i = 0; i < cxlhdm->decoder_count; i++) { - int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; int rc, target_count = cxlhdm->target_count; struct cxl_decoder *cxld; @@ -1208,8 +1205,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, cxld = &cxlsd->cxld; } - rc = init_hdm_decoder(port, cxld, target_map, hdm, i, - &dpa_base, info); + rc = init_hdm_decoder(port, cxld, hdm, i, &dpa_base, info); if (rc) { if (rc == -ENOSPC) continue; @@ -1219,7 +1215,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, put_device(&cxld->dev); return rc; } - rc = add_hdm_decoder(port, cxld, target_map); + rc = add_hdm_decoder(port, cxld); if (rc) { dev_warn(&port->dev, "Failed to add decoder%d.%d\n", port->id, i); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index dbea9feacdddf..c36e089e53990 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1716,13 +1716,11 @@ struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, "CXL"); static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, - struct cxl_port *port, int *target_map) + struct cxl_port *port) { + struct cxl_decoder *cxld = &cxlsd->cxld; int i; - if (!target_map) - return 0; - device_lock_assert(&port->dev); if (xa_empty(&port->dports)) @@ -1730,7 +1728,7 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, guard(rwsem_write)(&cxl_rwsem.region); for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { - struct cxl_dport *dport = find_dport(port, target_map[i]); + struct cxl_dport *dport = find_dport(port, 
cxld->target_map[i]); if (!dport) return -ENXIO; @@ -1922,9 +1920,6 @@ EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, "CXL"); /** * cxl_decoder_add_locked - Add a decoder with targets * @cxld: The cxl decoder allocated by cxl__decoder_alloc() - * @target_map: A list of downstream ports that this decoder can direct memory - * traffic to. These numbers should correspond with the port number - * in the PCIe Link Capabilities structure. * * Certain types of decoders may not have any targets. The main example of this * is an endpoint device. A more awkward example is a hostbridge whose root @@ -1938,7 +1933,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, "CXL"); * Return: Negative error code if the decoder wasn't properly configured; else * returns 0. */ -int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) +int cxl_decoder_add_locked(struct cxl_decoder *cxld) { struct cxl_port *port; struct device *dev; @@ -1959,7 +1954,7 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) if (!is_endpoint_decoder(dev)) { struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); - rc = decoder_populate_targets(cxlsd, port, target_map); + rc = decoder_populate_targets(cxlsd, port); if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) { dev_err(&port->dev, "Failed to populate active decoder targets\n"); @@ -1978,9 +1973,6 @@ EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, "CXL"); /** * cxl_decoder_add - Add a decoder with targets * @cxld: The cxl decoder allocated by cxl__decoder_alloc() - * @target_map: A list of downstream ports that this decoder can direct memory - * traffic to. These numbers should correspond with the port number - * in the PCIe Link Capabilities structure. * * This is the unlocked variant of cxl_decoder_add_locked(). * See cxl_decoder_add_locked(). @@ -1988,7 +1980,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, "CXL"); * Context: Process context. Takes and releases the device lock of the port that * owns the @cxld. 
*/ -int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) +int cxl_decoder_add(struct cxl_decoder *cxld) { struct cxl_port *port; @@ -2001,7 +1993,7 @@ int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) port = to_cxl_port(cxld->dev.parent); guard(device)(&port->dev); - return cxl_decoder_add_locked(cxld, target_map); + return cxl_decoder_add_locked(cxld); } EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, "CXL"); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 238b148768148..32675a70cadf9 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1516,8 +1516,10 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxl_rr->nr_targets_set); return -ENXIO; } - } else + } else { cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; + cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id; + } inc = 1; out_target_set: cxl_rr->nr_targets_set += inc; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 4fe3df06f57a3..5be51b6abecd7 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -357,6 +357,9 @@ enum cxl_decoder_type { * @target_type: accelerator vs expander (type2 vs type3) selector * @region: currently assigned region for this decoder * @flags: memory type capabilities and locking + * @target_map: cached copy of hardware port-id list, available at init + * before all @dport objects have been instantiated. While + * dport id is 8bit, CFMWS interleave targets are 32bits. 
* @commit: device/decoder-type specific callback to commit settings to hw * @reset: device/decoder-type specific callback to reset hw settings */ @@ -369,6 +372,7 @@ struct cxl_decoder { enum cxl_decoder_type target_type; struct cxl_region *region; unsigned long flags; + u32 target_map[CXL_DECODER_MAX_INTERLEAVE]; int (*commit)(struct cxl_decoder *cxld); void (*reset)(struct cxl_decoder *cxld); }; @@ -789,9 +793,9 @@ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); -int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); +int cxl_decoder_add(struct cxl_decoder *cxld); struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port); -int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); +int cxl_decoder_add_locked(struct cxl_decoder *cxld); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); static inline int cxl_root_decoder_autoremove(struct device *host, struct cxl_root_decoder *cxlrd) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 8b5b8d17b8b84..306c5cbc24187 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -651,7 +651,7 @@ static int mock_cxl_add_passthrough_decoder(struct cxl_port *port) struct target_map_ctx { - int *target_map; + u32 *target_map; int index; int target_count; }; @@ -955,9 +955,7 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, target_count = NR_CXL_SWITCH_PORTS; for (i = 0; i < NR_CXL_PORT_DECODERS; i++) { - int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; struct target_map_ctx ctx = { - .target_map = target_map, .target_count = target_count, }; struct cxl_decoder *cxld; @@ -986,6 +984,8 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, cxld = &cxled->cxld; } + ctx.target_map = cxld->target_map; + mock_init_hdm_decoder(cxld); if 
(target_count) { @@ -997,7 +997,7 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, } } - rc = cxl_decoder_add_locked(cxld, target_map); + rc = cxl_decoder_add_locked(cxld); if (rc) { put_device(&cxld->dev); dev_err(&port->dev, "Failed to add decoder\n"); From f0daa5e8205d7dc7d400be0f5ebfef8049fa911e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:23 -0700 Subject: [PATCH 020/143] cxl/test: Refactor decoder setup to reduce cxl_test burden Group the decoder setup code in switch and endpoint port probe into a single function for each to reduce the number of functions to be mocked in cxl_test. Introduce devm_cxl_switch_port_decoders_setup() and devm_cxl_endpoint_decoders_setup(). These two functions will be mocked instead with some functions optimized out since the mock version does not do anything. Remove devm_cxl_setup_hdm(), devm_cxl_add_passthrough_decoder(), and devm_cxl_enumerate_decoders() in cxl_test mock code. In turn, mock_cxl_add_passthrough_decoder() can be removed since cxl_test does not setup passthrough decoders. __wrap_cxl_hdm_decode_init() and __wrap_cxl_dvsec_rr_decode() can be removed as well since they only return 0 when called. 
[dj: drop 'struct cxl_port' forward declaration (Robert)] Suggested-by: Robert Richter Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Reviewed-by: Robert Richter Signed-off-by: Dave Jiang (cherry picked from commit 68d5d9734c12fce20ad493fe24738ab2019108c0) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 5 +++ drivers/cxl/core/hdm.c | 81 +++++++++++++++++++++++++++++++---- drivers/cxl/core/pci.c | 42 ++++++++++++++++++ drivers/cxl/cxl.h | 9 ++-- drivers/cxl/cxlpci.h | 2 - drivers/cxl/port.c | 38 +--------------- tools/testing/cxl/Kbuild | 7 +-- tools/testing/cxl/test/cxl.c | 42 +++++++++++++----- tools/testing/cxl/test/mock.c | 69 ++++------------------------- tools/testing/cxl/test/mock.h | 7 +-- 10 files changed, 169 insertions(+), 133 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5707cd60a8eb0..1fb66132b7777 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -148,6 +148,11 @@ int cxl_ras_init(void); void cxl_ras_exit(void); int cxl_gpf_port_setup(struct cxl_dport *dport); +struct cxl_hdm; +int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info); +int cxl_port_get_possible_dports(struct cxl_port *port); + #ifdef CONFIG_CXL_FEATURES struct cxl_feat_entry * cxl_feature_info(struct cxl_features_state *cxlfs, const uuid_t *uuid); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 13c53b9c17d13..d435178f63b82 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -49,7 +49,7 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld) * are claimed and passed to the single dport. Disable the range until the first * CXL region is enumerated / activated. 
*/ -int devm_cxl_add_passthrough_decoder(struct cxl_port *port) +static int devm_cxl_add_passthrough_decoder(struct cxl_port *port) { struct cxl_switch_decoder *cxlsd; struct cxl_dport *dport = NULL; @@ -75,7 +75,6 @@ int devm_cxl_add_passthrough_decoder(struct cxl_port *port) return add_hdm_decoder(port, &cxlsd->cxld); } -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, "CXL"); static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm) { @@ -145,8 +144,8 @@ static bool should_emulate_decoders(struct cxl_endpoint_dvsec_info *info) * @port: cxl_port to map * @info: cached DVSEC range register info */ -struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, - struct cxl_endpoint_dvsec_info *info) +static struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, + struct cxl_endpoint_dvsec_info *info) { struct cxl_register_map *reg_map = &port->reg_map; struct device *dev = &port->dev; @@ -201,7 +200,6 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, return cxlhdm; } -EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, "CXL"); static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth) { @@ -1167,8 +1165,8 @@ static void cxl_settle_decoders(struct cxl_hdm *cxlhdm) * @cxlhdm: Structure to populate with HDM capabilities * @info: cached DVSEC range register info */ -int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, - struct cxl_endpoint_dvsec_info *info) +static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info) { void __iomem *hdm = cxlhdm->regs.hdm_decoder; struct cxl_port *port = cxlhdm->port; @@ -1225,4 +1223,71 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, return 0; } -EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, "CXL"); + +/** + * devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders + * @port: CXL port context + * + * Return 0 or -errno on error + */ +int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +{ + struct cxl_hdm *cxlhdm; + + 
if (is_cxl_root(port) || is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + cxlhdm = devm_cxl_setup_hdm(port, NULL); + if (!IS_ERR(cxlhdm)) + return devm_cxl_enumerate_decoders(cxlhdm, NULL); + + if (PTR_ERR(cxlhdm) != -ENODEV) { + dev_err(&port->dev, "Failed to map HDM decoder capability\n"); + return PTR_ERR(cxlhdm); + } + + if (cxl_port_get_possible_dports(port) == 1) { + dev_dbg(&port->dev, "Fallback to passthrough decoder\n"); + return devm_cxl_add_passthrough_decoder(port); + } + + dev_err(&port->dev, "HDM decoder capability not found\n"); + return -ENXIO; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); + +/** + * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders + * @port: CXL port context + * + * Return 0 or -errno on error + */ +int devm_cxl_endpoint_decoders_setup(struct cxl_port *port) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_endpoint_dvsec_info info = { .port = port }; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_hdm *cxlhdm; + int rc; + + if (!is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + rc = cxl_dvsec_rr_decode(cxlds, &info); + if (rc < 0) + return rc; + + cxlhdm = devm_cxl_setup_hdm(port, &info); + if (IS_ERR(cxlhdm)) { + if (PTR_ERR(cxlhdm) == -ENODEV) + dev_err(&port->dev, "HDM decoder registers not found\n"); + return PTR_ERR(cxlhdm); + } + + rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info); + if (rc) + return rc; + + return devm_cxl_enumerate_decoders(cxlhdm, &info); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_endpoint_decoders_setup, "CXL"); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index b50551601c2e4..fa02366d35f2d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1169,3 +1169,45 @@ int cxl_gpf_port_setup(struct cxl_dport *dport) return 0; } + +static int count_dports(struct pci_dev *pdev, void *data) +{ + struct cxl_walk_context *ctx = data; + int type = pci_pcie_type(pdev); + + if (pdev->bus != ctx->bus) + return 0; + if 
(!pci_is_pcie(pdev)) + return 0; + if (type != ctx->type) + return 0; + + ctx->count++; + return 0; +} + +int cxl_port_get_possible_dports(struct cxl_port *port) +{ + struct pci_bus *bus = cxl_port_to_pci_bus(port); + struct cxl_walk_context ctx; + int type; + + if (!bus) { + dev_err(&port->dev, "No PCI bus found for port %s\n", + dev_name(&port->dev)); + return -ENXIO; + } + + if (pci_is_root_bus(bus)) + type = PCI_EXP_TYPE_ROOT_PORT; + else + type = PCI_EXP_TYPE_DOWNSTREAM; + + ctx = (struct cxl_walk_context) { + .bus = bus, + .type = type, + }; + pci_walk_bus(bus, count_dports, &ctx); + + return ctx.count; +} diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 5be51b6abecd7..e4f37c143c1ef 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -818,12 +818,9 @@ struct cxl_endpoint_dvsec_info { struct range dvsec_range[2]; }; -struct cxl_hdm; -struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, - struct cxl_endpoint_dvsec_info *info); -int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, - struct cxl_endpoint_dvsec_info *info); -int devm_cxl_add_passthrough_decoder(struct cxl_port *port); +int devm_cxl_switch_port_decoders_setup(struct cxl_port *port); +int devm_cxl_endpoint_decoders_setup(struct cxl_port *port); + struct cxl_dev_state; int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, struct cxl_endpoint_dvsec_info *info); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 54e219b0049ea..7ae621e618e79 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -129,8 +129,6 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; -int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, - struct cxl_endpoint_dvsec_info *info); void read_cdat_data(struct cxl_port *port); void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, diff --git a/drivers/cxl/port.c 
b/drivers/cxl/port.c index cf32dc50b7a61..d8cae2b5bac6c 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -59,7 +59,6 @@ static int discover_region(struct device *dev, void *unused) static int cxl_switch_port_probe(struct cxl_port *port) { - struct cxl_hdm *cxlhdm; int rc; /* Cache the data early to ensure is_visible() works */ @@ -71,43 +70,14 @@ static int cxl_switch_port_probe(struct cxl_port *port) cxl_switch_parse_cdat(port); - cxlhdm = devm_cxl_setup_hdm(port, NULL); - if (!IS_ERR(cxlhdm)) - return devm_cxl_enumerate_decoders(cxlhdm, NULL); - - if (PTR_ERR(cxlhdm) != -ENODEV) { - dev_err(&port->dev, "Failed to map HDM decoder capability\n"); - return PTR_ERR(cxlhdm); - } - - if (rc == 1) { - dev_dbg(&port->dev, "Fallback to passthrough decoder\n"); - return devm_cxl_add_passthrough_decoder(port); - } - - dev_err(&port->dev, "HDM decoder capability not found\n"); - return -ENXIO; + return devm_cxl_switch_port_decoders_setup(port); } static int cxl_endpoint_port_probe(struct cxl_port *port) { - struct cxl_endpoint_dvsec_info info = { .port = port }; struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); - struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_hdm *cxlhdm; int rc; - rc = cxl_dvsec_rr_decode(cxlds, &info); - if (rc < 0) - return rc; - - cxlhdm = devm_cxl_setup_hdm(port, &info); - if (IS_ERR(cxlhdm)) { - if (PTR_ERR(cxlhdm) == -ENODEV) - dev_err(&port->dev, "HDM decoder registers not found\n"); - return PTR_ERR(cxlhdm); - } - /* Cache the data early to ensure is_visible() works */ read_cdat_data(port); cxl_endpoint_parse_cdat(port); @@ -117,11 +87,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) if (rc) return rc; - rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info); - if (rc) - return rc; - - rc = devm_cxl_enumerate_decoders(cxlhdm, &info); + rc = devm_cxl_endpoint_decoders_setup(port); if (rc) return rc; diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index d07f14cb7aa45..51b8ab289eae9 100644 --- 
a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -5,16 +5,13 @@ ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=devm_cxl_port_enumerate_dports -ldflags-y += --wrap=devm_cxl_setup_hdm -ldflags-y += --wrap=devm_cxl_add_passthrough_decoder -ldflags-y += --wrap=devm_cxl_enumerate_decoders ldflags-y += --wrap=cxl_await_media_ready -ldflags-y += --wrap=cxl_hdm_decode_init -ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting +ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup +ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 306c5cbc24187..36dff58275a25 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -643,13 +643,6 @@ static struct cxl_hdm *mock_cxl_setup_hdm(struct cxl_port *port, return cxlhdm; } -static int mock_cxl_add_passthrough_decoder(struct cxl_port *port) -{ - dev_err(&port->dev, "unexpected passthrough decoder for cxl_test\n"); - return -EOPNOTSUPP; -} - - struct target_map_ctx { u32 *target_map; int index; @@ -1013,6 +1006,36 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, return 0; } +static int __mock_cxl_decoders_setup(struct cxl_port *port) +{ + struct cxl_hdm *cxlhdm; + + cxlhdm = mock_cxl_setup_hdm(port, NULL); + if (IS_ERR(cxlhdm)) { + if (PTR_ERR(cxlhdm) != -ENODEV) + dev_err(&port->dev, "Failed to map HDM decoder capability\n"); + return PTR_ERR(cxlhdm); + } + + return mock_cxl_enumerate_decoders(cxlhdm, NULL); +} + +static int mock_cxl_switch_port_decoders_setup(struct cxl_port *port) +{ + if (is_cxl_root(port) || is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + return 
__mock_cxl_decoders_setup(port); +} + +static int mock_cxl_endpoint_decoders_setup(struct cxl_port *port) +{ + if (!is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + return __mock_cxl_decoders_setup(port); +} + static int mock_cxl_port_enumerate_dports(struct cxl_port *port) { struct platform_device **array; @@ -1127,10 +1150,9 @@ static struct cxl_mock_ops cxl_mock_ops = { .acpi_table_parse_cedt = mock_acpi_table_parse_cedt, .acpi_evaluate_integer = mock_acpi_evaluate_integer, .acpi_pci_find_root = mock_acpi_pci_find_root, + .devm_cxl_switch_port_decoders_setup = mock_cxl_switch_port_decoders_setup, + .devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup, .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, - .devm_cxl_setup_hdm = mock_cxl_setup_hdm, - .devm_cxl_add_passthrough_decoder = mock_cxl_add_passthrough_decoder, - .devm_cxl_enumerate_decoders = mock_cxl_enumerate_decoders, .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 1989ae020df3d..f335889b7756a 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -131,55 +131,35 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); -struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port, - struct cxl_endpoint_dvsec_info *info) - -{ - int index; - struct cxl_hdm *cxlhdm; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(port->uport_dev)) - cxlhdm = ops->devm_cxl_setup_hdm(port, info); - else - cxlhdm = devm_cxl_setup_hdm(port, info); - put_cxl_mock_ops(index); - - return cxlhdm; -} -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_setup_hdm, "CXL"); - -int __wrap_devm_cxl_add_passthrough_decoder(struct cxl_port *port) +int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = 
get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport_dev)) - rc = ops->devm_cxl_add_passthrough_decoder(port); + rc = ops->devm_cxl_switch_port_decoders_setup(port); else - rc = devm_cxl_add_passthrough_decoder(port); + rc = devm_cxl_switch_port_decoders_setup(port); put_cxl_mock_ops(index); return rc; } -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_passthrough_decoder, "CXL"); +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_switch_port_decoders_setup, "CXL"); -int __wrap_devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, - struct cxl_endpoint_dvsec_info *info) +int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) { int rc, index; - struct cxl_port *port = cxlhdm->port; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport_dev)) - rc = ops->devm_cxl_enumerate_decoders(cxlhdm, info); + rc = ops->devm_cxl_endpoint_decoders_setup(port); else - rc = devm_cxl_enumerate_decoders(cxlhdm, info); + rc = devm_cxl_endpoint_decoders_setup(port); put_cxl_mock_ops(index); return rc; } -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_enumerate_decoders, "CXL"); +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_endpoint_decoders_setup, "CXL"); int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port) { @@ -211,39 +191,6 @@ int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_await_media_ready, "CXL"); -int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, - struct cxl_hdm *cxlhdm, - struct cxl_endpoint_dvsec_info *info) -{ - int rc = 0, index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_dev(cxlds->dev)) - rc = 0; - else - rc = cxl_hdm_decode_init(cxlds, cxlhdm, info); - put_cxl_mock_ops(index); - - return rc; -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_hdm_decode_init, "CXL"); - -int __wrap_cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, - struct cxl_endpoint_dvsec_info *info) -{ - int rc = 0, index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if 
(ops && ops->is_mock_dev(cxlds->dev)) - rc = 0; - else - rc = cxl_dvsec_rr_decode(cxlds, info); - put_cxl_mock_ops(index); - - return rc; -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dvsec_rr_decode, "CXL"); - struct cxl_dport *__wrap_devm_cxl_add_rch_dport(struct cxl_port *port, struct device *dport_dev, int port_id, diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index d1b0271d28220..9d5ad3fd55ecc 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -20,11 +20,8 @@ struct cxl_mock_ops { bool (*is_mock_port)(struct device *dev); bool (*is_mock_dev)(struct device *dev); int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port); - struct cxl_hdm *(*devm_cxl_setup_hdm)( - struct cxl_port *port, struct cxl_endpoint_dvsec_info *info); - int (*devm_cxl_add_passthrough_decoder)(struct cxl_port *port); - int (*devm_cxl_enumerate_decoders)( - struct cxl_hdm *hdm, struct cxl_endpoint_dvsec_info *info); + int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port); + int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port); void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); }; From 11b6f7c7f7c6543bff7e9a3e2b67579bd0481ac9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:24 -0700 Subject: [PATCH 021/143] cxl: Defer dport allocation for switch ports The current implementation enumerates the dports during the cxl_port driver probe. Without an endpoint connected, the dport may not be active during port probe. This scheme may prevent a valid hardware dport id to be retrieved and MMIO registers to be read when an endpoint is hot-plugged. Move the dport allocation and setup to behind memdev probe so the endpoint is guaranteed to be connected. In the original enumeration behavior, there are 3 phases (or 2 if no CXL switches) for port creation. cxl_acpi() creates a Root Port (RP) from the ACPI0017.N device. 
Through that it enumerates downstream ports composed of ACPI0016.N devices through add_host_bridge_dport(). Once done, it uses add_host_bridge_uport() to create the ports that enumerate the PCI RPs as the dports of these ports. Every time a port is created, the port driver is attached, cxl_switch_port_probe() is called and devm_cxl_port_enumerate_dports() is invoked to enumerate and probe the dports. The second phase applies if there are any CXL switches. When the pci endpoint device driver (cxl_pci) calls probe, it will add a mem device and trigger cxl_mem_probe(). cxl_mem_probe() calls devm_cxl_enumerate_ports() and attempts to discover and create all the ports that represent CXL switches. During this phase, a port is created per switch and the attached dports are also enumerated and probed. The last phase is creating the endpoint port, which happens for all endpoint devices. The new sequence, instead of creating all possible dports at initial port creation, defers port instantiation until a memdev beneath that dport arrives. Introduce devm_cxl_create_or_extend_port() to centralize the creation and extension of ports with new dports as memory devices arrive. As part of this rework, the switch decoder target list is amended at runtime as dports show up. While the decoders are allocated during the port driver probe, the decoders must also be updated since previously they were set up when all the dports were set up. Now every time a dport is set up per endpoint, the switch target listing needs to be updated with the new dport. A guard(rwsem_write) is used to update decoder targets. This is similar to when decoder_populate_targets() is called and the decoder programming must be protected. Also the port registers are probed the first time when the first dport shows up. This ensures that the CXL link is established when the port registers are probed.
[dj] Use ERR_CAST() (Jonathan) Link: https://lore.kernel.org/linux-cxl/20250305100123.3077031-1-rrichter@amd.com/ Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang (cherry picked from commit 4f06d81e7c6a02f850bfe9812295b1e859ab2db0) Signed-off-by: Jiandi An --- drivers/cxl/core/cdat.c | 2 +- drivers/cxl/core/core.h | 2 + drivers/cxl/core/hdm.c | 6 - drivers/cxl/core/pci.c | 46 ++++++++ drivers/cxl/core/port.c | 240 ++++++++++++++++++++++++++++++++-------- drivers/cxl/port.c | 11 +- 6 files changed, 247 insertions(+), 60 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index bca1ec279651d..44c1c778b7cce 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -338,7 +338,7 @@ static int match_cxlrd_hb(struct device *dev, void *data) guard(rwsem_read)(&cxl_rwsem.region); for (int i = 0; i < cxlsd->nr_targets; i++) { - if (host_bridge == cxlsd->target[i]->dport_dev) + if (cxlsd->target[i] && host_bridge == cxlsd->target[i]->dport_dev) return 1; } diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b7777..c7c314a372a95 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -147,6 +147,8 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, int cxl_ras_init(void); void cxl_ras_exit(void); int cxl_gpf_port_setup(struct cxl_dport *dport); +struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev); struct cxl_hdm; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index d435178f63b82..4ecbf1d23bc59 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -52,8 +52,6 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld) static int devm_cxl_add_passthrough_decoder(struct cxl_port *port) { struct cxl_switch_decoder *cxlsd; - struct cxl_dport *dport = NULL; - unsigned long index; struct cxl_hdm 
*cxlhdm = dev_get_drvdata(&port->dev); /* @@ -69,10 +67,6 @@ static int devm_cxl_add_passthrough_decoder(struct cxl_port *port) device_lock_assert(&port->dev); - xa_for_each(&port->dports, index, dport) - break; - cxlsd->cxld.target_map[0] = dport->port_id; - return add_hdm_decoder(port, &cxlsd->cxld); } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index fa02366d35f2d..9ec288ed39aea 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -24,6 +24,52 @@ static unsigned short media_ready_timeout = 60; module_param(media_ready_timeout, ushort, 0644); MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready"); +static int pci_get_port_num(struct pci_dev *pdev) +{ + u32 lnkcap; + int type; + + type = pci_pcie_type(pdev); + if (type != PCI_EXP_TYPE_DOWNSTREAM && type != PCI_EXP_TYPE_ROOT_PORT) + return -EINVAL; + + if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, + &lnkcap)) + return -ENXIO; + + return FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); +} + +/** + * devm_cxl_add_dport_by_dev - allocate a dport by the dport device + * @port: cxl_port that hosts the dport + * @dport_dev: 'struct device' of the dport + * + * Returns the allocated dport on success or ERR_PTR() of -errno on error + */ +struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_register_map map; + struct pci_dev *pdev; + int port_num, rc; + + if (!dev_is_pci(dport_dev)) + return ERR_PTR(-EINVAL); + + pdev = to_pci_dev(dport_dev); + port_num = pci_get_port_num(pdev); + if (port_num < 0) + return ERR_PTR(port_num); + + rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + return ERR_PTR(rc); + + device_lock_assert(&port->dev); + return devm_cxl_add_dport(port, dport_dev, port_num, map.resource); +} + struct cxl_walk_context { struct pci_bus *bus; struct cxl_port *port; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c36e089e53990..c016eaa1e91b0 100644 --- 
a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1358,21 +1358,6 @@ static struct cxl_port *find_cxl_port(struct device *dport_dev, return port; } -static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port, - struct device *dport_dev, - struct cxl_dport **dport) -{ - struct cxl_find_port_ctx ctx = { - .dport_dev = dport_dev, - .parent_port = parent_port, - .dport = dport, - }; - struct cxl_port *port; - - port = __find_cxl_port(&ctx); - return port; -} - /* * All users of grandparent() are using it to walk PCIe-like switch port * hierarchy. A PCIe switch is comprised of a bridge device representing the @@ -1548,13 +1533,154 @@ static resource_size_t find_component_registers(struct device *dev) return map.resource; } +static int match_port_by_uport(struct device *dev, const void *data) +{ + const struct device *uport_dev = data; + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + + port = to_cxl_port(dev); + return uport_dev == port->uport_dev; +} + +/* + * Function takes a device reference on the port device. Caller should do a + * put_device() when done. 
+ */ +static struct cxl_port *find_cxl_port_by_uport(struct device *uport_dev) +{ + struct device *dev; + + dev = bus_find_device(&cxl_bus_type, NULL, uport_dev, match_port_by_uport); + if (dev) + return to_cxl_port(dev); + return NULL; +} + +static int update_decoder_targets(struct device *dev, void *data) +{ + struct cxl_dport *dport = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int i; + + if (!is_switch_decoder(dev)) + return 0; + + cxlsd = to_cxl_switch_decoder(dev); + cxld = &cxlsd->cxld; + guard(rwsem_write)(&cxl_rwsem.region); + + for (i = 0; i < cxld->interleave_ways; i++) { + if (cxld->target_map[i] == dport->port_id) { + cxlsd->target[i] = dport; + dev_dbg(dev, "dport%d found in target list, index %d\n", + dport->port_id, i); + return 1; + } + } + + return 0; +} + +DEFINE_FREE(del_cxl_dport, struct cxl_dport *, if (!IS_ERR_OR_NULL(_T)) del_dport(_T)) +static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_dport *dport; + int rc; + + device_lock_assert(&port->dev); + if (!port->dev.driver) + return ERR_PTR(-ENXIO); + + dport = cxl_find_dport_by_dev(port, dport_dev); + if (dport) { + dev_dbg(&port->dev, "dport%d:%s already exists\n", + dport->port_id, dev_name(dport_dev)); + return ERR_PTR(-EBUSY); + } + + struct cxl_dport *new_dport __free(del_cxl_dport) = + devm_cxl_add_dport_by_dev(port, dport_dev); + if (IS_ERR(new_dport)) + return new_dport; + + cxl_switch_parse_cdat(port); + + if (ida_is_empty(&port->decoder_ida)) { + rc = devm_cxl_switch_port_decoders_setup(port); + if (rc) + return ERR_PTR(rc); + dev_dbg(&port->dev, "first dport%d:%s added with decoders\n", + new_dport->port_id, dev_name(dport_dev)); + return no_free_ptr(new_dport); + } + + /* New dport added, update the decoder targets */ + device_for_each_child(&port->dev, new_dport, update_decoder_targets); + + dev_dbg(&port->dev, "dport%d:%s added\n", new_dport->port_id, + dev_name(dport_dev)); + + return 
no_free_ptr(new_dport); +} + +static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev, + struct cxl_port *parent_port, + struct cxl_dport *parent_dport, + struct device *uport_dev, + struct device *dport_dev) +{ + resource_size_t component_reg_phys; + + device_lock_assert(&parent_port->dev); + if (!parent_port->dev.driver) { + dev_warn(ep_dev, + "port %s:%s:%s disabled, failed to enumerate CXL.mem\n", + dev_name(&parent_port->dev), dev_name(uport_dev), + dev_name(dport_dev)); + } + + struct cxl_port *port __free(put_cxl_port) = + find_cxl_port_by_uport(uport_dev); + if (!port) { + component_reg_phys = find_component_registers(uport_dev); + port = devm_cxl_add_port(&parent_port->dev, uport_dev, + component_reg_phys, parent_dport); + if (IS_ERR(port)) + return ERR_CAST(port); + + /* + * retry to make sure a port is found. a port device + * reference is taken. + */ + port = find_cxl_port_by_uport(uport_dev); + if (!port) + return ERR_PTR(-ENODEV); + + dev_dbg(ep_dev, "created port %s:%s\n", + dev_name(&port->dev), dev_name(port->uport_dev)); + } else { + /* + * Port was created before right before this function is + * called. Signal the caller to deal with it. 
+ */ + return ERR_PTR(-EAGAIN); + } + + guard(device)(&port->dev); + return cxl_port_add_dport(port, dport_dev); +} + static int add_port_attach_ep(struct cxl_memdev *cxlmd, struct device *uport_dev, struct device *dport_dev) { struct device *dparent = grandparent(dport_dev); struct cxl_dport *dport, *parent_dport; - resource_size_t component_reg_phys; int rc; if (is_cxl_host_bridge(dparent)) { @@ -1569,42 +1695,31 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, } struct cxl_port *parent_port __free(put_cxl_port) = - find_cxl_port(dparent, &parent_dport); + find_cxl_port_by_uport(dparent->parent); if (!parent_port) { /* iterate to create this parent_port */ return -EAGAIN; } - /* - * Definition with __free() here to keep the sequence of - * dereferencing the device of the port before the parent_port releasing. - */ - struct cxl_port *port __free(put_cxl_port) = NULL; scoped_guard(device, &parent_port->dev) { - if (!parent_port->dev.driver) { - dev_warn(&cxlmd->dev, - "port %s:%s disabled, failed to enumerate CXL.mem\n", - dev_name(&parent_port->dev), dev_name(uport_dev)); - return -ENXIO; + parent_dport = cxl_find_dport_by_dev(parent_port, dparent); + if (!parent_dport) { + parent_dport = cxl_port_add_dport(parent_port, dparent); + if (IS_ERR(parent_dport)) + return PTR_ERR(parent_dport); } - port = find_cxl_port_at(parent_port, dport_dev, &dport); - if (!port) { - component_reg_phys = find_component_registers(uport_dev); - port = devm_cxl_add_port(&parent_port->dev, uport_dev, - component_reg_phys, parent_dport); - if (IS_ERR(port)) - return PTR_ERR(port); - - /* retry find to pick up the new dport information */ - port = find_cxl_port_at(parent_port, dport_dev, &dport); - if (!port) - return -ENXIO; + dport = devm_cxl_create_port(&cxlmd->dev, parent_port, + parent_dport, uport_dev, + dport_dev); + if (IS_ERR(dport)) { + /* Port already exists, restart iteration */ + if (PTR_ERR(dport) == -EAGAIN) + return 0; + return PTR_ERR(dport); } } - 
dev_dbg(&cxlmd->dev, "add to new port %s:%s\n", - dev_name(&port->dev), dev_name(port->uport_dev)); rc = cxl_add_ep(dport, &cxlmd->dev); if (rc == -EBUSY) { /* @@ -1617,6 +1732,25 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, return rc; } +static struct cxl_dport *find_or_add_dport(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_dport *dport; + + device_lock_assert(&port->dev); + dport = cxl_find_dport_by_dev(port, dport_dev); + if (!dport) { + dport = cxl_port_add_dport(port, dport_dev); + if (IS_ERR(dport)) + return dport; + + /* New dport added, restart iteration */ + return ERR_PTR(-EAGAIN); + } + + return dport; +} + int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) { struct device *dev = &cxlmd->dev; @@ -1659,12 +1793,26 @@ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) dev_name(iter), dev_name(dport_dev), dev_name(uport_dev)); struct cxl_port *port __free(put_cxl_port) = - find_cxl_port(dport_dev, &dport); + find_cxl_port_by_uport(uport_dev); if (port) { dev_dbg(&cxlmd->dev, "found already registered port %s:%s\n", dev_name(&port->dev), dev_name(port->uport_dev)); + + /* + * RP port enumerated by cxl_acpi without dport will + * have the dport added here. 
+ */ + scoped_guard(device, &port->dev) { + dport = find_or_add_dport(port, dport_dev); + if (IS_ERR(dport)) { + if (PTR_ERR(dport) == -EAGAIN) + goto retry; + return PTR_ERR(dport); + } + } + rc = cxl_add_ep(dport, &cxlmd->dev); /* @@ -1724,14 +1872,16 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, device_lock_assert(&port->dev); if (xa_empty(&port->dports)) - return -EINVAL; + return 0; guard(rwsem_write)(&cxl_rwsem.region); for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { struct cxl_dport *dport = find_dport(port, cxld->target_map[i]); - if (!dport) - return -ENXIO; + if (!dport) { + /* dport may be activated later */ + continue; + } cxlsd->target[i] = dport; } diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index d8cae2b5bac6c..51c8f2f84717a 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -59,18 +59,13 @@ static int discover_region(struct device *dev, void *unused) static int cxl_switch_port_probe(struct cxl_port *port) { - int rc; + /* Reset nr_dports for rebind of driver */ + port->nr_dports = 0; /* Cache the data early to ensure is_visible() works */ read_cdat_data(port); - rc = devm_cxl_port_enumerate_dports(port); - if (rc < 0) - return rc; - - cxl_switch_parse_cdat(port); - - return devm_cxl_switch_port_decoders_setup(port); + return 0; } static int cxl_endpoint_port_probe(struct cxl_port *port) From 6762d6e771ab8acf14568b4554ebab9f70cc41e8 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:25 -0700 Subject: [PATCH 022/143] cxl/test: Add mock version of devm_cxl_add_dport_by_dev() devm_cxl_add_dport_by_dev() outside of cxl_test is done through PCI hierarchy. However with cxl_test, it needs to be done through the platform device hierarchy. Add the mock function for devm_cxl_add_dport_by_dev(). When cxl_core calls a cxl_core exported function and that function is mocked by cxl_test, the call chain causes a circular dependency issue. Dan provided a workaround to avoid this issue. 
Apply the method to changes from the late dport allocation changes in order to enable cxl-test. In cxl_core they are defined with "__" added in front of the function. A macro is used to define the original function names for when non-test version of the kernel is built. A bit of macros and typedefs are used to allow mocking of those functions in cxl_test. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Tested-by: Alison Schofield Tested-by: Robert Richter Signed-off-by: Dave Jiang (cherry picked from commit d96eb90d9ca6e4652c8a23d48c94364aa061fdc4) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 2 -- drivers/cxl/core/pci.c | 7 ++-- drivers/cxl/cxl.h | 20 +++++++++++ tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/cxl_core_exports.c | 12 +++++++ tools/testing/cxl/exports.h | 10 ++++++ tools/testing/cxl/test/cxl.c | 53 ++++++++++++++++++++++++++-- tools/testing/cxl/test/mock.c | 23 ++++++++++++ tools/testing/cxl/test/mock.h | 2 ++ 9 files changed, 123 insertions(+), 7 deletions(-) create mode 100644 tools/testing/cxl/exports.h diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index c7c314a372a95..1fb66132b7777 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -147,8 +147,6 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, int cxl_ras_init(void); void cxl_ras_exit(void); int cxl_gpf_port_setup(struct cxl_dport *dport); -struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev); struct cxl_hdm; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 9ec288ed39aea..18825e1505d6a 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -41,14 +41,14 @@ static int pci_get_port_num(struct pci_dev *pdev) } /** - * devm_cxl_add_dport_by_dev - allocate a dport by the dport device + * __devm_cxl_add_dport_by_dev - allocate a 
dport by dport device * @port: cxl_port that hosts the dport * @dport_dev: 'struct device' of the dport * * Returns the allocated dport on success or ERR_PTR() of -errno on error */ -struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) +struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) { struct cxl_register_map map; struct pci_dev *pdev; @@ -69,6 +69,7 @@ struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, device_lock_assert(&port->dev); return devm_cxl_add_dport(port, dport_dev, port_num, map.resource); } +EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL"); struct cxl_walk_context { struct pci_bus *bus; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index e4f37c143c1ef..ed0df7db628ac 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -914,6 +914,10 @@ void cxl_coordinates_combine(struct access_coordinate *out, struct access_coordinate *c2); bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); +struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev); +struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev); /* * Unit test builds overrides this to __weak, find the 'strong' version @@ -924,4 +928,20 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); #endif u16 cxl_gpf_get_dvsec(struct device *dev); + +/* + * Declaration for functions that are mocked by cxl_test that are called by + * cxl_core. The respective functions are defined as __foo() and called by + * cxl_core as foo(). The macros below ensures that those functions would + * exist as foo(). See tools/testing/cxl/cxl_core_exports.c and + * tools/testing/cxl/exports.h for setting up the mock functions. The dance + * is done to avoid a circular dependency where cxl_core calls a function that + * ends up being a mock function and goes to * cxl_test where it calls a + * cxl_core function. 
+ */ +#ifndef CXL_TEST_ENABLE +#define DECLARE_TESTABLE(x) __##x +#define devm_cxl_add_dport_by_dev DECLARE_TESTABLE(devm_cxl_add_dport_by_dev) +#endif + #endif /* __CXL_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 51b8ab289eae9..81e3795673c5a 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -18,6 +18,7 @@ CXL_SRC := $(DRIVERS)/cxl CXL_CORE_SRC := $(DRIVERS)/cxl/core ccflags-y := -I$(srctree)/drivers/cxl/ ccflags-y += -D__mock=__weak +ccflags-y += -DCXL_TEST_ENABLE=1 ccflags-y += -DTRACE_INCLUDE_PATH=$(CXL_CORE_SRC) -I$(srctree)/drivers/cxl/core/ obj-m += cxl_acpi.o diff --git a/tools/testing/cxl/cxl_core_exports.c b/tools/testing/cxl/cxl_core_exports.c index f088792a8925f..0d18abc1f5a31 100644 --- a/tools/testing/cxl/cxl_core_exports.c +++ b/tools/testing/cxl/cxl_core_exports.c @@ -2,6 +2,18 @@ /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ #include "cxl.h" +#include "exports.h" /* Exporting of cxl_core symbols that are only used by cxl_test */ EXPORT_SYMBOL_NS_GPL(cxl_num_decoders_committed, "CXL"); + +cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev = + __devm_cxl_add_dport_by_dev; +EXPORT_SYMBOL_NS_GPL(_devm_cxl_add_dport_by_dev, "CXL"); + +struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) +{ + return _devm_cxl_add_dport_by_dev(port, dport_dev); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL"); diff --git a/tools/testing/cxl/exports.h b/tools/testing/cxl/exports.h new file mode 100644 index 0000000000000..9261ce6f11973 --- /dev/null +++ b/tools/testing/cxl/exports.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2025 Intel Corporation */ +#ifndef __MOCK_CXL_EXPORTS_H_ +#define __MOCK_CXL_EXPORTS_H_ + +typedef struct cxl_dport *(*cxl_add_dport_by_dev_fn)(struct cxl_port *port, + struct device *dport_dev); +extern cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev; + +#endif diff --git 
a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 36dff58275a25..b10434236590f 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1036,10 +1036,12 @@ static int mock_cxl_endpoint_decoders_setup(struct cxl_port *port) return __mock_cxl_decoders_setup(port); } -static int mock_cxl_port_enumerate_dports(struct cxl_port *port) +static int get_port_array(struct cxl_port *port, + struct platform_device ***port_array, + int *port_array_size) { struct platform_device **array; - int i, array_size; + int array_size; if (port->depth == 1) { if (is_multi_bridge(port->uport_dev)) { @@ -1073,6 +1075,22 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port) return -ENXIO; } + *port_array = array; + *port_array_size = array_size; + + return 0; +} + +static int mock_cxl_port_enumerate_dports(struct cxl_port *port) +{ + struct platform_device **array; + int i, array_size; + int rc; + + rc = get_port_array(port, &array, &array_size); + if (rc) + return rc; + for (i = 0; i < array_size; i++) { struct platform_device *pdev = array[i]; struct cxl_dport *dport; @@ -1094,6 +1112,36 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port) return 0; } +static struct cxl_dport *mock_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) +{ + struct platform_device **array; + int rc, i, array_size; + + rc = get_port_array(port, &array, &array_size); + if (rc) + return ERR_PTR(rc); + + for (i = 0; i < array_size; i++) { + struct platform_device *pdev = array[i]; + + if (pdev->dev.parent != port->uport_dev) { + dev_dbg(&port->dev, "%s: mismatch parent %s\n", + dev_name(port->uport_dev), + dev_name(pdev->dev.parent)); + continue; + } + + if (&pdev->dev != dport_dev) + continue; + + return devm_cxl_add_dport(port, &pdev->dev, pdev->id, + CXL_RESOURCE_NONE); + } + + return ERR_PTR(-ENODEV); +} + /* * Faking the cxl_dpa_perf for the memdev when appropriate. 
*/ @@ -1154,6 +1202,7 @@ static struct cxl_mock_ops cxl_mock_ops = { .devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup, .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, + .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index f335889b7756a..e98101f083cd3 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -10,12 +10,18 @@ #include #include #include "mock.h" +#include "../exports.h" static LIST_HEAD(mock); +static struct cxl_dport * +redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev); + void register_cxl_mock_ops(struct cxl_mock_ops *ops) { list_add_rcu(&ops->list, &mock); + _devm_cxl_add_dport_by_dev = redirect_devm_cxl_add_dport_by_dev; } EXPORT_SYMBOL_GPL(register_cxl_mock_ops); @@ -23,6 +29,7 @@ DEFINE_STATIC_SRCU(cxl_mock_srcu); void unregister_cxl_mock_ops(struct cxl_mock_ops *ops) { + _devm_cxl_add_dport_by_dev = __devm_cxl_add_dport_by_dev; list_del_rcu(&ops->list); synchronize_srcu(&cxl_mock_srcu); } @@ -258,6 +265,22 @@ void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL"); +struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + struct cxl_dport *dport; + + if (ops && ops->is_mock_port(port->uport_dev)) + dport = ops->devm_cxl_add_dport_by_dev(port, dport_dev); + else + dport = __devm_cxl_add_dport_by_dev(port, dport_dev); + put_cxl_mock_ops(index); + + return dport; +} + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("cxl_test: emulation module"); MODULE_IMPORT_NS("ACPI"); diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 
9d5ad3fd55ecc..4ed932e76aae8 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -23,6 +23,8 @@ struct cxl_mock_ops { int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port); int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port); void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); + struct cxl_dport *(*devm_cxl_add_dport_by_dev)(struct cxl_port *port, + struct device *dport_dev); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From 9982139286a0498afb1ec68094d037351d200ce6 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:26 -0700 Subject: [PATCH 023/143] cxl/test: Adjust the mock version of devm_cxl_switch_port_decoders_setup() With devm_cxl_switch_port_decoders_setup() being called within cxl_core instead of by the port driver probe, adjustments are needed to deal with circular symbol dependency when this function is being mock'd. Add the appropriate changes to get around the circular dependency. Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang (cherry picked from commit 644685abc16b58b3afcc2feb0ac14e86476ca2ed) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 6 +++--- drivers/cxl/cxl.h | 2 ++ tools/testing/cxl/Kbuild | 1 - tools/testing/cxl/cxl_core_exports.c | 10 ++++++++++ tools/testing/cxl/exports.h | 3 +++ tools/testing/cxl/test/mock.c | 10 +++++++--- 6 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 4ecbf1d23bc59..de78601821e60 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -1219,12 +1219,12 @@ static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, } /** - * devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders + * __devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders * @port: CXL port context * * Return 0 or -errno on error */ -int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +int 
__devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { struct cxl_hdm *cxlhdm; @@ -1248,7 +1248,7 @@ int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) dev_err(&port->dev, "HDM decoder capability not found\n"); return -ENXIO; } -EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); +EXPORT_SYMBOL_NS_GPL(__devm_cxl_switch_port_decoders_setup, "CXL"); /** * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ed0df7db628ac..7374c81f55f44 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -819,6 +819,7 @@ struct cxl_endpoint_dvsec_info { }; int devm_cxl_switch_port_decoders_setup(struct cxl_port *port); +int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port); int devm_cxl_endpoint_decoders_setup(struct cxl_port *port); struct cxl_dev_state; @@ -942,6 +943,7 @@ u16 cxl_gpf_get_dvsec(struct device *dev); #ifndef CXL_TEST_ENABLE #define DECLARE_TESTABLE(x) __##x #define devm_cxl_add_dport_by_dev DECLARE_TESTABLE(devm_cxl_add_dport_by_dev) +#define devm_cxl_switch_port_decoders_setup DECLARE_TESTABLE(devm_cxl_switch_port_decoders_setup) #endif #endif /* __CXL_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 81e3795673c5a..0d5ce4b74b9f7 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -10,7 +10,6 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting -ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup DRIVERS := ../../../drivers diff --git a/tools/testing/cxl/cxl_core_exports.c b/tools/testing/cxl/cxl_core_exports.c index 0d18abc1f5a31..6754de35598d5 100644 --- a/tools/testing/cxl/cxl_core_exports.c +++ b/tools/testing/cxl/cxl_core_exports.c @@ -17,3 +17,13 @@ struct cxl_dport *devm_cxl_add_dport_by_dev(struct 
cxl_port *port, return _devm_cxl_add_dport_by_dev(port, dport_dev); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL"); + +cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup = + __devm_cxl_switch_port_decoders_setup; +EXPORT_SYMBOL_NS_GPL(_devm_cxl_switch_port_decoders_setup, "CXL"); + +int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +{ + return _devm_cxl_switch_port_decoders_setup(port); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); diff --git a/tools/testing/cxl/exports.h b/tools/testing/cxl/exports.h index 9261ce6f11973..7ebee7c0bd67e 100644 --- a/tools/testing/cxl/exports.h +++ b/tools/testing/cxl/exports.h @@ -7,4 +7,7 @@ typedef struct cxl_dport *(*cxl_add_dport_by_dev_fn)(struct cxl_port *port, struct device *dport_dev); extern cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev; +typedef int(*cxl_switch_decoders_setup_fn)(struct cxl_port *port); +extern cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup; + #endif diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index e98101f083cd3..995269a75cbd1 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -17,11 +17,14 @@ static LIST_HEAD(mock); static struct cxl_dport * redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev); +static int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port); void register_cxl_mock_ops(struct cxl_mock_ops *ops) { list_add_rcu(&ops->list, &mock); _devm_cxl_add_dport_by_dev = redirect_devm_cxl_add_dport_by_dev; + _devm_cxl_switch_port_decoders_setup = + redirect_devm_cxl_switch_port_decoders_setup; } EXPORT_SYMBOL_GPL(register_cxl_mock_ops); @@ -29,6 +32,8 @@ DEFINE_STATIC_SRCU(cxl_mock_srcu); void unregister_cxl_mock_ops(struct cxl_mock_ops *ops) { + _devm_cxl_switch_port_decoders_setup = + __devm_cxl_switch_port_decoders_setup; _devm_cxl_add_dport_by_dev = __devm_cxl_add_dport_by_dev; list_del_rcu(&ops->list); 
synchronize_srcu(&cxl_mock_srcu); @@ -138,7 +143,7 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); -int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); @@ -146,12 +151,11 @@ int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) if (ops && ops->is_mock_port(port->uport_dev)) rc = ops->devm_cxl_switch_port_decoders_setup(port); else - rc = devm_cxl_switch_port_decoders_setup(port); + rc = __devm_cxl_switch_port_decoders_setup(port); put_cxl_mock_ops(index); return rc; } -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_switch_port_decoders_setup, "CXL"); int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) { From 95be66117d66c0bc98c1926dbeb1346d9000d524 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:27 -0700 Subject: [PATCH 024/143] cxl/test: Setup target_map for cxl_test decoder initialization cxl_test uses mock functions for decoder enumeration. Add initialization of the cxld->target_map[] for cxl_test based decoders in the mock functions.
Reviewed-by: Jonathan Cameron Tested-by: Robert Richter Reviewed-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit 87439b598ad962ffc5744e2e0a8b461e78d8d32f) Signed-off-by: Jiandi An --- tools/testing/cxl/test/cxl.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index b10434236590f..cb18ee41a7cf8 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -887,15 +887,21 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) */ if (WARN_ON(!dev)) continue; + cxlsd = to_cxl_switch_decoder(dev); if (i == 0) { /* put cxl_mem.4 second in the decode order */ - if (pdev->id == 4) + if (pdev->id == 4) { cxlsd->target[1] = dport; - else + cxld->target_map[1] = dport->port_id; + } else { cxlsd->target[0] = dport; - } else + cxld->target_map[0] = dport->port_id; + } + } else { cxlsd->target[0] = dport; + cxld->target_map[0] = dport->port_id; + } cxld = &cxlsd->cxld; cxld->target_type = CXL_DECODER_HOSTONLYMEM; cxld->flags = CXL_DECODER_F_ENABLE; From a7610893815f428abc4108df22421f3390dd4b0a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 11:09:28 -0700 Subject: [PATCH 025/143] cxl: Change sslbis handler to only handle single dport While cxl_switch_parse_cdat() is harmless to be run multiple times, it is not efficient in the current scheme where one dport is being updated at a time by the memdev probe path. Change the input parameter to the specific dport being updated to pick up the SSLBIS information for just that dport. 
Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Tested-by: Robert Richter Signed-off-by: Dave Jiang (cherry picked from commit d64035a5a37741b25712fb9c2f6aca535c2967ea) Signed-off-by: Jiandi An --- drivers/cxl/core/cdat.c | 23 ++++++++++------------- drivers/cxl/core/port.c | 2 +- drivers/cxl/cxl.h | 2 +- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 44c1c778b7cce..c4bd6e8a0cf03 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -440,8 +440,8 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, } *tbl = (struct acpi_cdat_sslbis_table *)header; int size = sizeof(header->cdat) + sizeof(tbl->sslbis_header); struct acpi_cdat_sslbis *sslbis; - struct cxl_port *port = arg; - struct device *dev = &port->dev; + struct cxl_dport *dport = arg; + struct device *dev = &dport->port->dev; int remain, entries, i; u16 len; @@ -467,8 +467,6 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, u16 y = le16_to_cpu((__force __le16)tbl->entries[i].porty_id); __le64 le_base; __le16 le_val; - struct cxl_dport *dport; - unsigned long index; u16 dsp_id; u64 val; @@ -499,28 +497,27 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), sslbis->data_type); - xa_for_each(&port->dports, index, dport) { - if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || - dsp_id == dport->port_id) { - cxl_access_coordinate_set(dport->coord, - sslbis->data_type, - val); - } + if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, + sslbis->data_type, val); + return 0; } } return 0; } -void cxl_switch_parse_cdat(struct cxl_port *port) +void cxl_switch_parse_cdat(struct cxl_dport *dport) { + struct cxl_port *port = dport->port; int rc; if 
(!port->cdat.table) return; rc = cdat_table_parse(ACPI_CDAT_TYPE_SSLBIS, cdat_sslbis_handler, - port, port->cdat.table, port->cdat.length); + dport, port->cdat.table, port->cdat.length); rc = cdat_table_parse_output(rc); if (rc) dev_dbg(&port->dev, "Failed to parse SSLBIS: %d\n", rc); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c016eaa1e91b0..960d8eb6275e5 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1608,7 +1608,7 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, if (IS_ERR(new_dport)) return new_dport; - cxl_switch_parse_cdat(port); + cxl_switch_parse_cdat(new_dport); if (ida_is_empty(&port->decoder_ida)) { rc = devm_cxl_switch_port_decoders_setup(port); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 7374c81f55f44..0e0e518031fb0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -900,7 +900,7 @@ static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); -void cxl_switch_parse_cdat(struct cxl_port *port); +void cxl_switch_parse_cdat(struct cxl_dport *dport); int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord); From fe39189d5dce7f1847811cb4ba2aeeab298c2168 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 14 Aug 2025 15:21:44 -0700 Subject: [PATCH 026/143] cxl: Move port register setup to when first dport appear This patch moves the port register setup to when the first dport appears via the memdev probe path. At this point, the CXL link should be established and the register access is expected to succeed. This change addresses an error message observed when PCIe hotplug is enabled on an Intel platform. The error messages "cxl portN: Couldn't locate the CXL.cache and CXL.mem capability array header" is observed for the host bridge (CHBCR) during cxl_acpi driver probe. 
If the cxl_acpi module probe is running before the CXL link between the endpoint device and the RP is established, then the platform may not have exposed DVSEC ID 3 and/or DVSEC ID 7 blocks which will trigger the error message. This behavior is defined by the CXL spec r3.2 9.12.3 for RPs and DSPs, however the Intel platform also added this behavior to the host bridge. This change also needs the dport enumeration to be moved to the memdev probe path in order to address the issue. This change is not a wholly contained solution by itself. [dj: Add missing var init during port alloc] Suggested-by: Dan Williams Reviewed-by: Jonathan Cameron Tested-by: Robert Richter Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang (cherry picked from commit f6ee24913de24dbda8d49213e1a27f5e1a5204cc) Signed-off-by: Jiandi An --- drivers/cxl/core/port.c | 17 ++++++++++++++--- drivers/cxl/cxl.h | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 960d8eb6275e5..d5f71eb1ade85 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -750,6 +750,7 @@ static struct cxl_port *cxl_port_alloc(struct device *uport_dev, xa_init(&port->dports); xa_init(&port->endpoints); xa_init(&port->regions); + port->component_reg_phys = CXL_RESOURCE_NONE; device_initialize(dev); lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth); @@ -868,9 +869,7 @@ static int cxl_port_add(struct cxl_port *port, if (rc) return rc; - rc = cxl_port_setup_regs(port, component_reg_phys); - if (rc) - return rc; + port->component_reg_phys = component_reg_phys; } else { rc = dev_set_name(dev, "root%d", port->id); if (rc) @@ -1201,6 +1200,18 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, cxl_debugfs_create_dport_dir(dport); + /* + * Setup port register if this is the first dport showed up. Having + * a dport also means that there is at least 1 active link. 
+ */ + if (port->nr_dports == 1 && + port->component_reg_phys != CXL_RESOURCE_NONE) { + rc = cxl_port_setup_regs(port, port->component_reg_phys); + if (rc) + return ERR_PTR(rc); + port->component_reg_phys = CXL_RESOURCE_NONE; + } + return dport; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 0e0e518031fb0..231ddccf89773 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -607,6 +607,7 @@ struct cxl_dax_region { * @cdat: Cached CDAT data * @cdat_available: Should a CDAT attribute be available in sysfs * @pci_latency: Upstream latency in picoseconds + * @component_reg_phys: Physical address of component register */ struct cxl_port { struct device dev; @@ -630,6 +631,7 @@ struct cxl_port { } cdat; bool cdat_available; long pci_latency; + resource_size_t component_reg_phys; }; /** From 7922d5cf26b682f614985a6df83bae7d4ccc4aa6 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Wed, 1 Oct 2025 14:03:37 +0800 Subject: [PATCH 027/143] cxl/port: Avoid missing port component registers setup port->nr_dports is used to represent how many dports added to the cxl port, it will increase in add_dport() when a new dport is being added to the cxl port, but it will not be reduced when a dport is removed from the cxl port. Currently, when the first dport is added to a cxl port, it will trigger component registers setup on the cxl port, the implementation is using port->nr_dports to confirm if the dport is the first dport. A corner case here is that adding dport could fail after port->nr_dports updating and before checking port->nr_dports for component registers setup. If the failure happens during the first dport attaching, it will cause that CXL subsystem has not chance to execute component registers setup for the cxl port. 
the failure flow like below: port->nr_dports = 0 dport 1 adding to the port: add_dport() # port->nr_dports: 1 failed on devm_add_action_or_reset() or sysfs_create_link() return error # port->nr_dports: 1 dport 2 adding to the port: add_dport() # port->nr_dports: 2 no failure skip component registers setup because of port->nr_dports is 2 The solution here is that moving component registers setup closer to add_dport(), so if add_dport() is executed correctly for the first dport, component registers setup on the port will be executed immediately after that. Fixes: f6ee24913de2 ("cxl: Move port register setup to when first dport appear") Signed-off-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang (cherry picked from commit 02e7567f5da023524476053a38c54f4f19130959) Signed-off-by: Jiandi An --- drivers/cxl/core/port.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index d5f71eb1ade85..8128fd2b5b317 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1182,6 +1182,20 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, if (rc) return ERR_PTR(rc); + /* + * Setup port register if this is the first dport showed up. Having + * a dport also means that there is at least 1 active link. + */ + if (port->nr_dports == 1 && + port->component_reg_phys != CXL_RESOURCE_NONE) { + rc = cxl_port_setup_regs(port, port->component_reg_phys); + if (rc) { + xa_erase(&port->dports, (unsigned long)dport->dport_dev); + return ERR_PTR(rc); + } + port->component_reg_phys = CXL_RESOURCE_NONE; + } + get_device(dport_dev); rc = devm_add_action_or_reset(host, cxl_dport_remove, dport); if (rc) @@ -1200,18 +1214,6 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, cxl_debugfs_create_dport_dir(dport); - /* - * Setup port register if this is the first dport showed up. 
Having - * a dport also means that there is at least 1 active link. - */ - if (port->nr_dports == 1 && - port->component_reg_phys != CXL_RESOURCE_NONE) { - rc = cxl_port_setup_regs(port, port->component_reg_phys); - if (rc) - return ERR_PTR(rc); - port->component_reg_phys = CXL_RESOURCE_NONE; - } - return dport; } From a7e9e0ce8262a0e2640a830073d2fb9a862b6e6f Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 14 Oct 2025 00:31:04 -0700 Subject: [PATCH 028/143] cxl/region: Use %pa printk format to emit resource_size_t KASAN reports a stack-out-of-bounds access in validate_region_offset() while running the cxl-poison.sh unit test because the printk format specifier, %pr format, is not a match for the resource_size_t type of the variables. %pr expects struct resource pointers and attempts to dereference the structure fields, reading beyond the bounds of the stack variables. Since these messages emit an 'A exceeds B' type of message, keep the resource_size_t's and use the %pa specifier to be architecture safe. BUG: KASAN: stack-out-of-bounds in resource_string.isra.0+0xe9a/0x1690 [] Read of size 8 at addr ffff88800a7afb40 by task bash/1397 ... 
[] The buggy address belongs to stack of task bash/1397 [] and is located at offset 56 in frame: [] validate_region_offset+0x0/0x1c0 [cxl_core] Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset") Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang Signed-off-by: Dave Jiang (cherry picked from commit 257c4b03a2f7d8c15f79c79b09a561af9734f6c4) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 32675a70cadf9..149f9bdabbb40 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3671,14 +3671,14 @@ static int validate_region_offset(struct cxl_region *cxlr, u64 offset) if (offset < p->cache_size) { dev_err(&cxlr->dev, - "Offset %#llx is within extended linear cache %pr\n", + "Offset %#llx is within extended linear cache %pa\n", offset, &p->cache_size); return -EINVAL; } region_size = resource_size(p->res); if (offset >= region_size) { - dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pr\n", + dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n", offset, &region_size); return -EINVAL; } From 4fd81efb72528ee14ed47890b97e04ab397f7845 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 31 Oct 2025 10:32:24 -0700 Subject: [PATCH 029/143] cxl: Adjust offset calculation for poison injection The HPA to DPA translation for poison injection assumes that the base address starts from where the CXL region begins. When the extended linear cache is active, the offset can be within the DRAM region. Adjust the offset so that it correctly reflects the offset within the CXL region.
[ dj: Add fixes tag from Alison ] Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset") Link: https://patch.msgid.link/20251031173224.3537030-5-dave.jiang@intel.com Reviewed-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit b6cfddd26ec55e865b4715f73e9bbb17a15091ed) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 149f9bdabbb40..cc18f2672ee7b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3710,6 +3710,7 @@ static int cxl_region_debugfs_poison_inject(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, @@ -3742,6 +3743,7 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, From 80192f42359001ae681cb5d771d923b221dde0d1 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 3 Oct 2025 10:32:32 -0400 Subject: [PATCH 030/143] Documentation/driver-api/cxl: remove page-allocator quirk section The node/zone quirk section of the cxl documentation is incorrect. The actual reason for fallback allocation misbehavior in the described configuration is due to a kswapd/reclaim thrashing scenario fixed by the linked patch. Remove this section. 
Link: https://lore.kernel.org/linux-mm/20250919162134.1098208-1-hannes@cmpxchg.org/ Signed-off-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang (cherry picked from commit 82b5d7e30b24b7df5dbf10aea97292be38daf88d) Signed-off-by: Jiandi An --- .../cxl/allocation/page-allocator.rst | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/Documentation/driver-api/cxl/allocation/page-allocator.rst b/Documentation/driver-api/cxl/allocation/page-allocator.rst index 7b8fe1b8d5bbb..3fa584a248bdd 100644 --- a/Documentation/driver-api/cxl/allocation/page-allocator.rst +++ b/Documentation/driver-api/cxl/allocation/page-allocator.rst @@ -41,37 +41,6 @@ To simplify this, the page allocator will prefer :code:`ZONE_MOVABLE` over will fallback to allocate from :code:`ZONE_NORMAL`. -Zone and Node Quirks -==================== -Let's consider a configuration where the local DRAM capacity is largely onlined -into :code:`ZONE_NORMAL`, with no :code:`ZONE_MOVABLE` capacity present. The -CXL capacity has the opposite configuration - all onlined in -:code:`ZONE_MOVABLE`. - -Under the default allocation policy, the page allocator will completely skip -:code:`ZONE_MOVABLE` as a valid allocation target. This is because, as of -Linux v6.15, the page allocator does (approximately) the following: :: - - for (each zone in local_node): - - for (each node in fallback_order): - - attempt_allocation(gfp_flags); - -Because the local node does not have :code:`ZONE_MOVABLE`, the CXL node is -functionally unreachable for direct allocation. As a result, the only way -for CXL capacity to be used is via `demotion` in the reclaim path. - -This configuration also means that if the DRAM ndoe has :code:`ZONE_MOVABLE` -capacity - when that capacity is depleted, the page allocator will actually -prefer CXL :code:`ZONE_MOVABLE` pages over DRAM :code:`ZONE_NORMAL` pages. - -We may wish to invert this priority in future Linux versions. 
- If `demotion` and `swap` are disabled, Linux will begin to cause OOM crashes -when the DRAM nodes are depleted. See the reclaim section for more details. - - CGroups and CPUSets =================== Finally, assuming CXL memory is reachable via the page allocation (i.e. onlined From 2d3a899e50bfd9b941aa8a1f1c1285470fbcb63f Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sat, 27 Sep 2025 18:07:09 +0800 Subject: [PATCH 031/143] cxl/port: Remove devm_cxl_port_enumerate_dports() devm_cxl_port_enumerate_dports() is no longer used after below commit commit 4f06d81e7c6a ("cxl: Defer dport allocation for switch ports") Delete it and the relevant interface implemented in cxl_test. Signed-off-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang (cherry picked from commit 3f5b8f7f34f6d8e63c02d177341e43ebee4c2d36) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 87 ++++------------------------------- drivers/cxl/cxlpci.h | 1 - tools/testing/cxl/Kbuild | 1 - tools/testing/cxl/test/cxl.c | 32 ------------- tools/testing/cxl/test/mock.c | 15 ------ tools/testing/cxl/test/mock.h | 1 - 6 files changed, 8 insertions(+), 129 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 18825e1505d6a..5b023a0178a47 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -71,85 +71,6 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL"); -struct cxl_walk_context { - struct pci_bus *bus; - struct cxl_port *port; - int type; - int error; - int count; -}; - -static int match_add_dports(struct pci_dev *pdev, void *data) -{ - struct cxl_walk_context *ctx = data; - struct cxl_port *port = ctx->port; - int type = pci_pcie_type(pdev); - struct cxl_register_map map; - struct cxl_dport *dport; - u32 lnkcap, port_num; - int rc; - - if (pdev->bus != ctx->bus) - return 0; - if (!pci_is_pcie(pdev)) - return 0; - if (type != ctx->type) - return 0; - if
(pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, - &lnkcap)) - return 0; - - rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); - if (rc) - dev_dbg(&port->dev, "failed to find component registers\n"); - - port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - dport = devm_cxl_add_dport(port, &pdev->dev, port_num, map.resource); - if (IS_ERR(dport)) { - ctx->error = PTR_ERR(dport); - return PTR_ERR(dport); - } - ctx->count++; - - return 0; -} - -/** - * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port - * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated - * - * Returns a positive number of dports enumerated or a negative error - * code. - */ -int devm_cxl_port_enumerate_dports(struct cxl_port *port) -{ - struct pci_bus *bus = cxl_port_to_pci_bus(port); - struct cxl_walk_context ctx; - int type; - - if (!bus) - return -ENXIO; - - if (pci_is_root_bus(bus)) - type = PCI_EXP_TYPE_ROOT_PORT; - else - type = PCI_EXP_TYPE_DOWNSTREAM; - - ctx = (struct cxl_walk_context) { - .port = port, - .bus = bus, - .type = type, - }; - pci_walk_bus(bus, match_add_dports, &ctx); - - if (ctx.count == 0) - return -ENODEV; - if (ctx.error) - return ctx.error; - return ctx.count; -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, "CXL"); - static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); @@ -1217,6 +1138,14 @@ int cxl_gpf_port_setup(struct cxl_dport *dport) return 0; } +struct cxl_walk_context { + struct pci_bus *bus; + struct cxl_port *port; + int type; + int error; + int count; +}; + static int count_dports(struct pci_dev *pdev, void *data) { struct cxl_walk_context *ctx = data; diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 7ae621e618e79..1d526bea84312 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -127,7 +127,6 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & 
PCI_EXP_LNKSTA2_FLIT; } -int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); void cxl_cor_error_detected(struct pci_dev *pdev); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0d5ce4b74b9f7..3dae06ac7fba5 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -4,7 +4,6 @@ ldflags-y += --wrap=is_acpi_device_node ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register -ldflags-y += --wrap=devm_cxl_port_enumerate_dports ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index cb18ee41a7cf8..fc271561827b6 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1087,37 +1087,6 @@ static int get_port_array(struct cxl_port *port, return 0; } -static int mock_cxl_port_enumerate_dports(struct cxl_port *port) -{ - struct platform_device **array; - int i, array_size; - int rc; - - rc = get_port_array(port, &array, &array_size); - if (rc) - return rc; - - for (i = 0; i < array_size; i++) { - struct platform_device *pdev = array[i]; - struct cxl_dport *dport; - - if (pdev->dev.parent != port->uport_dev) { - dev_dbg(&port->dev, "%s: mismatch parent %s\n", - dev_name(port->uport_dev), - dev_name(pdev->dev.parent)); - continue; - } - - dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, - CXL_RESOURCE_NONE); - - if (IS_ERR(dport)) - return PTR_ERR(dport); - } - - return 0; -} - static struct cxl_dport *mock_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev) { @@ -1206,7 +1175,6 @@ static struct cxl_mock_ops cxl_mock_ops = { .acpi_pci_find_root = mock_acpi_pci_find_root, .devm_cxl_switch_port_decoders_setup = mock_cxl_switch_port_decoders_setup, .devm_cxl_endpoint_decoders_setup = 
mock_cxl_endpoint_decoders_setup, - .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, .list = LIST_HEAD_INIT(cxl_mock_ops.list), diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 995269a75cbd1..6fd4edb9215c4 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -172,21 +172,6 @@ int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_endpoint_decoders_setup, "CXL"); -int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port) -{ - int rc, index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(port->uport_dev)) - rc = ops->devm_cxl_port_enumerate_dports(port); - else - rc = devm_cxl_port_enumerate_dports(port); - put_cxl_mock_ops(index); - - return rc; -} -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_port_enumerate_dports, "CXL"); - int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) { int rc, index; diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 4ed932e76aae8..580f383862245 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -19,7 +19,6 @@ struct cxl_mock_ops { bool (*is_mock_bus)(struct pci_bus *bus); bool (*is_mock_port)(struct device *dev); bool (*is_mock_dev)(struct device *dev); - int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port); int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port); int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port); void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); From 8ba94b80a9a7648abca24e6f8d8d59fed130d4e1 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sat, 11 Oct 2025 11:30:44 -0700 Subject: [PATCH 032/143] cxl: fix typos in cdat.c comments - Corrected spelling of "bandwdith" -> "bandwidth" - Fixed "wht" -> "with" Signed-off-by: Alok Tiwari 
Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Dave Jiang (cherry picked from commit 040acb49bf862dd851144bfc0872555d4ac4ffd5) Signed-off-by: Jiandi An --- drivers/cxl/core/cdat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c4bd6e8a0cf03..7120b5f2e31fe 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -826,7 +826,7 @@ static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr, cxl_coordinates_combine(coords, coords, ctx->coord); /* - * Take the min of the calculated bandwdith and the upstream + * Take the min of the calculated bandwidth and the upstream * switch SSLBIS bandwidth if there's a parent switch */ if (!is_root) @@ -949,7 +949,7 @@ static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa) /** * cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region * @cxlr: The region being operated on - * @input_xa: xarray holds cxl_perf_ctx wht calculated bandwidth per ACPI0017 instance + * @input_xa: xarray holds cxl_perf_ctx with calculated bandwidth per ACPI0017 instance */ static void cxl_region_update_bandwidth(struct cxl_region *cxlr, struct xarray *input_xa) From 866acc18eb466ce51702311277c570bb5d0cdbb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 30 Oct 2025 17:38:39 +0100 Subject: [PATCH 033/143] cxl/pci: replace use of system_wq with system_percpu_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq should be the per-cpu workqueue, yet in this name nothing makes that clear, so replace system_wq with system_percpu_wq. 
The old wq (system_wq) will be kept for a few release cycles. See 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") for cause of changes. [ dj: Add reference to commit that initiated the change. ] Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Acked-by: Davidlohr Bueso Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang --- Link: https://patch.msgid.link/20251030163839.307752-1-marco.crivellari@suse.com Signed-off-by: Dave Jiang (cherry picked from commit 952e9057e66c17a9718232664368ffdaca468f93) Signed-off-by: Jiandi An --- drivers/cxl/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index bd100ac31672d..0be4e508affe7 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -136,7 +136,7 @@ static irqreturn_t cxl_pci_mbox_irq(int irq, void *id) if (opcode == CXL_MBOX_OP_SANITIZE) { mutex_lock(&cxl_mbox->mbox_mutex); if (mds->security.sanitize_node) - mod_delayed_work(system_wq, &mds->security.poll_dwork, 0); + mod_delayed_work(system_percpu_wq, &mds->security.poll_dwork, 0); mutex_unlock(&cxl_mbox->mbox_mutex); } else { /* short-circuit the wait in __cxl_pci_mbox_send_cmd() */ From d385082c2cdb90d7c004aadb133e051100aaeb43 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 14 Oct 2025 01:24:30 -0700 Subject: [PATCH 034/143] cxl/region: Refactor address translation funcs for testing In preparation for adding a test module that exercises the address translation calculations, extract the core calculations into stand-alone functions that operate on base parameters without dependencies on struct cxl_region. Perform additional parameter validation to protect against a test module sending bad parameters. Export the validation function, as well as the three core translation functions for use by test module cxl_translate only.
This refactoring enables unit testing of the address translation logic with controlled inputs, while preserving identical functionality in the existing code paths. Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit b78b9e7b7979f86c7838f1ab7d084ca35a17702d) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 202 +++++++++++++++++++++++++++----------- drivers/cxl/cxl.h | 6 ++ 2 files changed, 148 insertions(+), 60 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index cc18f2672ee7b..3af7561e2973a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2934,28 +2934,119 @@ static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd) return cxlrd->ops && cxlrd->ops->spa_to_hpa; } -u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa) +#define CXL_POS_ZERO 0 +/** + * cxl_validate_translation_params + * @eiw: encoded interleave ways + * @eig: encoded interleave granularity + * @pos: position in interleave + * + * Callers pass CXL_POS_ZERO when no position parameter needs validating. 
+ * + * Returns: 0 on success, -EINVAL on first invalid parameter + */ +int cxl_validate_translation_params(u8 eiw, u16 eig, int pos) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); - u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = NULL; - u16 eig = 0; - u8 eiw = 0; - int pos; + int ways, gran; - for (int i = 0; i < p->nr_targets; i++) { - cxled = p->targets[i]; - if (cxlmd == cxled_to_memdev(cxled)) - break; + if (eiw_to_ways(eiw, &ways)) { + pr_debug("%s: invalid eiw=%u\n", __func__, eiw); + return -EINVAL; + } + if (eig_to_granularity(eig, &gran)) { + pr_debug("%s: invalid eig=%u\n", __func__, eig); + return -EINVAL; } - if (!cxled || cxlmd != cxled_to_memdev(cxled)) + if (pos < 0 || pos >= ways) { + pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos, + ways); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate"); + +u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig) +{ + u64 dpa_offset, bits_lower, bits_upper, temp; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) return ULLONG_MAX; - pos = cxled->pos; - ways_to_eiw(p->interleave_ways, &eiw); - granularity_to_eig(p->interleave_granularity, &eig); + /* + * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 + * Lower bits [IG+7:0] pass through unchanged + * (eiw < 8) + * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) + * Clear the position bits to isolate upper section, then + * reverse the left shift by eiw that occurred during DPA->HPA + * (eiw >= 8) + * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 + * Extract upper bits from the correct bit range and divide by 3 + * to recover the original DPA upper bits + */ + bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); + if (eiw < 8) { + temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0); + dpa_offset = 
temp >> eiw; + } else { + bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); + dpa_offset = bits_upper << (eig + 8); + } + dpa_offset |= bits_lower; + + return dpa_offset; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate"); + +int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig) +{ + unsigned int ways = 0; + u64 shifted, rem; + int pos, ret; + + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) + return ret; + + if (!eiw) + /* position is 0 if no interleaving */ + return 0; + + /* + * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 + * eiw < 8 + * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8]. + * Per spec "remove IW bits starting with bit position IG+8" + * eiw >= 8 + * Position is not explicitly stored in HPA_OFFSET bits. It is + * derived from the modulo operation of the upper bits using + * the total number of interleave ways. + */ + if (eiw < 8) { + pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); + } else { + shifted = hpa_offset >> (eig + 8); + eiw_to_ways(eiw, &ways); + div64_u64_rem(shifted, ways, &rem); + pos = rem; + } + + return pos; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate"); + +u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig) +{ + u64 mask_upper, hpa_offset, bits_upper; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, pos); + if (ret) + return ULLONG_MAX; /* * The device position in the region interleave set was removed @@ -2967,9 +3058,6 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, * 8.2.4.19.13 Implementation Note: Device Decode Logic */ - /* Remove the dpa base */ - dpa_offset = dpa - cxl_dpa_resource_start(cxled); - mask_upper = GENMASK_ULL(51, eig + 8); if (eiw < 8) { @@ -2984,6 +3072,37 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, /* The lower bits remain unchanged */ hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); + return 
hpa_offset; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); + +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled = NULL; + u64 dpa_offset, hpa_offset, hpa; + u16 eig = 0; + u8 eiw = 0; + int pos; + + for (int i = 0; i < p->nr_targets; i++) { + if (cxlmd == cxled_to_memdev(p->targets[i])) { + cxled = p->targets[i]; + break; + } + } + if (!cxled) + return ULLONG_MAX; + + pos = cxled->pos; + ways_to_eiw(p->interleave_ways, &eiw); + granularity_to_eig(p->interleave_granularity, &eig); + + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); + /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start + p->cache_size; @@ -3016,8 +3135,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_endpoint_decoder *cxled; u64 hpa, hpa_offset, dpa_offset; - u64 bits_upper, bits_lower; - u64 shifted, rem, temp; u16 eig = 0; u8 eiw = 0; int pos; @@ -3039,50 +3156,15 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, } else { hpa_offset = offset; } - /* - * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 - * eiw < 8 - * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8]. - * Per spec "remove IW bits starting with bit position IG+8" - * eiw >= 8 - * Position is not explicitly stored in HPA_OFFSET bits. It is - * derived from the modulo operation of the upper bits using - * the total number of interleave ways. 
- */ - if (eiw < 8) { - pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); - } else { - shifted = hpa_offset >> (eig + 8); - div64_u64_rem(shifted, p->interleave_ways, &rem); - pos = rem; - } + + pos = cxl_calculate_position(hpa_offset, eiw, eig); if (pos < 0 || pos >= p->nr_targets) { dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", pos, p->nr_targets); return -ENXIO; } - /* - * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 - * Lower bits [IG+7:0] pass through unchanged - * (eiw < 8) - * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) - * Clear the position bits to isolate upper section, then - * reverse the left shift by eiw that occurred during DPA->HPA - * (eiw >= 8) - * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 - * Extract upper bits from the correct bit range and divide by 3 - * to recover the original DPA upper bits - */ - bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); - if (eiw < 8) { - temp = hpa_offset &= ~((u64)GENMASK(eig + eiw + 8 - 1, 0)); - dpa_offset = temp >> eiw; - } else { - bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); - dpa_offset = bits_upper << (eig + 8); - } - dpa_offset |= bits_lower; + dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig); /* Look-up and return the result: a memdev and a DPA */ for (int i = 0; i < p->nr_targets; i++) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 231ddccf89773..10bee9aaa943e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -738,6 +738,12 @@ static inline bool is_cxl_root(struct cxl_port *port) return port->uport_dev == port->dev.parent; } +/* Address translation functions exported to cxl_translate test module only */ +int cxl_validate_translation_params(u8 eiw, u16 eig, int pos); +u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig); +u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig); +int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig); + int cxl_num_decoders_committed(struct cxl_port 
*port); bool is_cxl_port(const struct device *dev); struct cxl_port *to_cxl_port(const struct device *dev); From 293cba361c3194bb6c8f4198be170f1da85b9084 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 14 Oct 2025 01:24:31 -0700 Subject: [PATCH 035/143] cxl/acpi: Make the XOR calculations available for testing In preparation for adding a test module that can exercise the address translation functions performed by the CXL Driver, refactor the XOR implementation like this: - Extract the core calculation into a standalone helper function, - Export the new function for use by test module cxl_translate only, - Enhance the parameter validation since this new function will be called from a test module with no guarantee of valid parameters, - Move the define of struct cxl_cxims_data to cxl.h so the test module can build xormaps. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit 4fe516d2ad1a6b827694db134fa2a0af97917b41) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 41 ++++++++++++++++++++++++++++++----------- drivers/cxl/cxl.h | 13 +++++++++++++ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index bd2e282ca93a0..a8069278cb565 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -11,25 +11,36 @@ #include "cxlpci.h" #include "cxl.h" -struct cxl_cxims_data { - int nr_maps; - u64 xormaps[] __counted_by(nr_maps); -}; - static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) +#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1) +static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = { + [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4 +}; + +static const int valid_hbiw[] = { 1, 2, 3, 4, 6, 8, 12, 16 }; + +u64 cxl_do_xormap_calc(struct 
cxl_cxims_data *cximsd, u64 addr, int hbiw) { - struct cxl_cxims_data *cximsd = cxlrd->platform_data; - int hbiw = cxlrd->cxlsd.nr_targets; + int nr_maps_to_apply = -1; u64 val; int pos; - /* No xormaps for host bridge interleave ways of 1 or 3 */ - if (hbiw == 1 || hbiw == 3) - return addr; + /* + * Strictly validate hbiw since this function is used for testing and + * that nullifies any expectation of trusted parameters from the CXL + * Region Driver. + */ + for (int i = 0; i < ARRAY_SIZE(valid_hbiw); i++) { + if (valid_hbiw[i] == hbiw) { + nr_maps_to_apply = hbiw_to_nr_maps[hbiw]; + break; + } + } + if (nr_maps_to_apply == -1 || nr_maps_to_apply > cximsd->nr_maps) + return ULLONG_MAX; /* * In regions using XOR interleave arithmetic the CXL HPA may not @@ -60,6 +71,14 @@ static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) return addr; } +EXPORT_SYMBOL_FOR_MODULES(cxl_do_xormap_calc, "cxl_translate"); + +static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) +{ + struct cxl_cxims_data *cximsd = cxlrd->platform_data; + + return cxl_do_xormap_calc(cximsd, addr, cxlrd->cxlsd.nr_targets); +} struct cxl_cxims_context { struct device *dev; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 10bee9aaa943e..e8931b626fc62 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -743,6 +743,19 @@ int cxl_validate_translation_params(u8 eiw, u16 eig, int pos); u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig); u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig); int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig); +struct cxl_cxims_data { + int nr_maps; + u64 xormaps[] __counted_by(nr_maps); +}; + +#if IS_ENABLED(CONFIG_CXL_ACPI) +u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw); +#else +static inline u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw) +{ + return ULLONG_MAX; +} +#endif int cxl_num_decoders_committed(struct cxl_port *port); bool 
is_cxl_port(const struct device *dev); From a6c82f917f6bfb1a73e77efd8065831dc40e2377 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 14 Oct 2025 01:24:32 -0700 Subject: [PATCH 036/143] cxl/test: Add cxl_translate module for address translation testing Add a loadable test module that validates CXL address translation calculations using parameterized test vectors. The module tests both host-to-device and device-to-host address translations for Modulo and XOR interleave arithmetic. Two types of testing are provided: 1. Parameterized test vectors: Test vectors are passed as module parameters in the format: "dpa pos r_eiw r_eig hb_ways math expected_spa". Round-trip validation is performed: - Translate a DPA and position to a SPA - Verify the result matches expected SPA - Translate that SPA back to a DPA and position - Verify round-trip consistency 2. Internal validation testing: When no test vectors are provided, the module performs validation of the translation functions by checking parameter boundaries and running 10,000 iterations of randomly generated valid parameters to exercise the core calculation functions. The module uses the CXL Driver translation functions through symbols exported exclusively for cxl_translate. 
Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Alison Schofield Signed-off-by: Dave Jiang (cherry picked from commit 06377c54a133621d61fa76cdcea85077c5b958f4) Signed-off-by: Jiandi An --- tools/testing/cxl/test/Kbuild | 1 + tools/testing/cxl/test/cxl_translate.c | 445 +++++++++++++++++++++++++ 2 files changed, 446 insertions(+) create mode 100644 tools/testing/cxl/test/cxl_translate.c diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild index 6b19278978561..af50972c8b6d3 100644 --- a/tools/testing/cxl/test/Kbuild +++ b/tools/testing/cxl/test/Kbuild @@ -4,6 +4,7 @@ ccflags-y := -I$(srctree)/drivers/cxl/ -I$(srctree)/drivers/cxl/core obj-m += cxl_test.o obj-m += cxl_mock.o obj-m += cxl_mock_mem.o +obj-m += cxl_translate.o cxl_test-y := cxl.o cxl_mock-y := mock.o diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c new file mode 100644 index 0000000000000..2200ae21795c7 --- /dev/null +++ b/tools/testing/cxl/test/cxl_translate.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright(c) 2025 Intel Corporation. All rights reserved. + +/* Preface all log entries with "cxl_translate" */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Maximum number of test vectors and entry length */ +#define MAX_TABLE_ENTRIES 128 +#define MAX_ENTRY_LEN 128 + +/* Expected number of parameters in each test vector */ +#define EXPECTED_PARAMS 7 + +/* Module parameters for test vectors */ +static char *table[MAX_TABLE_ENTRIES]; +static int table_num; + +/* Interleave Arithmetic */ +#define MODULO_MATH 0 +#define XOR_MATH 1 + +/* + * XOR mapping configuration + * The test data sets all use the same set of xormaps. When additional + * data sets arrive for validation, this static setup will need to + * be changed to accept xormaps as additional parameters. 
+ */ +struct cxl_cxims_data *cximsd; +static u64 xormaps[] = { + 0x2020900, + 0x4041200, + 0x1010400, + 0x800, +}; + +static int nr_maps = ARRAY_SIZE(xormaps); + +#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1) +static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = { + [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4 +}; + +/** + * to_hpa - calculate an HPA offset from a DPA offset and position + * + * dpa_offset: device physical address offset + * pos: devices position in interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: host physical address offset + */ +static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways, + u8 math) +{ + u64 hpa_offset; + + /* Calculate base HPA offset from DPA and position */ + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig); + + if (math == XOR_MATH) { + cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + return cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return hpa_offset; +} + +/** + * to_dpa - translate an HPA offset to DPA offset + * + * hpa_offset: host physical address offset + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: device physical address offset + */ +static u64 to_dpa(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + if (math == XOR_MATH) { + cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + offset = + cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return cxl_calculate_dpa_offset(offset, r_eiw, r_eig); +} + +/** + * to_pos - extract an interleave position from an HPA offset + * + * hpa_offset: host physical address offset + * 
r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: devices position in region interleave + */ +static u64 to_pos(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + /* Reverse XOR mapping if specified */ + if (math == XOR_MATH) + offset = cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + + return cxl_calculate_position(offset, r_eiw, r_eig); +} + +/** + * run_translation_test - execute forward and reverse translations + * + * @dpa: device physical address + * @pos: expected position in region interleave + * @r_eiw: region encoded interleave ways + * @r_eig: region encoded interleave granularity + * @hb_ways: host bridge interleave ways + * @math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * @expect_spa: expected system physical address + * + * Returns: 0 on success, -1 on failure + */ +static int run_translation_test(u64 dpa, int pos, u8 r_eiw, u16 r_eig, + u8 hb_ways, int math, u64 expect_hpa) +{ + u64 translated_spa, reverse_dpa; + int reverse_pos; + + /* Test Device to Host translation: DPA + POS -> SPA */ + translated_spa = to_hpa(dpa, pos, r_eiw, r_eig, hb_ways, math); + if (translated_spa != expect_hpa) { + pr_err("Device to host failed: expected HPA %llu, got %llu\n", + expect_hpa, translated_spa); + return -1; + } + + /* Test Host to Device DPA translation: SPA -> DPA */ + reverse_dpa = to_dpa(translated_spa, r_eiw, r_eig, hb_ways, math); + if (reverse_dpa != dpa) { + pr_err("Host to Device DPA failed: expected %llu, got %llu\n", + dpa, reverse_dpa); + return -1; + } + + /* Test Host to Device Position translation: SPA -> POS */ + reverse_pos = to_pos(translated_spa, r_eiw, r_eig, hb_ways, math); + if (reverse_pos != pos) { + pr_err("Position lookup failed: expected %d, got %d\n", pos, + reverse_pos); + return -1; + } + + return 0; +} + +/** + * 
parse_test_vector - parse a single test vector string + * + * entry: test vector string to parse + * dpa: device physical address + * pos: expected position in region interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * expect_spa: expected system physical address + * + * Returns: 0 on success, negative error code on failure + */ +static int parse_test_vector(const char *entry, u64 *dpa, int *pos, u8 *r_eiw, + u16 *r_eig, u8 *hb_ways, int *math, + u64 *expect_hpa) +{ + unsigned int tmp_r_eiw, tmp_r_eig, tmp_hb_ways; + int parsed; + + parsed = sscanf(entry, "%llu %d %u %u %u %d %llu", dpa, pos, &tmp_r_eiw, + &tmp_r_eig, &tmp_hb_ways, math, expect_hpa); + + if (parsed != EXPECTED_PARAMS) { + pr_err("Parse error: expected %d parameters, got %d in '%s'\n", + EXPECTED_PARAMS, parsed, entry); + return -EINVAL; + } + if (tmp_r_eiw > U8_MAX || tmp_r_eig > U16_MAX || tmp_hb_ways > U8_MAX) { + pr_err("Parameter overflow in entry: '%s'\n", entry); + return -ERANGE; + } + if (*math != MODULO_MATH && *math != XOR_MATH) { + pr_err("Invalid math type %d in entry: '%s'\n", *math, entry); + return -EINVAL; + } + *r_eiw = tmp_r_eiw; + *r_eig = tmp_r_eig; + *hb_ways = tmp_hb_ways; + + return 0; +} + +/* + * setup_xor_mapping - Initialize XOR mapping data structure + * + * The test data sets all use the same HBIG so we can use one set + * of xormaps, and set the number to apply based on HBIW before + * calling cxl_do_xormap_calc(). + * + * When additional data sets arrive for validation with different + * HBIG's this static setup will need to be updated. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int setup_xor_mapping(void) +{ + if (nr_maps <= 0) + return -EINVAL; + + cximsd = kzalloc(struct_size(cximsd, xormaps, nr_maps), GFP_KERNEL); + if (!cximsd) + return -ENOMEM; + + memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps)); + cximsd->nr_maps = nr_maps; + + return 0; +} + +static int test_random_params(void) +{ + u8 valid_eiws[] = { 0, 1, 2, 3, 4, 8, 9, 10 }; + u16 valid_eigs[] = { 0, 1, 2, 3, 4, 5, 6 }; + int i, ways, pos, reverse_pos; + u64 dpa, hpa, reverse_dpa; + int iterations = 10000; + int failures = 0; + + for (i = 0; i < iterations; i++) { + /* Generate valid random parameters for eiw, eig, pos, dpa */ + u8 eiw = valid_eiws[get_random_u32() % ARRAY_SIZE(valid_eiws)]; + u16 eig = valid_eigs[get_random_u32() % ARRAY_SIZE(valid_eigs)]; + + eiw_to_ways(eiw, &ways); + pos = get_random_u32() % ways; + dpa = get_random_u64() >> 12; + + hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig); + reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); + reverse_pos = cxl_calculate_position(hpa, eiw, eig); + + if (reverse_dpa != dpa || reverse_pos != pos) { + pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", + i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, + eig); + + if (failures++ > 10) { + pr_err("test random too many failures, stop\n"); + break; + } + } + } + pr_info("..... 
test random: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +struct param_test { + u8 eiw; + u16 eig; + int pos; + bool expect; /* true: expect pass, false: expect fail */ + const char *desc; +}; + +static struct param_test param_tests[] = { + { 0x0, 0, 0, true, "1-way, min eig=0, pos=0" }, + { 0x0, 3, 0, true, "1-way, mid eig=3, pos=0" }, + { 0x0, 6, 0, true, "1-way, max eig=6, pos=0" }, + { 0x1, 0, 0, true, "2-way, eig=0, pos=0" }, + { 0x1, 3, 1, true, "2-way, eig=3, max pos=1" }, + { 0x1, 6, 1, true, "2-way, eig=6, max pos=1" }, + { 0x2, 0, 0, true, "4-way, eig=0, pos=0" }, + { 0x2, 3, 3, true, "4-way, eig=3, max pos=3" }, + { 0x2, 6, 3, true, "4-way, eig=6, max pos=3" }, + { 0x3, 0, 0, true, "8-way, eig=0, pos=0" }, + { 0x3, 3, 7, true, "8-way, eig=3, max pos=7" }, + { 0x3, 6, 7, true, "8-way, eig=6, max pos=7" }, + { 0x4, 0, 0, true, "16-way, eig=0, pos=0" }, + { 0x4, 3, 15, true, "16-way, eig=3, max pos=15" }, + { 0x4, 6, 15, true, "16-way, eig=6, max pos=15" }, + { 0x8, 0, 0, true, "3-way, eig=0, pos=0" }, + { 0x8, 3, 2, true, "3-way, eig=3, max pos=2" }, + { 0x8, 6, 2, true, "3-way, eig=6, max pos=2" }, + { 0x9, 0, 0, true, "6-way, eig=0, pos=0" }, + { 0x9, 3, 5, true, "6-way, eig=3, max pos=5" }, + { 0x9, 6, 5, true, "6-way, eig=6, max pos=5" }, + { 0xA, 0, 0, true, "12-way, eig=0, pos=0" }, + { 0xA, 3, 11, true, "12-way, eig=3, max pos=11" }, + { 0xA, 6, 11, true, "12-way, eig=6, max pos=11" }, + { 0x5, 0, 0, false, "invalid eiw=5" }, + { 0x7, 0, 0, false, "invalid eiw=7" }, + { 0xB, 0, 0, false, "invalid eiw=0xB" }, + { 0xFF, 0, 0, false, "invalid eiw=0xFF" }, + { 0x1, 7, 0, false, "invalid eig=7 (out of range)" }, + { 0x2, 0x10, 0, false, "invalid eig=0x10" }, + { 0x3, 0xFFFF, 0, false, "invalid eig=0xFFFF" }, + { 0x1, 0, -1, false, "pos < 0" }, + { 0x1, 0, 2, false, "2-way, pos=2 (>= ways)" }, + { 0x2, 0, 4, false, "4-way, pos=4 (>= ways)" }, + { 0x3, 0, 8, false, "8-way, pos=8 (>= ways)" }, + { 0x4, 
0, 16, false, "16-way, pos=16 (>= ways)" }, + { 0x8, 0, 3, false, "3-way, pos=3 (>= ways)" }, + { 0x9, 0, 6, false, "6-way, pos=6 (>= ways)" }, + { 0xA, 0, 12, false, "12-way, pos=12 (>= ways)" }, +}; + +static int test_cxl_validate_translation_params(void) +{ + int i, rc, failures = 0; + bool valid; + + for (i = 0; i < ARRAY_SIZE(param_tests); i++) { + struct param_test *t = ¶m_tests[i]; + + rc = cxl_validate_translation_params(t->eiw, t->eig, t->pos); + valid = (rc == 0); + + if (valid != t->expect) { + pr_err("test params failed: %s\n", t->desc); + failures++; + } + } + pr_info("..... test params: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +/* + * cxl_translate_init + * + * Run the internal validation tests when no params are passed. + * Otherwise, parse the parameters (test vectors), and kick off + * the translation test. + * + * Returns: 0 on success, negative error code on failure + */ +static int __init cxl_translate_init(void) +{ + int rc, i; + + /* If no tables are passed, validate module params only */ + if (table_num == 0) { + pr_info("Internal validation test start...\n"); + rc = test_cxl_validate_translation_params(); + if (rc) + return rc; + + rc = test_random_params(); + if (rc) + return rc; + + pr_info("Internal validation test completed successfully\n"); + + return 0; + } + + pr_info("CXL translate test module loaded with %d test vectors\n", + table_num); + + rc = setup_xor_mapping(); + if (rc) + return rc; + + /* Process each test vector */ + for (i = 0; i < table_num; i++) { + u64 dpa, expect_spa; + int pos, math; + u8 r_eiw, hb_ways; + u16 r_eig; + + pr_debug("Processing test vector %d: '%s'\n", i, table[i]); + + /* Parse the test vector */ + rc = parse_test_vector(table[i], &dpa, &pos, &r_eiw, &r_eig, + &hb_ways, &math, &expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " Failed to parse test vector '%s'\n", + i, table[i]); + continue; + } + /* Run the translation test 
*/ + rc = run_translation_test(dpa, pos, r_eiw, r_eig, hb_ways, math, + expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " dpa=%llu pos=%d r_eiw=%u r_eig=%u hb_ways=%u math=%s expect_spa=%llu\n", + i, dpa, pos, r_eiw, r_eig, hb_ways, + (math == XOR_MATH) ? "XOR" : "MODULO", + expect_spa); + } else { + pr_info("CXL Translate Test %d: PASS\n", i); + } + } + + kfree(cximsd); + pr_info("CXL translate test completed\n"); + + return 0; +} + +static void __exit cxl_translate_exit(void) +{ + pr_info("CXL translate test module unloaded\n"); +} + +module_param_array(table, charp, &table_num, 0444); +MODULE_PARM_DESC(table, "Test vectors as space-separated decimal strings"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("cxl_test: cxl address translation test module"); +MODULE_IMPORT_NS("CXL"); + +module_init(cxl_translate_init); +module_exit(cxl_translate_exit); From 322aae9a659b97bce59c5655d69aba3ec2422fac Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 3 Oct 2025 11:55:09 -0700 Subject: [PATCH 037/143] cxl: Adjust extended linear cache failure emission in cxl_acpi The cxl_acpi module spams "Extended linear cache calculation failed" when the hmat memory target is not found for a node. This is normal when the memory target does not contain extended linear cache attributes. Adjust cxl_acpi_set_cache_size() to just return 0 if error is returned from hmat_get_extended_linear_cache_size(). That is the only error returned from hmat_get_extended_linear_cache_size() as -ENOENT. Also remove the check for -EOPNOTSUPP in cxl_setup_extended_linear_cache() since that errno is never returned by cxl_acpi_set_cache_size(). 
[dj: Flipped minor return logic suggested by Jonathan ] Suggested-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251003185509.3215900-1-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit f0c5d3bc2830f04a72087f45d15807943eabfa10) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index a8069278cb565..1a64e5c71fbd8 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -372,7 +372,7 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size); if (rc) - return rc; + return 0; /* * The cache range is expected to be within the CFMWS. @@ -397,21 +397,18 @@ static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) int rc; rc = cxl_acpi_set_cache_size(cxlrd); - if (!rc) - return; - - if (rc != -EOPNOTSUPP) { + if (rc) { /* - * Failing to support extended linear cache region resize does not + * Failing to retrieve extended linear cache region resize does not * prevent the region from functioning. Only causes cxl list showing * incorrect region size. */ dev_warn(cxlrd->cxlsd.cxld.dev.parent, - "Extended linear cache calculation failed rc:%d\n", rc); - } + "Extended linear cache retrieval failed rc:%d\n", rc); - /* Ignoring return code */ - cxlrd->cache_size = 0; + /* Ignoring return code */ + cxlrd->cache_size = 0; + } } DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *, From 72872e60142d5b8ffe786215d2810a80f3d8cd5d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 22 Oct 2025 13:30:52 -0700 Subject: [PATCH 038/143] cxl/region: Add support to indicate region has extended linear cache Add a region sysfs attribute to show the size of the extended linear cache if there is any. The attribute is invisible when the cache size is 0, which indicates it does not exist. 
Moved the cxl_region_visible() location in order to pick up the new sysfs attribute definition. [ dj: Fixed spelling errors noted by Benjamin ] Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251022203052.4078527-1-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit d6602e25819dea2c239972e98e09ba5db4aebd22) Signed-off-by: Jiandi An --- Documentation/ABI/testing/sysfs-bus-cxl | 11 ++++- drivers/cxl/core/region.c | 59 ++++++++++++++++++------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 6b4e8c7a963da..c80a1b5a03dba 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -496,8 +496,17 @@ Description: changed, only freed by writing 0. The kernel makes no guarantees that data is maintained over an address space freeing event, and there is no guarantee that a free followed by an allocate - results in the same address being allocated. + results in the same address being allocated. If extended linear + cache is present, the size indicates extended linear cache size + plus the CXL region size. +What: /sys/bus/cxl/devices/regionZ/extended_linear_cache_size +Date: October, 2025 +KernelVersion: v6.19 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The size of extended linear cache, if there is an extended + linear cache. Otherwise the attribute will not be visible. 
What: /sys/bus/cxl/devices/regionZ/mode Date: January, 2023 diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3af7561e2973a..d8f34cb5e8129 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -461,21 +461,6 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(commit); -static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, - int n) -{ - struct device *dev = kobj_to_dev(kobj); - struct cxl_region *cxlr = to_cxl_region(dev); - - /* - * Support tooling that expects to find a 'uuid' attribute for all - * regions regardless of mode. - */ - if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) - return 0444; - return a->mode; -} - static ssize_t interleave_ways_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -754,6 +739,21 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(size); +static ssize_t extended_linear_cache_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%#llx\n", p->cache_size); +} +static DEVICE_ATTR_RO(extended_linear_cache_size); + static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, &dev_attr_commit.attr, @@ -762,9 +762,34 @@ static struct attribute *cxl_region_attrs[] = { &dev_attr_resource.attr, &dev_attr_size.attr, &dev_attr_mode.attr, + &dev_attr_extended_linear_cache_size.attr, NULL, }; +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + /* + * Support tooling that expects to find a 'uuid' attribute for all + * regions 
regardless of mode. + */ + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) + return 0444; + + /* + * Don't display extended linear cache attribute if there is no + * extended linear cache. + */ + if (a == &dev_attr_extended_linear_cache_size.attr && + cxlr->params.cache_size == 0) + return 0; + + return a->mode; +} + static const struct attribute_group cxl_region_group = { .attrs = cxl_region_attrs, .is_visible = cxl_region_visible, @@ -3561,6 +3586,10 @@ static int __construct_region(struct cxl_region *cxlr, "Extended linear cache calculation failed rc:%d\n", rc); } + rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group); + if (rc) + return rc; + rc = insert_resource(cxlrd->res, res); if (rc) { /* From 6afdd2e2a5fcd20abd6580089e2ac98e9fd33980 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 5 Nov 2025 13:18:26 -0700 Subject: [PATCH 039/143] cxl: Add handling of locked CXL decoder When a decoder is locked, it means that its configuration cannot be changed. CXL spec r3.2 8.2.4.20.13 discusses the details regarding locked decoders. Locking happens when bit 8 of the decoder control register is set and then the decoder is committed afterwards (CXL spec r3.2 8.2.4.20.7). Given that the driver creates a virtual decoder for each CFMWS, the Fixed Device Configuration (bit 4) of the Window Restriction field is considered as locking for the virtual decoder by the driver. The current driver code disregards the locked status and a region can be destroyed regardless of the locking state. Add a region flag to indicate the region is in a locked configuration. The driver will considered a region locked if the CFMWS or any decoder is configured as locked. The consideration is all or nothing regarding the locked state. It is reasonable to determine the region "locked" status while the region is being assembled based on the decoders. 
Add a check in region commit_store() to intercept when a 0 is written to the commit sysfs attribute in order to prevent the destruction of a region when in locked state. This should be the only entry point from user space to destroy a region. Add a check is added to cxl_decoder_reset() to prevent resetting a locked decoder within the kernel driver. Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251105201826.2901915-1-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 2230c4bdc4120417799c74326ade3123da226d54) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 3 +++ drivers/cxl/core/region.c | 19 +++++++++++++++++++ drivers/cxl/cxl.h | 8 ++++++++ 3 files changed, 30 insertions(+) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index de78601821e60..aff166798e353 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -905,6 +905,9 @@ static void cxl_decoder_reset(struct cxl_decoder *cxld) if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) return; + if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + if (port->commit_end == id) cxl_port_commit_reap(cxld); else diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d8f34cb5e8129..4056d75d27d08 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -245,6 +245,9 @@ static void cxl_region_decode_reset(struct cxl_region *cxlr, int count) struct cxl_region_params *p = &cxlr->params; int i; + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return; + /* * Before region teardown attempt to flush, evict any data cached for * this region, or scream loudly about missing arch / platform support @@ -419,6 +422,9 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr, return len; } + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return -EPERM; + rc = queue_reset(cxlr); if (rc) return rc; @@ -1084,6 +1090,16 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, return 
0; } +static void cxl_region_set_lock(struct cxl_region *cxlr, + struct cxl_decoder *cxld) +{ + if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + + set_bit(CXL_REGION_F_LOCK, &cxlr->flags); + clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); +} + /** * cxl_port_attach_region() - track a region's interest in a port by endpoint * @port: port to add a new region reference 'struct cxl_region_ref' @@ -1195,6 +1211,8 @@ static int cxl_port_attach_region(struct cxl_port *port, } } + cxl_region_set_lock(cxlr, cxld); + rc = cxl_rr_ep_add(cxl_rr, cxled); if (rc) { dev_dbg(&cxlr->dev, @@ -2464,6 +2482,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i dev->bus = &cxl_bus_type; dev->type = &cxl_region_type; cxlr->id = id; + cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld); return cxlr; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index e8931b626fc62..6cfe65a35c95a 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -517,6 +517,14 @@ enum cxl_partition_mode { */ #define CXL_REGION_F_NEEDS_RESET 1 +/* + * Indicate whether this region is locked due to 1 or more decoders that have + * been locked. The approach of all or nothing is taken with regard to the + * locked attribute. CXL_REGION_F_NEEDS_RESET should not be set if this flag is + * set. + */ +#define CXL_REGION_F_LOCK 2 + /** * struct cxl_region - CXL region * @dev: This region's device From 7992ac79bfc6452c2559db0a84f09bc6928d477c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 5 Nov 2025 16:51:14 -0700 Subject: [PATCH 040/143] acpi/hmat: Return when generic target is updated With the current code flow, once the generic target is updated target->registered is set and the remaining code is skipped. So return immediately instead of going through the checks and then skip. 
Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251105235115.85062-2-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 15e14267889bde30b7b82bc03432483222b4b42c) Signed-off-by: Jiandi An --- drivers/acpi/numa/hmat.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 11e4483685c9c..77a81627aaefd 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -910,12 +910,13 @@ static void hmat_register_target(struct memory_target *target) * Register generic port perf numbers. The nid may not be * initialized and is still NUMA_NO_NODE. */ - mutex_lock(&target_lock); - if (*(u16 *)target->gen_port_device_handle) { - hmat_update_generic_target(target); - target->registered = true; + scoped_guard(mutex, &target_lock) { + if (*(u16 *)target->gen_port_device_handle) { + hmat_update_generic_target(target); + target->registered = true; + return; + } } - mutex_unlock(&target_lock); hmat_hotplug_target(target); } From 24366091ed5b99b5367b9a1938b38a4338848066 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 6 Nov 2025 10:01:07 -0700 Subject: [PATCH 041/143] cxl: Rename region_res_match_cxl_range() to spa_maps_hpa() The function name region_res_match_cxl_range() does not accurately convey the operation of address comparison with cache size. Rename to spa_maps_hpa() to provide a better function name. 
Suggested-by: Dan Williams Link: https://lore.kernel.org/linux-cxl/68eea19c7e67e_2f899100a8@dwillia2-mobl4.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Link: https://patch.msgid.link/20251106170108.1468304-2-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit c43521b9db7f5ed481cfdfb04ad2e7fe0cb9dcf5) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 4056d75d27d08..88dbd83379587 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -869,8 +869,8 @@ static int match_free_decoder(struct device *dev, const void *data) return 1; } -static bool region_res_match_cxl_range(const struct cxl_region_params *p, - const struct range *range) +static bool spa_maps_hpa(const struct cxl_region_params *p, + const struct range *range) { if (!p->res) return false; @@ -896,7 +896,7 @@ static int match_auto_decoder(struct device *dev, const void *data) cxld = to_cxl_decoder(dev); r = &cxld->hpa_range; - if (region_res_match_cxl_range(p, r)) + if (spa_maps_hpa(p, r)) return 1; return 0; @@ -1508,7 +1508,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || (iw > 1 && cxld->interleave_granularity != ig) || - !region_res_match_cxl_range(p, &cxld->hpa_range) || + !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, "%s:%s %s expected iw: %d ig: %d %pr\n", @@ -3524,7 +3524,7 @@ static int match_region_by_range(struct device *dev, const void *data) p = &cxlr->params; guard(rwsem_read)(&cxl_rwsem.region); - return region_res_match_cxl_range(p, r); + return spa_maps_hpa(p, r); } static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, From c0154b151982efd450c5766fd5a3d1e17d8c8c31 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 6 Nov 
2025 10:01:08 -0700 Subject: [PATCH 042/143] cxl: Clarify comment in spa_maps_hpa() Update the comment in spa_maps_hpa() to clearly convey the construction of extended linear cache. Suggested-by: Dan Williams Link: https://lore.kernel.org/linux-cxl/68eea19c7e67e_2f899100a8@dwillia2-mobl4.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Link: https://patch.msgid.link/20251106170108.1468304-3-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 8d27dd0b219f00fc1e0548ae5008abd7bb350611) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 88dbd83379587..70e7bf809c08b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -876,9 +876,9 @@ static bool spa_maps_hpa(const struct cxl_region_params *p, return false; /* - * If an extended linear cache region then the CXL range is assumed - * to be fronted by the DRAM range in current known implementation. - * This assumption will be made until a variant implementation exists. + * The extended linear cache region is constructed by a 1:1 ratio + * where the SPA maps equal amounts of DRAM and CXL HPA capacity with + * CXL decoders at the high end of the SPA range. */ return p->res->start + p->cache_size == range->start && p->res->end == range->end; From b46ca0eebf111acfb12b3e3721b1e899c400fca1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 14 Nov 2025 08:58:41 +0100 Subject: [PATCH 043/143] cxl: Simplify cxl_rd_ops allocation and handling A root decoder's callback handlers are collected in struct cxl_rd_ops. The structure is dynamically allocated, though it contains only a few pointers in it. This also requires to check two pointes to check for the existence of a callback. Simplify the allocation, release and handler check by embedding the ops statically in struct cxl_root_decoder. 
Implementation is equivalent to how struct cxl_root_ops handles the callbacks. [ dj: Fix spelling error in commit log. ] Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Robert Richter Link: https://patch.msgid.link/20251114075844.1315805-2-rrichter@amd.com Signed-off-by: Dave Jiang (backported from commit 6123133ee90fc55a5437364d442dd5876648628d) [jan: Resolve minor conflict due to code lines shift] Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 8 ++------ drivers/cxl/core/port.c | 1 - drivers/cxl/core/region.c | 20 +++++--------------- drivers/cxl/cxl.h | 2 +- 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 1a64e5c71fbd8..908a5e460a2db 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -491,12 +491,8 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->qos_class = cfmws->qtg_id; if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { - cxlrd->ops = kzalloc(sizeof(*cxlrd->ops), GFP_KERNEL); - if (!cxlrd->ops) - return -ENOMEM; - - cxlrd->ops->hpa_to_spa = cxl_apply_xor_maps; - cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps; + cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps; + cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps; } rc = cxl_decoder_add(cxld); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 8128fd2b5b317..fef3aa0c6680c 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -459,7 +459,6 @@ static void cxl_root_decoder_release(struct device *dev) if (atomic_read(&cxlrd->region_id) >= 0) memregion_free(atomic_read(&cxlrd->region_id)); __cxl_decoder_release(&cxlrd->cxlsd.cxld); - kfree(cxlrd->ops); kfree(cxlrd); } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 70e7bf809c08b..d2143b79d17b8 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2968,16 +2968,6 @@ static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) return false; } -static bool 
has_hpa_to_spa(struct cxl_root_decoder *cxlrd) -{ - return cxlrd->ops && cxlrd->ops->hpa_to_spa; -} - -static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd) -{ - return cxlrd->ops && cxlrd->ops->spa_to_hpa; -} - #define CXL_POS_ZERO 0 /** * cxl_validate_translation_params @@ -3151,8 +3141,8 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, hpa = hpa_offset + p->res->start + p->cache_size; /* Root decoder translation overrides typical modulo decode */ - if (has_hpa_to_spa(cxlrd)) - hpa = cxlrd->ops->hpa_to_spa(cxlrd, hpa); + if (cxlrd->ops.hpa_to_spa) + hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa); if (!cxl_resource_contains_addr(p->res, hpa)) { dev_dbg(&cxlr->dev, @@ -3161,7 +3151,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, } /* Simple chunk check, by pos & gran, only applies to modulo decodes */ - if (!has_hpa_to_spa(cxlrd) && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) + if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) return ULLONG_MAX; return hpa; @@ -3194,8 +3184,8 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, * If the root decoder has SPA to CXL HPA callback, use it. Otherwise * CXL HPA is assumed to equal SPA. 
*/ - if (has_spa_to_hpa(cxlrd)) { - hpa = cxlrd->ops->spa_to_hpa(cxlrd, p->res->start + offset); + if (cxlrd->ops.spa_to_hpa) { + hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); hpa_offset = hpa - p->res->start; } else { hpa_offset = offset; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6cfe65a35c95a..ba17fa86d249e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -451,7 +451,7 @@ struct cxl_root_decoder { void *platform_data; struct mutex range_lock; int qos_class; - struct cxl_rd_ops *ops; + struct cxl_rd_ops ops; struct cxl_switch_decoder cxlsd; }; From 19b248c812baff03009fcbc9983160cc48df604f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 14 Nov 2025 08:58:42 +0100 Subject: [PATCH 044/143] cxl/acpi: Group xor arithmetric setup code in a single block Simplify the xor arithmetric setup code by grouping it in a single block. No need to split the block for QoS setup. It is safe to reorder the call of cxl_setup_extended_linear_cache() because there are no dependencies. 
Reviewed-by: Jonathan Cameron Signed-off-by: Robert Richter Tested-by: Gregory Price Link: https://patch.msgid.link/20251114075844.1315805-3-rrichter@amd.com Signed-off-by: Dave Jiang (cherry picked from commit c42a4d2ee3b2c432ada9080e29343f4b27ad72bf) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 908a5e460a2db..77ac940e30138 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -469,8 +469,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, ig = CXL_DECODER_MIN_GRANULARITY; cxld->interleave_granularity = ig; - cxl_setup_extended_linear_cache(cxlrd); - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { if (ways != 1 && ways != 3) { cxims_ctx = (struct cxl_cxims_context) { @@ -486,15 +484,14 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, return -EINVAL; } } - } - - cxlrd->qos_class = cfmws->qtg_id; - - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps; cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps; } + cxl_setup_extended_linear_cache(cxlrd); + + cxlrd->qos_class = cfmws->qtg_id; + rc = cxl_decoder_add(cxld); if (rc) return rc; From 515662c3d66f9d71463f66a3c18305b9b62215f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 14 Nov 2025 08:58:43 +0100 Subject: [PATCH 045/143] cxl/region: Remove local variable @inc in cxl_port_setup_targets() Simplify the code by removing local variable @inc. The variable is not used elsewhere, remove it and directly increment the target number. 
Reviewed-by: Jonathan Cameron Signed-off-by: Robert Richter Link: https://patch.msgid.link/20251114075844.1315805-4-rrichter@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 7e71fa6e015e46275efd900a728a42d5fcd75179) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d2143b79d17b8..b251cb998892a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1371,7 +1371,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); - int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos; + int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -1563,9 +1563,8 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id; } - inc = 1; + cxl_rr->nr_targets_set++; out_target_set: - cxl_rr->nr_targets_set += inc; dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n", dev_name(port->uport_dev), dev_name(&port->dev), cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev), From d2a6cf577da207e237c1fb57a03890a49c24a782 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 17 Nov 2025 07:46:09 -0700 Subject: [PATCH 046/143] cxl/test: Standardize CXL auto region size Create a global define for the size of the mock CXL auto region used in cxl_test. Remove the declared size in mock_init_hdm_decoder() function. Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Fabio M. 
De Francesco Link: https://patch.msgid.link/20251117144611.903692-2-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit fa59c35167afdba043efcc80cf460863868141e7) Signed-off-by: Jiandi An --- tools/testing/cxl/test/cxl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index fc271561827b6..4cfb3b39b4c68 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -26,6 +26,9 @@ static int interleave_arithmetic; #define NR_CXL_PORT_DECODERS 8 #define NR_BRIDGES (NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST + NR_CXL_RCH) +#define MOCK_AUTO_REGION_SIZE_DEFAULT SZ_512M +static int mock_auto_region_size = MOCK_AUTO_REGION_SIZE_DEFAULT; + static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; #define NR_MULTI_ROOT (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS) @@ -801,7 +804,6 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_endpoint_decoder *cxled; struct cxl_switch_decoder *cxlsd; struct cxl_port *port, *iter; - const int size = SZ_512M; struct cxl_memdev *cxlmd; struct cxl_dport *dport; struct device *dev; @@ -859,7 +861,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->hpa_range = (struct range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; cxld->interleave_ways = 2; @@ -868,7 +870,8 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->flags = CXL_DECODER_F_ENABLE; cxled->state = CXL_DECODER_STATE_AUTO; port->commit_end = cxld->id; - devm_cxl_dpa_reserve(cxled, 0, size / cxld->interleave_ways, 0); + devm_cxl_dpa_reserve(cxled, 0, + mock_auto_region_size / cxld->interleave_ways, 0); cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; @@ -917,7 +920,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->interleave_granularity = 4096; cxld->hpa_range = (struct 
range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; put_device(dev); From 8c693dc67a2ebdc1441aff20ab8dc1eec7626edb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 17 Nov 2025 07:46:10 -0700 Subject: [PATCH 047/143] cxl/test: Add cxl_test CFMWS support for extended linear cache Add a module parameter to allow activation of extended linear cache on the auto region for cxl_test. The current platform implementation for extended linear cache is 1:1 of DRAM and CXL memory. A CFMWS is created with the size of both memory together where DRAM takes the first part of the memory range and CXL covers the second part. The current CXL auto region on cxl_test consists of 2 256M devices that creates a 512M region. The new extended linear cache setup will have 512M DRAM and 512M CXL memory for a total of 1G CFMWS. The hardware decoders must have their starting offset moved to after the DRAM region to handle the CXL regions. [ dj: Fixup commenting style. (Jonathan) ] Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Fabio M. De Francesco Link: https://patch.msgid.link/20251117144611.903692-3-dave.jiang@intel.com Signed-off-by: Dave Jiang (backported from commit 4b1c0466c8fbe23d688a1f54584670a9d1dceabd) [jan: Resolve minor conflict due to code line "base = window->base_hpa" being moved] Signed-off-by: Jiandi An --- tools/testing/cxl/test/cxl.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 4cfb3b39b4c68..79c462a15c99c 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -15,6 +15,7 @@ #include "mock.h" static int interleave_arithmetic; +static bool extended_linear_cache; #define FAKE_QTG_ID 42 @@ -429,6 +430,22 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align) return res; } +/* Only update CFMWS0 as this is used by the auto region. 
*/ +static void cfmws_elc_update(struct acpi_cedt_cfmws *window, int index) +{ + if (!extended_linear_cache) + return; + + if (index != 0) + return; + + /* + * The window size should be 2x of the CXL region size where half is + * DRAM and half is CXL + */ + window->window_size = mock_auto_region_size * 2; +} + static int populate_cedt(void) { struct cxl_mock_res *res; @@ -453,6 +470,7 @@ static int populate_cedt(void) for (i = cfmws_start; i <= cfmws_end; i++) { struct acpi_cedt_cfmws *window = mock_cfmws[i]; + cfmws_elc_update(window, i); res = alloc_mock_res(window->window_size, SZ_256M); if (!res) return -ENOMEM; @@ -859,6 +877,9 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) return; } + if (extended_linear_cache) + base += mock_auto_region_size; + cxld->hpa_range = (struct range) { .start = base, .end = base + mock_auto_region_size - 1, @@ -1669,6 +1690,8 @@ static __exit void cxl_test_exit(void) module_param(interleave_arithmetic, int, 0444); MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1"); +module_param(extended_linear_cache, bool, 0444); +MODULE_PARM_DESC(extended_linear_cache, "Enable extended linear cache support"); module_init(cxl_test_init); module_exit(cxl_test_exit); MODULE_LICENSE("GPL v2"); From c371e8708871d172698cfde589c6959398816534 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 17 Nov 2025 07:46:11 -0700 Subject: [PATCH 048/143] cxl/test: Add support for acpi extended linear cache Add the mock wrappers for hmat_get_extended_linear_cache_size() in order to emulate the ACPI helper function for the regions that are mock'd by cxl_test. Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Fabio M. 
De Francesco Link: https://patch.msgid.link/20251117144611.903692-4-dave.jiang@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 68f4a852e18329e84bb5d36168a45b0a52cdf236) Signed-off-by: Jiandi An --- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/cxl.c | 21 +++++++++++++++++++++ tools/testing/cxl/test/mock.c | 20 ++++++++++++++++++++ tools/testing/cxl/test/mock.h | 3 +++ 4 files changed, 45 insertions(+) diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3dae06ac7fba5..68b38863605b2 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -10,6 +10,7 @@ ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup +ldflags-y += --wrap=hmat_get_extended_linear_cache_size DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 79c462a15c99c..e68bf64460996 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -612,6 +612,25 @@ mock_acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, return AE_OK; } +static int +mock_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *cache_size) +{ + struct acpi_cedt_cfmws *window = mock_cfmws[0]; + struct resource cfmws0_res = + DEFINE_RES_MEM(window->base_hpa, window->window_size); + + if (!extended_linear_cache || + !resource_contains(&cfmws0_res, backing_res)) { + return hmat_get_extended_linear_cache_size(backing_res, + nid, cache_size); + } + + *cache_size = mock_auto_region_size; + + return 0; +} + static struct pci_bus mock_pci_bus[NR_BRIDGES]; static struct acpi_pci_root mock_pci_root[ARRAY_SIZE(mock_pci_bus)] = { [0] = { @@ -1201,6 +1220,8 @@ static struct cxl_mock_ops cxl_mock_ops = { .devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup, .cxl_endpoint_parse_cdat = 
mock_cxl_endpoint_parse_cdat, .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, + .hmat_get_extended_linear_cache_size = + mock_hmat_get_extended_linear_cache_size, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6fd4edb9215c4..6eb15991a414b 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -111,6 +111,26 @@ acpi_status __wrap_acpi_evaluate_integer(acpi_handle handle, } EXPORT_SYMBOL(__wrap_acpi_evaluate_integer); +int __wrap_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, + resource_size_t *cache_size) +{ + int index, rc; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops) + rc = ops->hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + else + rc = hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_hmat_get_extended_linear_cache_size); + struct acpi_pci_root *__wrap_acpi_pci_find_root(acpi_handle handle) { int index; diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 580f383862245..2684b89c8aa2d 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -24,6 +24,9 @@ struct cxl_mock_ops { void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); struct cxl_dport *(*devm_cxl_add_dport_by_dev)(struct cxl_port *port, struct device *dport_dev); + int (*hmat_get_extended_linear_cache_size)(struct resource *backing_res, + int nid, + resource_size_t *cache_size); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From c3f092072f7f454800b5027f7cc7a8c87737583d Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Tue, 18 Nov 2025 18:22:02 +0000 Subject: [PATCH 049/143] cxl/test: remove unused mock function for cxl_rcd_component_reg_phys() Since commit 733b57f262b0 ("cxl/pci: Early setup RCH dport component registers from RCRB") is not necessary under 
mocking tests. [ dj: Fixup commit representation flagged by checkpatch. ] [ dj: Amend subject line to indicate which function. ] Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang > --- Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20251118182202.2083244-1-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 26c5b0d9c080ff753c66de0b19d6e3e014a24877) Signed-off-by: Jiandi An --- tools/testing/cxl/Kbuild | 1 - tools/testing/cxl/test/mock.c | 17 ----------------- 2 files changed, 18 deletions(-) diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 68b38863605b2..0e151d0572d1f 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -6,7 +6,6 @@ ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport -ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6eb15991a414b..44bce80ef3ff5 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -231,23 +231,6 @@ struct cxl_dport *__wrap_devm_cxl_add_rch_dport(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_rch_dport, "CXL"); -resource_size_t __wrap_cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport) -{ - int index; - resource_size_t component_reg_phys; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(dev)) - component_reg_phys = CXL_RESOURCE_NONE; - else - component_reg_phys = cxl_rcd_component_reg_phys(dev, dport); - put_cxl_mock_ops(index); - - return component_reg_phys; -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_rcd_component_reg_phys, "CXL"); - void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) { int index; From
544c132bad87690e1305f770280710788e303ff9 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 15 Nov 2025 17:37:49 -0800 Subject: [PATCH 050/143] cxl/test: Remove ret_limit race condition in mock_get_event() Commit 364ee9f3265e ("cxl/test: Enhance event testing") changed the loop iterator in mock_get_event() from a static constant, CXL_TEST_EVENT_CNT, to a dynamic global variable, ret_limit. The intent was to vary the number of events returned per call to simulate events occurring while logs are being read. However, ret_limit is modified without synchronization. When multiple threads call mock_get_event() concurrently, one thread may read ret_limit, another thread may increment it, and the first thread's loop condition and size calculation see and use the updated value. This is visible during cxl_test module load when all memdevs are initializing simultaneously, which includes getting event records. It is not tied to the cxl-events.sh unit test specifically, as that operates on a single memdev. While no actual harm results (the buffer is always large enough and the record count fields correctly reflect what was written), this is a correctness issue. The race creates an inconsistent state within mock_get_event() and adding variability based on a race appears unintended. Make ret_limit a local variable populated from an atomic counter. Each call gets a stable value that won't change during execution. That preserves the intended behavior of varying the return counts across calls while eliminating the race condition. This implementation uses "+ 1" to produce the full range of 1 to CXL_TEST_EVENT_RET_MAX (4) records. Previously only 1, 2, 3 were produced. 
Signed-off-by: Alison Schofield Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang > --- Link: https://patch.msgid.link/20251116013819.1713780-1-alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit b6369daf0d6a96db5048edd26b07fc1aaed77dd1) Signed-off-by: Jiandi An --- tools/testing/cxl/test/mem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index d533481672b78..6809c4a26f5e4 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -250,22 +250,22 @@ static void mes_add_event(struct mock_event_store *mes, * Vary the number of events returned to simulate events occuring while the * logs are being read. */ -static int ret_limit = 0; +static atomic_t event_counter = ATOMIC_INIT(0); static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; struct mock_event_log *log; u16 nr_overflow; + int ret_limit; u8 log_type; int i; if (cmd->size_in != sizeof(log_type)) return -EINVAL; - ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; - if (!ret_limit) - ret_limit = 1; + /* Vary return limit from 1 to CXL_TEST_EVENT_RET_MAX */ + ret_limit = (atomic_inc_return(&event_counter) % CXL_TEST_EVENT_RET_MAX) + 1; if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; From 33b4ed1c58e5be33e8b57ac162bcc84d85bdd388 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 15 Nov 2025 17:30:32 -0800 Subject: [PATCH 051/143] cxl/test: Assign overflow_err_count from log->nr_overflow mock_get_event() uses an uninitialized local variable, nr_overflow, to populate the overflow_err_count field. 
That results in incorrect overflow_err_count values in mocked cxl_overflow trace events, such as this case where the records are reported as 0 and should be non-zero: [] cxl_overflow: memdev=mem7 host=cxl_mem.6 serial=7: log=Failure : 0 records from 1763228189130895685 to 1763228193130896180 Fix by using log->nr_overflow and remove the unused local variable. A follow-up change was considered in cxl_mem_get_records_log() to confirm that the overflow_err_count is non-zero when the overflow flag is set [1]. Since the driver has no functional dependency on this constraint, and a device that violates this specific requirement does not cause incorrect driver behavior, no validation check is added. [1] CXL 3.2, Table 8-65 Get Event Records Output Payload Signed-off-by: Alison Schofield Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang > --- Link: https://patch.msgid.link/20251116013036.1713313-1-alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit f1840efdb2bf4f8d0e698eebec8f676c6d745c6d) Signed-off-by: Jiandi An --- tools/testing/cxl/test/mem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 6809c4a26f5e4..176dcde570cdd 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -256,7 +256,6 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; struct mock_event_log *log; - u16 nr_overflow; int ret_limit; u8 log_type; int i; @@ -299,7 +298,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) u64 ns; pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW; - pl->overflow_err_count = cpu_to_le16(nr_overflow); + pl->overflow_err_count = cpu_to_le16(log->nr_overflow); ns = ktime_get_real_ns(); ns -= 5000000000; /* 5s ago */ pl->first_overflow_timestamp = cpu_to_le64(ns); From e0655d0381feeec4ff1d88fa720d5ece58344704 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 26 Sep 2025 
16:31:31 +0200 Subject: [PATCH 052/143] soc: apple: mailbox: fix device leak on lookup Make sure to drop the reference taken to the mbox platform device when looking up its driver data. Note that holding a reference to a device does not prevent its driver data from going away so there is no point in keeping the reference. Fixes: 6e1457fcad3f ("soc: apple: mailbox: Add ASC/M3 mailbox driver") Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Johan Hovold Reviewed-by: Neal Gompa Signed-off-by: Sven Peter (cherry picked from commit f401671e90ccc26b3022f177c4156a429c024f6c) Signed-off-by: Jiandi An --- drivers/soc/apple/mailbox.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/soc/apple/mailbox.c b/drivers/soc/apple/mailbox.c index 49a0955e82d6c..1685da1da23d0 100644 --- a/drivers/soc/apple/mailbox.c +++ b/drivers/soc/apple/mailbox.c @@ -299,11 +299,18 @@ struct apple_mbox *apple_mbox_get(struct device *dev, int index) return ERR_PTR(-EPROBE_DEFER); mbox = platform_get_drvdata(pdev); - if (!mbox) - return ERR_PTR(-EPROBE_DEFER); + if (!mbox) { + mbox = ERR_PTR(-EPROBE_DEFER); + goto out_put_pdev; + } + + if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_CONSUMER)) { + mbox = ERR_PTR(-ENODEV); + goto out_put_pdev; + } - if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_CONSUMER)) - return ERR_PTR(-ENODEV); +out_put_pdev: + put_device(&pdev->dev); return mbox; } From 3522058bef8de1cfb73f5cd68223a5e69e010ac1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 26 Sep 2025 16:31:32 +0200 Subject: [PATCH 053/143] soc: apple: sart: drop device reference after lookup Holding a reference to a device does not prevent its driver data from going away so there is no point in keeping the reference after looking up the sart device. 
Signed-off-by: Johan Hovold Reviewed-by: Neal Gompa Signed-off-by: Sven Peter (cherry picked from commit f95f3bceade25914cca30c871187b2d33db23f34) Signed-off-by: Jiandi An --- drivers/soc/apple/sart.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/soc/apple/sart.c b/drivers/soc/apple/sart.c index afa1117368997..6952afc41308a 100644 --- a/drivers/soc/apple/sart.c +++ b/drivers/soc/apple/sart.c @@ -164,17 +164,11 @@ static int apple_sart_probe(struct platform_device *pdev) return 0; } -static void apple_sart_put_device(void *dev) -{ - put_device(dev); -} - struct apple_sart *devm_apple_sart_get(struct device *dev) { struct device_node *sart_node; struct platform_device *sart_pdev; struct apple_sart *sart; - int ret; sart_node = of_parse_phandle(dev->of_node, "apple,sart", 0); if (!sart_node) @@ -192,14 +186,11 @@ struct apple_sart *devm_apple_sart_get(struct device *dev) return ERR_PTR(-EPROBE_DEFER); } - ret = devm_add_action_or_reset(dev, apple_sart_put_device, - &sart_pdev->dev); - if (ret) - return ERR_PTR(ret); - device_link_add(dev, &sart_pdev->dev, DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER); + put_device(&sart_pdev->dev); + return sart; } EXPORT_SYMBOL_GPL(devm_apple_sart_get); From 028cb3bc8c0bb4ac85816f1c3d31d1931ea776bf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 26 Sep 2025 16:24:53 +0200 Subject: [PATCH 054/143] soc: amlogic: canvas: fix device leak on lookup Make sure to drop the reference taken to the canvas platform device when looking up its driver data. Note that holding a reference to a device does not prevent its driver data from going away so there is no point in keeping the reference. Also note that commit 28f851e6afa8 ("soc: amlogic: canvas: add missing put_device() call in meson_canvas_get()") fixed the leak in a lookup error path, but the reference is still leaking on success. 
Fixes: d4983983d987 ("soc: amlogic: add meson-canvas driver") Cc: stable@vger.kernel.org # 4.20: 28f851e6afa8 Cc: Yu Kuai Signed-off-by: Johan Hovold Reviewed-by: Martin Blumenstingl Link: https://patch.msgid.link/20250926142454.5929-2-johan@kernel.org Signed-off-by: Neil Armstrong (cherry picked from commit 32200f4828de9d7e6db379909898e718747f4e18) Signed-off-by: Jiandi An --- drivers/soc/amlogic/meson-canvas.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/soc/amlogic/meson-canvas.c b/drivers/soc/amlogic/meson-canvas.c index b6e06c4d2117f..0711088da5dcd 100644 --- a/drivers/soc/amlogic/meson-canvas.c +++ b/drivers/soc/amlogic/meson-canvas.c @@ -73,10 +73,9 @@ struct meson_canvas *meson_canvas_get(struct device *dev) * current state, this driver probe cannot return -EPROBE_DEFER */ canvas = dev_get_drvdata(&canvas_pdev->dev); - if (!canvas) { - put_device(&canvas_pdev->dev); + put_device(&canvas_pdev->dev); + if (!canvas) return ERR_PTR(-EINVAL); - } return canvas; } From f97cbe15045c4ebe14f8d60f8237d8df919b8956 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 26 Sep 2025 16:24:54 +0200 Subject: [PATCH 055/143] soc: amlogic: canvas: simplify lookup error handling Simplify the canvas lookup error handling by dropping the OF node reference sooner. 
Signed-off-by: Johan Hovold Reviewed-by: Martin Blumenstingl Link: https://patch.msgid.link/20250926142454.5929-3-johan@kernel.org Signed-off-by: Neil Armstrong (cherry picked from commit 075daf22641870e435a16ec2129bfd3b3134c487) Signed-off-by: Jiandi An --- drivers/soc/amlogic/meson-canvas.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/soc/amlogic/meson-canvas.c b/drivers/soc/amlogic/meson-canvas.c index 0711088da5dcd..79681afea8c61 100644 --- a/drivers/soc/amlogic/meson-canvas.c +++ b/drivers/soc/amlogic/meson-canvas.c @@ -60,12 +60,9 @@ struct meson_canvas *meson_canvas_get(struct device *dev) return ERR_PTR(-ENODEV); canvas_pdev = of_find_device_by_node(canvas_node); - if (!canvas_pdev) { - of_node_put(canvas_node); - return ERR_PTR(-EPROBE_DEFER); - } - of_node_put(canvas_node); + if (!canvas_pdev) + return ERR_PTR(-EPROBE_DEFER); /* * If priv is NULL, it's probably because the canvas hasn't From 13c26c0fc20f75d9123ece927099cd8d8d750210 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 13 Oct 2025 18:45:33 +0100 Subject: [PATCH 056/143] dt-bindings: soc: microchip: document the simple-mfd syscon on PolarFire SoC "mss-top-sysreg" contains clocks, pinctrl, resets, an interrupt controller and more. At this point, only the reset controller child is described as that's all that is described by the existing bindings. The clock controller already has a dedicated node, and will retain it as there are other clock regions, so like the mailbox, a compatible-based lookup of the syscon is sufficient to keep the clock driver working as before, so no child is needed. There's also an interrupt multiplexing service provided by this syscon, for which there is work in progress at [1]. 
Link: https://lore.kernel.org/linux-gpio/20240723-uncouple-enforcer-7c48e4a4fefe@wendy/ [1] Reviewed-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley (cherry picked from commit feaa716adc514fb5fbcb60b3e1620ac5dcf8505a) Signed-off-by: Jiandi An --- .../microchip,mpfs-mss-top-sysreg.yaml | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-mss-top-sysreg.yaml diff --git a/Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-mss-top-sysreg.yaml b/Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-mss-top-sysreg.yaml new file mode 100644 index 0000000000000..1ab691db87950 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-mss-top-sysreg.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/soc/microchip/microchip,mpfs-mss-top-sysreg.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip PolarFire SoC Microprocessor Subsystem (MSS) sysreg register region + +maintainers: + - Conor Dooley + +description: + An wide assortment of registers that control elements of the MSS on PolarFire + SoC, including pinmuxing, resets and clocks among others. + +properties: + compatible: + items: + - const: microchip,mpfs-mss-top-sysreg + - const: syscon + + reg: + maxItems: 1 + + '#reset-cells': + description: + The AHB/AXI peripherals on the PolarFire SoC have reset support, so + from CLK_ENVM to CLK_CFM. The reset consumer should specify the + desired peripheral via the clock ID in its "resets" phandle cell. + See include/dt-bindings/clock/microchip,mpfs-clock.h for the full list + of PolarFire clock/reset IDs. 
+ const: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + syscon@20002000 { + compatible = "microchip,mpfs-mss-top-sysreg", "syscon"; + reg = <0x20002000 0x1000>; + #reset-cells = <1>; + }; + From aff6a6491e965c387edf236329325bcdd3526059 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 13 Oct 2025 18:45:34 +0100 Subject: [PATCH 057/143] soc: microchip: add mfd drivers for two syscon regions on PolarFire SoC The control-scb and mss-top-sysreg regions on PolarFire SoC both fulfill multiple purposes. The former is used for mailbox functions in addition to the temperature & voltage sensor while the latter is used for clocks, resets, interrupt muxing and pinctrl. Signed-off-by: Conor Dooley (cherry picked from commit 4aac11c9a6e72efc025113e1ed62a1f084294300) Signed-off-by: Jiandi An --- drivers/soc/microchip/Kconfig | 12 ++++++ drivers/soc/microchip/Makefile | 1 + drivers/soc/microchip/mpfs-control-scb.c | 38 ++++++++++++++++++ drivers/soc/microchip/mpfs-mss-top-sysreg.c | 44 +++++++++++++++++++++ 4 files changed, 95 insertions(+) create mode 100644 drivers/soc/microchip/mpfs-control-scb.c create mode 100644 drivers/soc/microchip/mpfs-mss-top-sysreg.c diff --git a/drivers/soc/microchip/Kconfig b/drivers/soc/microchip/Kconfig index 19f4b576f822b..bcf5546025610 100644 --- a/drivers/soc/microchip/Kconfig +++ b/drivers/soc/microchip/Kconfig @@ -9,3 +9,15 @@ config POLARFIRE_SOC_SYS_CTRL module will be called mpfs_system_controller. If unsure, say N. + +config POLARFIRE_SOC_SYSCONS + bool "PolarFire SoC (MPFS) syscon drivers" + default y + depends on ARCH_MICROCHIP + select MFD_CORE + help + These drivers add support for the syscons on PolarFire SoC (MPFS). + Without these drivers core parts of the kernel such as clocks + and resets will not function correctly. + + If unsure, and on a PolarFire SoC, say y. 
diff --git a/drivers/soc/microchip/Makefile b/drivers/soc/microchip/Makefile index 14489919fe4b3..1a3a1594b089b 100644 --- a/drivers/soc/microchip/Makefile +++ b/drivers/soc/microchip/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_POLARFIRE_SOC_SYS_CTRL) += mpfs-sys-controller.o +obj-$(CONFIG_POLARFIRE_SOC_SYSCONS) += mpfs-control-scb.o mpfs-mss-top-sysreg.o diff --git a/drivers/soc/microchip/mpfs-control-scb.c b/drivers/soc/microchip/mpfs-control-scb.c new file mode 100644 index 0000000000000..f0b84b1f49cbc --- /dev/null +++ b/drivers/soc/microchip/mpfs-control-scb.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +static const struct mfd_cell mpfs_control_scb_devs[] = { + MFD_CELL_NAME("mpfs-tvs"), +}; + +static int mpfs_control_scb_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + + return mfd_add_devices(dev, PLATFORM_DEVID_NONE, mpfs_control_scb_devs, + ARRAY_SIZE(mpfs_control_scb_devs), NULL, 0, NULL); +} + +static const struct of_device_id mpfs_control_scb_of_match[] = { + { .compatible = "microchip,mpfs-control-scb", }, + {}, +}; +MODULE_DEVICE_TABLE(of, mpfs_control_scb_of_match); + +static struct platform_driver mpfs_control_scb_driver = { + .driver = { + .name = "mpfs-control-scb", + .of_match_table = mpfs_control_scb_of_match, + }, + .probe = mpfs_control_scb_probe, +}; +module_platform_driver(mpfs_control_scb_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Conor Dooley "); +MODULE_DESCRIPTION("PolarFire SoC control scb driver"); diff --git a/drivers/soc/microchip/mpfs-mss-top-sysreg.c b/drivers/soc/microchip/mpfs-mss-top-sysreg.c new file mode 100644 index 0000000000000..b2244e44ff0fa --- /dev/null +++ b/drivers/soc/microchip/mpfs-mss-top-sysreg.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell mpfs_mss_top_sysreg_devs[] = { + MFD_CELL_NAME("mpfs-reset"), +}; + +static int 
mpfs_mss_top_sysreg_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + ret = mfd_add_devices(dev, PLATFORM_DEVID_NONE, mpfs_mss_top_sysreg_devs, + ARRAY_SIZE(mpfs_mss_top_sysreg_devs) , NULL, 0, NULL); + if (ret) + return ret; + + return devm_of_platform_populate(dev); +} + +static const struct of_device_id mpfs_mss_top_sysreg_of_match[] = { + { .compatible = "microchip,mpfs-mss-top-sysreg", }, + {}, +}; +MODULE_DEVICE_TABLE(of, mpfs_mss_top_sysreg_of_match); + +static struct platform_driver mpfs_mss_top_sysreg_driver = { + .driver = { + .name = "mpfs-mss-top-sysreg", + .of_match_table = mpfs_mss_top_sysreg_of_match, + }, + .probe = mpfs_mss_top_sysreg_probe, +}; +module_platform_driver(mpfs_mss_top_sysreg_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Conor Dooley "); +MODULE_DESCRIPTION("PolarFire SoC mss top sysreg driver"); From 1af94efafea207512f56cca1ce834d04248ee403 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 10 Nov 2025 11:23:53 +0000 Subject: [PATCH 058/143] MAINTAINERS: add new soc drivers to Microchip RISC-V entry Add the two new syscon drivers to the RISC-V entry for Microchip platforms. 
Signed-off-by: Conor Dooley (cherry picked from commit 587c0a5e810b72c93fa44ee06d60dd555f52360b) Signed-off-by: Jiandi An --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9a83ca44c7d29..da0c686bc7994 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21748,6 +21748,8 @@ F: drivers/pci/controller/plda/pcie-microchip-host.c F: drivers/pwm/pwm-microchip-core.c F: drivers/reset/reset-mpfs.c F: drivers/rtc/rtc-mpfs.c +F: drivers/soc/microchip/mpfs-control-scb.c +F: drivers/soc/microchip/mpfs-mss-top-sysreg.c F: drivers/soc/microchip/mpfs-sys-controller.c F: drivers/spi/spi-microchip-core-qspi.c F: drivers/spi/spi-microchip-core.c From ed7e3decfd152b60633d7c849e91e433ac1395fc Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 10 Nov 2025 11:23:54 +0000 Subject: [PATCH 059/143] MAINTAINERS: rename Microchip RISC-V entry There are now non-FPGA RISC-V SoCs from Microchip, so rename the entry to reflect that. Signed-off-by: Conor Dooley (cherry picked from commit 66c6ceb41ed375773491c5d024167a2cbe6fe944) Signed-off-by: Jiandi An --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index da0c686bc7994..0dda21bf8d21d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21722,7 +21722,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git F: Documentation/devicetree/bindings/iommu/riscv,iommu.yaml F: drivers/iommu/riscv/ -RISC-V MICROCHIP FPGA SUPPORT +RISC-V MICROCHIP SUPPORT M: Conor Dooley M: Daire McNamara L: linux-riscv@lists.infradead.org From 6e5191437c9a035636ae988b11eded02ad6e4530 Mon Sep 17 00:00:00 2001 From: Pierre-Henry Moussay Date: Mon, 17 Nov 2025 14:24:37 +0000 Subject: [PATCH 060/143] dt-bindings: cache: sifive,ccache0: add a pic64gx compatible The pic64gx uses the same IP as mpfs, therefore add compatibility with mpfs as fallback.
Signed-off-by: Pierre-Henry Moussay Acked-by: Rob Herring (Arm) Signed-off-by: Conor Dooley (cherry picked from commit d52341da4db0cd993d3549aa20cbdf063b412c3b) Signed-off-by: Jiandi An --- Documentation/devicetree/bindings/cache/sifive,ccache0.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml b/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml index 579bacb66f348..c0e5ebb1fa4c7 100644 --- a/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml +++ b/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml @@ -48,6 +48,11 @@ properties: - const: microchip,mpfs-ccache - const: sifive,fu540-c000-ccache - const: cache + - items: + - const: microchip,pic64gx-ccache + - const: microchip,mpfs-ccache + - const: sifive,fu540-c000-ccache + - const: cache cache-block-size: const: 64 From 6485a3d7e1155d54bd32651adf5db3beff31f14c Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:54 +0000 Subject: [PATCH 061/143] memregion: Drop unused IORES_DESC_* parameter from cpu_cache_invalidate_memregion() The res_desc parameter was originally introduced for documentation purposes and with the idea that with HDM-DB CXL invalidation could be triggered from the device. That has not come to pass and the continued existence of the option is confusing when we add a range in the following patch which might not be a strict subset of the res_desc. So avoid that confusion by dropping the parameter. 
Link: https://lore.kernel.org/linux-mm/686eedb25ed02_24471002e@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Dan Williams Suggested-by: Dan Williams Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley (cherry picked from commit f49ae86483c494ddc793d889f6df5ea68d138569) Signed-off-by: Jiandi An --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 2 +- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 7 +++---- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 8834c76f91c9e..4019b17fb65ed 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(int res_desc) +int cpu_cache_invalidate_memregion(void) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b251cb998892a..ffab8efa3d66d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -236,7 +236,7 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(IORES_DESC_CXL); + cpu_cache_invalidate_memregion(); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 88dc062af5f84..c43506448edf8 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. 
*/ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index de1ee5ebc8516..3cdd93d40997f 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c013214677897..945646bde825c 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -26,8 +26,7 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for - * memregions described by @res_desc - * @res_desc: one of the IORES_DESC_* types + * memregion * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -46,7 +45,7 @@ static inline void memregion_free(int id) * the cache maintenance. 
*/ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(int res_desc); +int cpu_cache_invalidate_memregion(void); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -54,7 +53,7 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(int res_desc) +static inline int cpu_cache_invalidate_memregion(void) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; From cb2c6f8d76b26fa3017df1c671db3f2efe8d9fa0 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:55 +0000 Subject: [PATCH 062/143] memregion: Support fine grained invalidate by cpu_cache_invalidate_memregion() Extend cpu_cache_invalidate_memregion() to support invalidating a particular range of memory by introducing start and length parameters. Control of types of invalidation is left for when use cases turn up. For now everything is Clean and Invalidate. Where the range is unknown, use the provided cpu_cache_invalidate_all() helper to act as documentation of intent in a fashion that is clearer than passing (0, -1) to cpu_cache_invalidate_memregion(). 
Signed-off-by: Yicong Yang Reviewed-by: Dan Williams Acked-by: Davidlohr Bueso Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley (cherry picked from commit b43652d867cf2a5f31b14e3d9a320ad01fca0992) Signed-off-by: Jiandi An --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 5 ++++- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 13 +++++++++++-- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 4019b17fb65ed..292c7202faed9 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(void) +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ffab8efa3d66d..2ef7ac530f4d8 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -236,7 +236,10 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(); + if (!cxlr->params.res) + return -ENXIO; + cpu_cache_invalidate_memregion(cxlr->params.res->start, + resource_size(cxlr->params.res)); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index c43506448edf8..42e982db5b049 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. 
*/ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3cdd93d40997f..e27fc380f6c0b 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 945646bde825c..a55f62cc52660 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -27,6 +27,9 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for * memregion + * @start: start physical address of the target memory region. + * @len: length of the target memory region. -1 for all the regions of + * the target type. * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -45,7 +48,7 @@ static inline void memregion_free(int id) * the cache maintenance. 
*/ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(void); +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -53,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(void) +static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; } #endif + +static inline int cpu_cache_invalidate_all(void) +{ + return cpu_cache_invalidate_memregion(0, -1); +} + #endif /* _MEMREGION_H_ */ From 7ac957b1917c29e39effa2a24967a7717341fdc3 Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Wed, 19 Nov 2025 10:52:22 +0800 Subject: [PATCH 063/143] dt-bindings: arm: amlogic: meson-gx-ao-secure: support more SoCs Add new compatible for ao-secure of Amlogic SoCs(S6,S7,S7D). 
Acked-by: Conor Dooley Signed-off-by: Xianwei Zhao Link: https://patch.msgid.link/20251119-soc-info-s6-s7-s7d-v3-1-1764c1995c04@amlogic.com Signed-off-by: Neil Armstrong (cherry picked from commit 1d80bed4e35710287c584f998e51980a34fb3a4e) Signed-off-by: Jiandi An --- .../bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml index b4f6695a60152..fa7c403c874a6 100644 --- a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml +++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml @@ -34,6 +34,9 @@ properties: - amlogic,a4-ao-secure - amlogic,c3-ao-secure - amlogic,s4-ao-secure + - amlogic,s6-ao-secure + - amlogic,s7-ao-secure + - amlogic,s7d-ao-secure - amlogic,t7-ao-secure - const: amlogic,meson-gx-ao-secure - const: syscon From 86f030df5e131ffb24b562a37e7450ac4bb9a689 Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Wed, 19 Nov 2025 10:52:23 +0800 Subject: [PATCH 064/143] soc: amlogic: meson-gx-socinfo: add new SoCs id Add new definition for Amlogic SoCs, include S6, S7, S7D. 
Reviewed-by: Neil Armstrong Reviewed-by: Martin Blumenstingl Signed-off-by: Xianwei Zhao Link: https://patch.msgid.link/20251119-soc-info-s6-s7-s7d-v3-2-1764c1995c04@amlogic.com Signed-off-by: Neil Armstrong (cherry picked from commit ba8abbdfd09e64f51ead8b86afc6b586505919b4) Signed-off-by: Jiandi An --- drivers/soc/amlogic/meson-gx-socinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/soc/amlogic/meson-gx-socinfo.c b/drivers/soc/amlogic/meson-gx-socinfo.c index 7549f1644e5ea..2a54ca43cd13e 100644 --- a/drivers/soc/amlogic/meson-gx-socinfo.c +++ b/drivers/soc/amlogic/meson-gx-socinfo.c @@ -46,6 +46,9 @@ static const struct meson_gx_soc_id { { "A5", 0x3c }, { "C3", 0x3d }, { "A4", 0x40 }, + { "S7", 0x46 }, + { "S7D", 0x47 }, + { "S6", 0x48 }, }; static const struct meson_gx_package_id { @@ -86,6 +89,9 @@ static const struct meson_gx_package_id { { "A311D2", 0x36, 0x1, 0xf }, { "A113X2", 0x3c, 0x1, 0xf }, { "A113L2", 0x40, 0x1, 0xf }, + { "S805X3", 0x46, 0x3, 0xf }, + { "S905X5M", 0x47, 0x1, 0xf }, + { "S905X5", 0x48, 0x1, 0xf }, }; static inline unsigned int socinfo_to_major(u32 socinfo) From e4291ec0e847fb5603071ba2aac11274349fe733 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:56 +0000 Subject: [PATCH 065/143] lib: Support ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION provides the mechanism for invalidating certain memory regions in a cache-incoherent manner. Currently this is used by NVDIMM and CXL memory drivers in cases where it is necessary to flush all data from caches by physical address range. The operations in question are effectively memory hotplug, where stale data might otherwise remain in the caches. This is separate from the invalidates done to enable use of non-coherent DMA masters, primarily in terms of when it is needed (not related to DMA mappings) and how deep the flush must push data. 
The flushes done for non-coherent DMA only need to reach the Point of Coherence of a single host (which is often nearer CPUs and DMA masters than the physical storage). This operation must push the data out of non architectural caches (memory-side caches, write buffers etc) and typically all the way to the memory device. In some architectures these operations are supported by system components that may become available only later in boot as they are either present on a discoverable bus, or via a firmware description of an MMIO interface (e.g. ACPI DSDT). Provide a framework to handle this case. Architectures can opt in for this support via CONFIG_GENERIC_CPU_CACHE_MAINTENANCE Add a registration framework. Each driver provides an ops structure and the first op is Write Back and Invalidate by PA Range. The driver may over invalidate. For systems that can perform this operation asynchronously an optional completion check operation is also provided. If present that must be called to ensure that the action has finished. This provides a considerable performance advantage if multiple agents are involved in the maintenance operation. When multiple agents are present in the system each should register with this framework and the core code will issue the invalidate to all of them before checking for completion on each. This is done to avoid need for filtering in the core code which can become complex when interleave, potentially across different cache coherency hardware is going on, so it is easier to tell everyone and let those who don't care do nothing. 
Signed-off-by: Yicong Yang Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Acked-by: Conor Dooley Signed-off-by: Conor Dooley (cherry picked from commit c460697d3472d4252917fba9bbc1d1a23eafc124) Signed-off-by: Jiandi An --- include/linux/cache_coherency.h | 61 ++++++++++++++ lib/Kconfig | 3 + lib/Makefile | 2 + lib/cache_maint.c | 138 ++++++++++++++++++++++++++++++++ 4 files changed, 204 insertions(+) create mode 100644 include/linux/cache_coherency.h create mode 100644 lib/cache_maint.c diff --git a/include/linux/cache_coherency.h b/include/linux/cache_coherency.h new file mode 100644 index 0000000000000..cc81c5733e316 --- /dev/null +++ b/include/linux/cache_coherency.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cache coherency maintenance operation device drivers + * + * Copyright Huawei 2025 + */ +#ifndef _LINUX_CACHE_COHERENCY_H_ +#define _LINUX_CACHE_COHERENCY_H_ + +#include +#include +#include + +struct cc_inval_params { + phys_addr_t addr; + size_t size; +}; + +struct cache_coherency_ops_inst; + +struct cache_coherency_ops { + int (*wbinv)(struct cache_coherency_ops_inst *cci, + struct cc_inval_params *invp); + int (*done)(struct cache_coherency_ops_inst *cci); +}; + +struct cache_coherency_ops_inst { + struct kref kref; + struct list_head node; + const struct cache_coherency_ops *ops; +}; + +int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci); +void cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci); + +struct cache_coherency_ops_inst * +_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops, + size_t size); +/** + * cache_coherency_ops_instance_alloc - Allocate cache coherency ops instance + * @ops: Cache maintenance operations + * @drv_struct: structure that contains the struct cache_coherency_ops_inst + * @member: Name of the struct cache_coherency_ops_inst member in @drv_struct. 
+ * + * This allocates a driver specific structure and initializes the + * cache_coherency_ops_inst embedded in the drv_struct. Upon success the + * pointer must be freed via cache_coherency_ops_instance_put(). + * + * Returns a &drv_struct * on success, %NULL on error. + */ +#define cache_coherency_ops_instance_alloc(ops, drv_struct, member) \ + ({ \ + static_assert(__same_type(struct cache_coherency_ops_inst, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_cache_coherency_ops_instance_alloc(ops, \ + sizeof(drv_struct)); \ + }) +void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci); + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index c483951b624ff..cd8e5844f9bb6 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -543,6 +543,9 @@ config MEMREGION config ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool +config GENERIC_CPU_CACHE_MAINTENANCE + bool + config ARCH_HAS_MEMREMAP_COMPAT_ALIGN bool diff --git a/lib/Makefile b/lib/Makefile index 392ff808c9b90..eed20c50f358b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -130,6 +130,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o +obj-$(CONFIG_GENERIC_CPU_CACHE_MAINTENANCE) += cache_maint.o + lib-y += logic_pio.o lib-$(CONFIG_INDIRECT_IOMEM) += logic_iomem.o diff --git a/lib/cache_maint.c b/lib/cache_maint.c new file mode 100644 index 0000000000000..9256a9ffc34c7 --- /dev/null +++ b/lib/cache_maint.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic support for Memory System Cache Maintenance operations. + * + * Coherency maintenance drivers register with this simple framework that will + * iterate over each registered instance to first kick off invalidation and + * then to wait until it is complete. + * + * If no implementations are registered yet cpu_cache_has_invalidate_memregion() + * will return false. 
If this runs concurrently with unregistration then a + * race exists but this is no worse than the case where the operations instance + * responsible for a given memory region has not yet registered. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(cache_ops_instance_list); +static DECLARE_RWSEM(cache_ops_instance_list_lock); + +static void __cache_coherency_ops_instance_free(struct kref *kref) +{ + struct cache_coherency_ops_inst *cci = + container_of(kref, struct cache_coherency_ops_inst, kref); + kfree(cci); +} + +void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci) +{ + kref_put(&cci->kref, __cache_coherency_ops_instance_free); +} +EXPORT_SYMBOL_GPL(cache_coherency_ops_instance_put); + +static int cache_inval_one(struct cache_coherency_ops_inst *cci, void *data) +{ + if (!cci->ops) + return -EINVAL; + + return cci->ops->wbinv(cci, data); +} + +static int cache_inval_done_one(struct cache_coherency_ops_inst *cci) +{ + if (!cci->ops) + return -EINVAL; + + if (!cci->ops->done) + return 0; + + return cci->ops->done(cci); +} + +static int cache_invalidate_memregion(phys_addr_t addr, size_t size) +{ + int ret; + struct cache_coherency_ops_inst *cci; + struct cc_inval_params params = { + .addr = addr, + .size = size, + }; + + guard(rwsem_read)(&cache_ops_instance_list_lock); + list_for_each_entry(cci, &cache_ops_instance_list, node) { + ret = cache_inval_one(cci, ¶ms); + if (ret) + return ret; + } + list_for_each_entry(cci, &cache_ops_instance_list, node) { + ret = cache_inval_done_one(cci); + if (ret) + return ret; + } + + return 0; +} + +struct cache_coherency_ops_inst * +_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops, + size_t size) +{ + struct cache_coherency_ops_inst *cci; + + if (!ops || !ops->wbinv) + return NULL; + + cci = kzalloc(size, GFP_KERNEL); + if (!cci) + return NULL; + + cci->ops = ops; + INIT_LIST_HEAD(&cci->node); + 
kref_init(&cci->kref); + + return cci; +} +EXPORT_SYMBOL_NS_GPL(_cache_coherency_ops_instance_alloc, "CACHE_COHERENCY"); + +int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci) +{ + guard(rwsem_write)(&cache_ops_instance_list_lock); + list_add(&cci->node, &cache_ops_instance_list); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cache_coherency_ops_instance_register, "CACHE_COHERENCY"); + +void cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci) +{ + guard(rwsem_write)(&cache_ops_instance_list_lock); + list_del(&cci->node); +} +EXPORT_SYMBOL_NS_GPL(cache_coherency_ops_instance_unregister, "CACHE_COHERENCY"); + +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) +{ + return cache_invalidate_memregion(start, len); +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); + +/* + * Used for optimization / debug purposes only as removal can race + * + * Machines that do not support invalidation, e.g. VMs, will not have any + * operations instance to register and so this will always return false. + */ +bool cpu_cache_has_invalidate_memregion(void) +{ + guard(rwsem_read)(&cache_ops_instance_list_lock); + return !list_empty(&cache_ops_instance_list); +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); From d76476aeb05d8f363dd3e5cfb058a37ee42a2247 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:57 +0000 Subject: [PATCH 066/143] arm64: Select GENERIC_CPU_CACHE_MAINTENANCE The generic CPU cache maintenance framework provides a way to register drivers for devices implementing the underlying support for cpu_cache_has_invalidate_memregion(). Enable it for arm64 by selecting GENERIC_CPU_CACHE_MAINTENANCE which provides the implementation for, and in turn selects, ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION. 
Signed-off-by: Jonathan Cameron Acked-by: Catalin Marinas Signed-off-by: Conor Dooley (cherry picked from commit 4d873c5dc3ed5a189a39fcbddad8bcd2bd2a1785) Signed-off-by: Jiandi An --- arch/arm64/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1684ec2454369..70f023a98af99 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CC_PLATFORM + select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -146,6 +147,7 @@ config ARM64 select GENERIC_ARCH_TOPOLOGY select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_CACHE_MAINTENANCE select GENERIC_CPU_DEVICES select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP From 8a2c90f2b4852e3abacce200db8b981e0c305153 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:58 +0000 Subject: [PATCH 067/143] MAINTAINERS: Add Jonathan Cameron to drivers/cache and add lib/cache_maint.c + header Seems unfair to inflict the cache-coherency drivers on Conor with out also stepping up as a second maintainer for drivers/cache. Include the library support for cache-coherency maintenance drivers to the existing entry. 
Signed-off-by: Jonathan Cameron Acked-by: Conor Dooley Signed-off-by: Conor Dooley (cherry picked from commit 9b9de5a56a62c86472848ec7d48ca939411511e6) Signed-off-by: Jiandi An --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0dda21bf8d21d..13e62f538099b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24050,10 +24050,13 @@ F: drivers/staging/ STANDALONE CACHE CONTROLLER DRIVERS M: Conor Dooley +M: Jonathan Cameron S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ F: Documentation/devicetree/bindings/cache/ F: drivers/cache +F: include/cache_coherency.h +F: lib/cache_maint.c STARFIRE/DURALAN NETWORK DRIVER M: Ion Badulescu From 809b667fd62a01f19ce494f252d33e3a66206835 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:59 +0000 Subject: [PATCH 068/143] cache: Make top level Kconfig menu a boolean dependent on RISCV The next patch will add a new type of cache maintenance driver responsible for flushing deeper than is necessary for non coherent DMA (current use case of drivers/cache drivers), as needed when performing operations such as memory hotplug and security unlocking of persistent memory. The two types of operation are similar enough to share a drivers/cache directory and MAINTAINERS but are otherwise currently unrelated. To avoid confusion have two separate menus. Each has dependencies that are implemented by making them boolean symbols, here CACHEMAINT_FOR_DMA which is dependent on RISCV as all drivers are currently for platforms of that architecture. Set new symbol default to y to avoid breaking existing configs. This has no effect on actual code built, just visibility of the menu.
Suggested-by: Arnd Bergmann Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley (cherry picked from commit 4d1608d0ab3365d1ef9447bdbc0cb4c0962f1774) Signed-off-by: Jiandi An --- drivers/cache/Kconfig | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/cache/Kconfig b/drivers/cache/Kconfig index db51386c663a6..59a79df4c0ce7 100644 --- a/drivers/cache/Kconfig +++ b/drivers/cache/Kconfig @@ -1,9 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 -menu "Cache Drivers" + +menuconfig CACHEMAINT_FOR_DMA + bool "Cache management for noncoherent DMA" + depends on RISCV + default y + help + These drivers implement support for noncoherent DMA master devices + on platforms that lack the standard CPU interfaces for this. + +if CACHEMAINT_FOR_DMA config AX45MP_L2_CACHE bool "Andes Technology AX45MP L2 Cache controller" - depends on RISCV select RISCV_NONSTANDARD_CACHE_OPS help Support for the L2 cache controller on Andes Technology AX45MP platforms. @@ -16,7 +24,6 @@ config SIFIVE_CCACHE config STARFIVE_STARLINK_CACHE bool "StarFive StarLink Cache controller" - depends on RISCV depends on ARCH_STARFIVE depends on 64BIT select RISCV_DMA_NONCOHERENT @@ -24,4 +31,4 @@ config STARFIVE_STARLINK_CACHE help Support for the StarLink cache controller IP from StarFive. -endmenu +endif #CACHEMAINT_FOR_DMA From 1cbb3506ec12ab30ae278f01c0769e4f3e5dc6cc Mon Sep 17 00:00:00 2001 From: Yushan Wang Date: Mon, 17 Nov 2025 10:48:00 +0000 Subject: [PATCH 069/143] cache: Support cache maintenance for HiSilicon SoC Hydra Home Agent Hydra Home Agent is a device used to maintain cache coherency. Add support for explicit cache maintenance operations using it. A system has multiple of these agents. Whilst only one agent is responsible for a given cache line, interleave means that for a range operation, responsibility for the cache lines making up the range will typically be spread across multiple instances. Put this driver on a new Kconfig menu under drivers/cache. 
The short description as memory hotplug like operations is intended to cover the somewhat complex set of cases where this unit applies and differentiate it clearly from typical non coherent DMA flows. Co-developed-by: Yicong Yang Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley (cherry picked from commit 2ec3b54a6ff04046c07b7050d02321e406c4dcd1) Signed-off-by: Jiandi An --- drivers/cache/Kconfig | 22 ++++ drivers/cache/Makefile | 2 + drivers/cache/hisi_soc_hha.c | 194 +++++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+) create mode 100644 drivers/cache/hisi_soc_hha.c diff --git a/drivers/cache/Kconfig b/drivers/cache/Kconfig index 59a79df4c0ce7..1518449d47b51 100644 --- a/drivers/cache/Kconfig +++ b/drivers/cache/Kconfig @@ -32,3 +32,25 @@ config STARFIVE_STARLINK_CACHE Support for the StarLink cache controller IP from StarFive. endif #CACHEMAINT_FOR_DMA + +menuconfig CACHEMAINT_FOR_HOTPLUG + bool "Cache management for memory hot plug like operations" + depends on GENERIC_CPU_CACHE_MAINTENANCE + help + These drivers implement cache management for flows where it is necessary + to flush data from all host caches. + +if CACHEMAINT_FOR_HOTPLUG + +config HISI_SOC_HHA + tristate "HiSilicon Hydra Home Agent (HHA) device driver" + depends on (ARM64 && ACPI) || COMPILE_TEST + help + The Hydra Home Agent (HHA) is responsible for cache coherency + on the SoC. This drivers enables the cache maintenance functions of + the HHA. + + This driver can be built as a module. If so, the module will be + called hisi_soc_hha. 
+ +endif #CACHEMAINT_FOR_HOTPLUG diff --git a/drivers/cache/Makefile b/drivers/cache/Makefile index 55c5e851034da..b3362b15d6c15 100644 --- a/drivers/cache/Makefile +++ b/drivers/cache/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_AX45MP_L2_CACHE) += ax45mp_cache.o obj-$(CONFIG_SIFIVE_CCACHE) += sifive_ccache.o obj-$(CONFIG_STARFIVE_STARLINK_CACHE) += starfive_starlink_cache.o + +obj-$(CONFIG_HISI_SOC_HHA) += hisi_soc_hha.o diff --git a/drivers/cache/hisi_soc_hha.c b/drivers/cache/hisi_soc_hha.c new file mode 100644 index 0000000000000..25ff0f5ae79b3 --- /dev/null +++ b/drivers/cache/hisi_soc_hha.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for HiSilicon Hydra Home Agent (HHA). + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + * Author: Yicong Yang + * Yushan Wang + * + * A system typically contains multiple HHAs. Each is responsible for a subset + * of the physical addresses in the system, but interleave can make the mapping + * from a particular cache line to a responsible HHA complex. As such no + * filtering is done in the driver, with the hardware being responsible for + * responding with success for even if it was not responsible for any addresses + * in the range on which the operation was requested. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HISI_HHA_CTRL 0x5004 +#define HISI_HHA_CTRL_EN BIT(0) +#define HISI_HHA_CTRL_RANGE BIT(1) +#define HISI_HHA_CTRL_TYPE GENMASK(3, 2) +#define HISI_HHA_START_L 0x5008 +#define HISI_HHA_START_H 0x500c +#define HISI_HHA_LEN_L 0x5010 +#define HISI_HHA_LEN_H 0x5014 + +/* The maintain operation performs in a 128 Byte granularity */ +#define HISI_HHA_MAINT_ALIGN 128 + +#define HISI_HHA_POLL_GAP_US 10 +#define HISI_HHA_POLL_TIMEOUT_US 50000 + +struct hisi_soc_hha { + /* Must be first element */ + struct cache_coherency_ops_inst cci; + /* Locks HHA instance to forbid overlapping access. 
*/ + struct mutex lock; + void __iomem *base; +}; + +static bool hisi_hha_cache_maintain_wait_finished(struct hisi_soc_hha *soc_hha) +{ + u32 val; + + return !readl_poll_timeout_atomic(soc_hha->base + HISI_HHA_CTRL, val, + !(val & HISI_HHA_CTRL_EN), + HISI_HHA_POLL_GAP_US, + HISI_HHA_POLL_TIMEOUT_US); +} + +static int hisi_soc_hha_wbinv(struct cache_coherency_ops_inst *cci, + struct cc_inval_params *invp) +{ + struct hisi_soc_hha *soc_hha = + container_of(cci, struct hisi_soc_hha, cci); + phys_addr_t top, addr = invp->addr; + size_t size = invp->size; + u32 reg; + + if (!size) + return -EINVAL; + + addr = ALIGN_DOWN(addr, HISI_HHA_MAINT_ALIGN); + top = ALIGN(addr + size, HISI_HHA_MAINT_ALIGN); + size = top - addr; + + guard(mutex)(&soc_hha->lock); + + if (!hisi_hha_cache_maintain_wait_finished(soc_hha)) + return -EBUSY; + + /* + * Hardware will search for addresses ranging [addr, addr + size - 1], + * last byte included, and perform maintenance in 128 byte granules + * on those cachelines which contain the addresses. If a given instance + * is either not responsible for a cacheline or that cacheline is not + * currently present then the search will fail, no operation will be + * necessary and the device will report success. 
+ */ + size -= 1; + + writel(lower_32_bits(addr), soc_hha->base + HISI_HHA_START_L); + writel(upper_32_bits(addr), soc_hha->base + HISI_HHA_START_H); + writel(lower_32_bits(size), soc_hha->base + HISI_HHA_LEN_L); + writel(upper_32_bits(size), soc_hha->base + HISI_HHA_LEN_H); + + reg = FIELD_PREP(HISI_HHA_CTRL_TYPE, 1); /* Clean Invalid */ + reg |= HISI_HHA_CTRL_RANGE | HISI_HHA_CTRL_EN; + writel(reg, soc_hha->base + HISI_HHA_CTRL); + + return 0; +} + +static int hisi_soc_hha_done(struct cache_coherency_ops_inst *cci) +{ + struct hisi_soc_hha *soc_hha = + container_of(cci, struct hisi_soc_hha, cci); + + guard(mutex)(&soc_hha->lock); + if (!hisi_hha_cache_maintain_wait_finished(soc_hha)) + return -ETIMEDOUT; + + return 0; +} + +static const struct cache_coherency_ops hha_ops = { + .wbinv = hisi_soc_hha_wbinv, + .done = hisi_soc_hha_done, +}; + +static int hisi_soc_hha_probe(struct platform_device *pdev) +{ + struct hisi_soc_hha *soc_hha; + struct resource *mem; + int ret; + + soc_hha = cache_coherency_ops_instance_alloc(&hha_ops, + struct hisi_soc_hha, cci); + if (!soc_hha) + return -ENOMEM; + + platform_set_drvdata(pdev, soc_hha); + + mutex_init(&soc_hha->lock); + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) { + ret = -ENOMEM; + goto err_free_cci; + } + + soc_hha->base = ioremap(mem->start, resource_size(mem)); + if (!soc_hha->base) { + ret = dev_err_probe(&pdev->dev, -ENOMEM, + "failed to remap io memory"); + goto err_free_cci; + } + + ret = cache_coherency_ops_instance_register(&soc_hha->cci); + if (ret) + goto err_iounmap; + + return 0; + +err_iounmap: + iounmap(soc_hha->base); +err_free_cci: + cache_coherency_ops_instance_put(&soc_hha->cci); + return ret; +} + +static void hisi_soc_hha_remove(struct platform_device *pdev) +{ + struct hisi_soc_hha *soc_hha = platform_get_drvdata(pdev); + + cache_coherency_ops_instance_unregister(&soc_hha->cci); + iounmap(soc_hha->base); + cache_coherency_ops_instance_put(&soc_hha->cci); +} + +static const 
struct acpi_device_id hisi_soc_hha_ids[] = { + { "HISI0511", }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_soc_hha_ids); + +static struct platform_driver hisi_soc_hha_driver = { + .driver = { + .name = "hisi_soc_hha", + .acpi_match_table = hisi_soc_hha_ids, + }, + .probe = hisi_soc_hha_probe, + .remove = hisi_soc_hha_remove, +}; + +module_platform_driver(hisi_soc_hha_driver); + +MODULE_IMPORT_NS("CACHE_COHERENCY"); +MODULE_DESCRIPTION("HiSilicon Hydra Home Agent driver supporting cache maintenance"); +MODULE_AUTHOR("Yicong Yang "); +MODULE_AUTHOR("Yushan Wang "); +MODULE_LICENSE("GPL"); From 9fa4cc68151995fa46691a0df17ddf4c10f9dbd1 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 17 Nov 2025 11:53:11 +0100 Subject: [PATCH 070/143] MAINTAINERS: refer to intended file in STANDALONE CACHE CONTROLLER DRIVERS Commit 23db6eed72bd ("MAINTAINERS: Add Jonathan Cameron to drivers/cache and add lib/cache_maint.c + header") intends to add a file entry pointing to the cache_coherency.h file, but messes up to name the right path. Update the entry to the intended file. 
Signed-off-by: Lukas Bulwahn Acked-by: Jonathan Cameron Signed-off-by: Conor Dooley (cherry picked from commit 055bcc552b5181da208038c1de9437e9cca69380) Signed-off-by: Jiandi An --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 13e62f538099b..3be1319994827 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24055,7 +24055,7 @@ S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ F: Documentation/devicetree/bindings/cache/ F: drivers/cache -F: include/cache_coherency.h +F: include/linux/cache_coherency.h F: lib/cache_maint.c STARFIRE/DURALAN NETWORK DRIVER From bac9db58a0a7d1b68725fe693db3a420a6b3780d Mon Sep 17 00:00:00 2001 From: "Christophe Leroy (CS GROUP)" Date: Wed, 26 Nov 2025 12:26:57 +0100 Subject: [PATCH 071/143] MAINTAINERS: Update email address for Christophe Leroy My address at csgroup.eu is redirected to the new one at cs-soprasteria.com which is a Professional Microsoft account without SMTP gateway. We still have the SMTP gateway for csgroup.eu but it is not maintained anymore and might stop working at any time. In addition the DKIM signature is not performed although the domain has DMARC set up. Switch to kernel.org email address and add entries in mailmap.
Link: https://lore.kernel.org/r/d9b6758297d7dcddf79feb4459ceaedd7d6f1f2e.1764155757.git.chleroy@kernel.org Signed-off-by: Christophe Leroy (CS GROUP) (cherry picked from commit 3fca89b7756c5bb885e3a41df1443aa39f35951b) Signed-off-by: Jiandi An --- .mailmap | 3 +++ MAINTAINERS | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.mailmap b/.mailmap index 8db24be50158d..d64a8ed369b29 100644 --- a/.mailmap +++ b/.mailmap @@ -182,6 +182,9 @@ Christian Brauner Christian Brauner Christian Brauner Christian Marangi +Christophe Leroy +Christophe Leroy +Christophe Leroy Christophe Ricard Christopher Obbard Christoph Hellwig diff --git a/MAINTAINERS b/MAINTAINERS index 3be1319994827..1c7561495148e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4498,7 +4498,7 @@ F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) M: Hari Bathini -M: Christophe Leroy +M: Christophe Leroy (CS GROUP) R: Naveen N Rao L: bpf@vger.kernel.org S: Supported @@ -9871,7 +9871,7 @@ F: drivers/spi/spi-fsl-qspi.c FREESCALE QUICC ENGINE LIBRARY M: Qiang Zhao -M: Christophe Leroy +M: Christophe Leroy (CS GROUP) L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/soc/fsl/qe/ @@ -9924,7 +9924,7 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS -M: Christophe Leroy +M: Christophe Leroy (CS GROUP) L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -14117,7 +14117,7 @@ LINUX FOR POWERPC (32-BIT AND 64-BIT) M: Madhavan Srinivasan M: Michael Ellerman R: Nicholas Piggin -R: Christophe Leroy +R: Christophe Leroy (CS GROUP) L: linuxppc-dev@lists.ozlabs.org S: Supported W: https://github.com/linuxppc/wiki/wiki @@ -14173,7 +14173,7 @@ F: Documentation/devicetree/bindings/powerpc/fsl/ F: arch/powerpc/platforms/85xx/ LINUX FOR POWERPC EMBEDDED PPC8XX AND PPC83XX -M: Christophe Leroy +M: Christophe Leroy (CS GROUP) L: linuxppc-dev@lists.ozlabs.org S: Maintained F: 
arch/powerpc/platforms/8xx/ From 49109306411668d981ee0f67b4b2aec782511afa Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 7 Nov 2025 16:29:50 +0100 Subject: [PATCH 072/143] soc: fsl: qbman: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. 
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20251107152950.293899-1-marco.crivellari@suse.com Signed-off-by: Christophe Leroy (CS GROUP) (cherry picked from commit c181703a290a13c088ca2ac7b984ec8e676acb2b) Signed-off-by: Jiandi An --- drivers/soc/fsl/qbman/qman.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 9be240999f877..6b392b3ad4b15 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -1073,7 +1073,7 @@ EXPORT_SYMBOL(qman_portal_set_iperiod); int qman_wq_alloc(void) { - qm_portal_wq = alloc_workqueue("qman_portal_wq", 0, 1); + qm_portal_wq = alloc_workqueue("qman_portal_wq", WQ_PERCPU, 1); if (!qm_portal_wq) return -ENOMEM; return 0; From 72964cb0fb65c9197929dc7a8bd547e141e342c6 Mon Sep 17 00:00:00 2001 From: Gongwei Li Date: Fri, 21 Nov 2025 14:10:22 +0800 Subject: [PATCH 073/143] soc: fsl: qbman: use kmalloc_array() instead of kmalloc() Replace kmalloc() with kmalloc_array() to prevent potential overflow, as recommended in Documentation/process/deprecated.rst. 
Signed-off-by: Gongwei Li Reviewed-by: Fushuai Wang Link: https://lore.kernel.org/r/20251121061022.114609-1-13875017792@163.com Signed-off-by: Christophe Leroy (CS GROUP) (cherry picked from commit 760b8eec2cf861c5b013f62c4af8ee06c959853e) Signed-off-by: Jiandi An --- drivers/soc/fsl/qbman/qman_test_stash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/qman_test_stash.c b/drivers/soc/fsl/qbman/qman_test_stash.c index f4d3c2146f4f0..18131a5d5979a 100644 --- a/drivers/soc/fsl/qbman/qman_test_stash.c +++ b/drivers/soc/fsl/qbman/qman_test_stash.c @@ -219,7 +219,7 @@ static int allocate_frame_data(void) pcfg = qman_get_qm_portal_config(qman_dma_portal); - __frame_ptr = kmalloc(4 * HP_NUM_WORDS, GFP_KERNEL); + __frame_ptr = kmalloc_array(4, HP_NUM_WORDS, GFP_KERNEL); if (!__frame_ptr) return -ENOMEM; From 2f317e54cd68f9c546d40d6cac25a643a63dff81 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:11 -0800 Subject: [PATCH 074/143] cxl/mem: Fix devm_cxl_memdev_edac_release() confusion A device release method is only for undoing allocations on the path to preparing the device for device_add(). In contrast, devm allocations are post device_add(), are acquired during / after ->probe() and are released synchronous with ->remove(). So, a "devm" helper in a "release" method is a clear anti-pattern. Move this devm release action where it belongs, an action created at edac object creation time. Otherwise, this leaks resources until cxl_memdev_release() time which may be long after these xarray and error record caches have gone idle. Note, this also fixes up the type of @cxlmd->err_rec_array which needlessly dropped type-safety. 
Fixes: 0b5ccb0de1e2 ("cxl/edac: Support for finding memory operation attributes from the current boot") Cc: Dave Jiang Cc: Jonathan Cameron Cc: Shiju Jose Cc: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Shiju Jose Reviewed-by: Shiju Jose Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-2-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 10016118b6fade907143a32a7aeaa777063dc79c) Signed-off-by: Jiandi An --- drivers/cxl/core/edac.c | 64 ++++++++++++++++++++++----------------- drivers/cxl/core/memdev.c | 1 - drivers/cxl/cxlmem.h | 5 +-- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c index 79994ca9bc9f3..81160260e26b7 100644 --- a/drivers/cxl/core/edac.c +++ b/drivers/cxl/core/edac.c @@ -1988,6 +1988,40 @@ static int cxl_memdev_soft_ppr_init(struct cxl_memdev *cxlmd, return 0; } +static void err_rec_free(void *_cxlmd) +{ + struct cxl_memdev *cxlmd = _cxlmd; + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_gen_media *rec_gen_media; + struct cxl_event_dram *rec_dram; + unsigned long index; + + cxlmd->err_rec_array = NULL; + xa_for_each(&array_rec->rec_dram, index, rec_dram) + kfree(rec_dram); + xa_destroy(&array_rec->rec_dram); + + xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media) + kfree(rec_gen_media); + xa_destroy(&array_rec->rec_gen_media); + kfree(array_rec); +} + +static int devm_cxl_memdev_setup_err_rec(struct cxl_memdev *cxlmd) +{ + struct cxl_mem_err_rec *array_rec = + kzalloc(sizeof(*array_rec), GFP_KERNEL); + + if (!array_rec) + return -ENOMEM; + + xa_init(&array_rec->rec_gen_media); + xa_init(&array_rec->rec_dram); + cxlmd->err_rec_array = array_rec; + + return devm_add_action_or_reset(&cxlmd->dev, err_rec_free, cxlmd); +} + int devm_cxl_memdev_edac_register(struct 
cxl_memdev *cxlmd) { struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES]; @@ -2038,15 +2072,9 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) } if (repair_inst) { - struct cxl_mem_err_rec *array_rec = - devm_kzalloc(&cxlmd->dev, sizeof(*array_rec), - GFP_KERNEL); - if (!array_rec) - return -ENOMEM; - - xa_init(&array_rec->rec_gen_media); - xa_init(&array_rec->rec_dram); - cxlmd->err_rec_array = array_rec; + rc = devm_cxl_memdev_setup_err_rec(cxlmd); + if (rc) + return rc; } } @@ -2088,22 +2116,4 @@ int devm_cxl_region_edac_register(struct cxl_region *cxlr) } EXPORT_SYMBOL_NS_GPL(devm_cxl_region_edac_register, "CXL"); -void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd) -{ - struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; - struct cxl_event_gen_media *rec_gen_media; - struct cxl_event_dram *rec_dram; - unsigned long index; - - if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec) - return; - - xa_for_each(&array_rec->rec_dram, index, rec_dram) - kfree(rec_dram); - xa_destroy(&array_rec->rec_dram); - xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media) - kfree(rec_gen_media); - xa_destroy(&array_rec->rec_gen_media); -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_release, "CXL"); diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index e370d733e4400..4dff7f44d908e 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -27,7 +27,6 @@ static void cxl_memdev_release(struct device *dev) struct cxl_memdev *cxlmd = to_cxl_memdev(dev); ida_free(&cxl_memdev_ida, cxlmd->id); - devm_cxl_memdev_edac_release(cxlmd); kfree(cxlmd); } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 434031a0c1f74..c12ab4fc95123 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -63,7 +63,7 @@ struct cxl_memdev { int depth; u8 scrub_cycle; int scrub_region_id; - void *err_rec_array; + struct cxl_mem_err_rec *err_rec_array; }; static inline struct cxl_memdev *to_cxl_memdev(struct 
device *dev) @@ -877,7 +877,6 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd); int devm_cxl_region_edac_register(struct cxl_region *cxlr); int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt); int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt); -void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd); #else static inline int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) { return 0; } @@ -889,8 +888,6 @@ static inline int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, static inline int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt) { return 0; } -static inline void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd) -{ return; } #endif #ifdef CONFIG_CXL_SUSPEND From 604d26b32a6cffc33262c685baa80317a5ce2b5e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:12 -0800 Subject: [PATCH 075/143] cxl/mem: Arrange for always-synchronous memdev attach In preparation for CXL accelerator drivers that have a hard dependency on CXL capability initialization, arrange for cxl_mem_probe() to always run synchronous with the device_add() of cxl_memdev instances. I.e. cxl_mem_driver registration is always complete before the first memdev creation event. At present, cxl_pci does not care about the attach state of the cxl_memdev because all generic memory expansion functionality can be handled by the cxl_core. For accelerators, however, that driver needs to perform driver specific initialization if CXL is available, or execute a fallback to PCIe only operation. This synchronous attach guarantee is also needed for Soft Reserve Recovery, which is an effort that needs to assert that devices have had a chance to attach before making a go / no-go decision on proceeding with CXL subsystem initialization. By moving devm_cxl_add_memdev() to cxl_mem.ko it removes async module loading as one reason that a memdev may not be attached upon return from devm_cxl_add_memdev(). 
Cc: Smita Koralahalli Cc: Alejandro Lucero Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-3-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 1f1cb7f0c25574cf51501f8c8cece0047d7e8848) Signed-off-by: Jiandi An --- drivers/cxl/Kconfig | 2 +- drivers/cxl/core/memdev.c | 10 +++++++--- drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/mem.c | 17 +++++++++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb88..f1361ed6a0d48 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -22,6 +22,7 @@ if CXL_BUS config CXL_PCI tristate "PCI manageability" default CXL_BUS + select CXL_MEM help The CXL specification defines a "CXL memory device" sub-class in the PCI "memory controller" base class of devices. Device's identified by @@ -89,7 +90,6 @@ config CXL_PMEM config CXL_MEM tristate "CXL: Memory Expansion" - depends on CXL_PCI default CXL_BUS help The CXL.mem protocol allows a device to act as a provider of "System diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 4dff7f44d908e..7a4153e1c6a78 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1050,8 +1050,12 @@ static const struct file_operations cxl_memdev_fops = { .llseek = noop_llseek, }; -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +/* + * Core helper for devm_cxl_add_memdev() that wants to both create a device and + * assert to the caller that upon return cxl_mem::probe() has been invoked. 
+ */ +struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds) { struct cxl_memdev *cxlmd; struct device *dev; @@ -1093,7 +1097,7 @@ struct cxl_memdev *devm_cxl_add_memdev(struct device *host, put_device(dev); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); +EXPORT_SYMBOL_FOR_MODULES(__devm_cxl_add_memdev, "cxl_mem"); static void sanitize_teardown_notifier(void *data) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index c12ab4fc95123..012e68acad342 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,6 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } +struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds); struct cxl_memdev *devm_cxl_add_memdev(struct device *host, struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 6e6777b7bafb5..55883797ab2db 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -201,6 +201,22 @@ static int cxl_mem_probe(struct device *dev) return devm_add_action_or_reset(dev, enable_suspend, NULL); } +/** + * devm_cxl_add_memdev - Add a CXL memory device + * @host: devres alloc/release context and parent for the memdev + * @cxlds: CXL device state to associate with the memdev + * + * Upon return the device will have had a chance to attach to the + * cxl_mem driver, but may fail if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached) + */ +struct cxl_memdev *devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds) +{ + return __devm_cxl_add_memdev(host, cxlds); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); + static ssize_t trigger_poison_list_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -248,6 +264,7 @@ static struct cxl_driver cxl_mem_driver = { .probe = 
cxl_mem_probe, .id = CXL_DEVICE_MEMORY_EXPANDER, .drv = { + .probe_type = PROBE_FORCE_SYNCHRONOUS, .dev_groups = cxl_mem_groups, }, }; From 61c9f04507984a7b15c67deefad4911fff1350f5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:13 -0800 Subject: [PATCH 076/143] cxl/port: Arrange for always synchronous endpoint attach Make it so that upon return from devm_cxl_add_endpoint() that cxl_mem_probe() can assume that the endpoint has had a chance to complete cxl_port_probe(). I.e. cxl_port module loading has completed prior to device registration. Delete the MODULE_SOFTDEP() as it is not sufficient for this purpose, but a hard link-time dependency is reliable. Specifically MODULE_SOFTDEP() does not guarantee that the module loading has completed prior to the completion of the current module's init. Cc: Smita Koralahalli Cc: Alejandro Lucero Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-4-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit ae201a0092362ffdec7206efa1ec85e260fab8d2) Signed-off-by: Jiandi An --- drivers/cxl/cxl.h | 2 ++ drivers/cxl/mem.c | 43 ------------------------------------------- drivers/cxl/port.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249e..c796c3db36e0b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -780,6 +780,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct cxl_dport *parent_dport); struct cxl_root *devm_cxl_add_root(struct device *host, const struct cxl_root_ops *ops); +int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport); struct cxl_root *find_cxl_root(struct cxl_port *port); 
DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 55883797ab2db..d62931526fd41 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -45,44 +45,6 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data) return 0; } -static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, - struct cxl_dport *parent_dport) -{ - struct cxl_port *parent_port = parent_dport->port; - struct cxl_port *endpoint, *iter, *down; - int rc; - - /* - * Now that the path to the root is established record all the - * intervening ports in the chain. - */ - for (iter = parent_port, down = NULL; !is_cxl_root(iter); - down = iter, iter = to_cxl_port(iter->dev.parent)) { - struct cxl_ep *ep; - - ep = cxl_ep_load(iter, cxlmd); - ep->next = down; - } - - /* Note: endpoint port component registers are derived from @cxlds */ - endpoint = devm_cxl_add_port(host, &cxlmd->dev, CXL_RESOURCE_NONE, - parent_dport); - if (IS_ERR(endpoint)) - return PTR_ERR(endpoint); - - rc = cxl_endpoint_autoremove(cxlmd, endpoint); - if (rc) - return rc; - - if (!endpoint->dev.driver) { - dev_err(&cxlmd->dev, "%s failed probe\n", - dev_name(&endpoint->dev)); - return -ENXIO; - } - - return 0; -} - static int cxl_debugfs_poison_inject(void *data, u64 dpa) { struct cxl_memdev *cxlmd = data; @@ -275,8 +237,3 @@ MODULE_DESCRIPTION("CXL: Memory Expansion"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS("CXL"); MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); -/* - * create_endpoint() wants to validate port driver attach immediately after - * endpoint registration. 
- */ -MODULE_SOFTDEP("pre: cxl_port"); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 51c8f2f84717a..7937e7e53797c 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -156,10 +156,50 @@ static struct cxl_driver cxl_port_driver = { .probe = cxl_port_probe, .id = CXL_DEVICE_PORT, .drv = { + .probe_type = PROBE_FORCE_SYNCHRONOUS, .dev_groups = cxl_port_attribute_groups, }, }; +int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport) +{ + struct cxl_port *parent_port = parent_dport->port; + struct cxl_port *endpoint, *iter, *down; + int rc; + + /* + * Now that the path to the root is established record all the + * intervening ports in the chain. + */ + for (iter = parent_port, down = NULL; !is_cxl_root(iter); + down = iter, iter = to_cxl_port(iter->dev.parent)) { + struct cxl_ep *ep; + + ep = cxl_ep_load(iter, cxlmd); + ep->next = down; + } + + /* Note: endpoint port component registers are derived from @cxlds */ + endpoint = devm_cxl_add_port(host, &cxlmd->dev, CXL_RESOURCE_NONE, + parent_dport); + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + + rc = cxl_endpoint_autoremove(cxlmd, endpoint); + if (rc) + return rc; + + if (!endpoint->dev.driver) { + dev_err(&cxlmd->dev, "%s failed probe\n", + dev_name(&endpoint->dev)); + return -ENXIO; + } + + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(devm_cxl_add_endpoint, "cxl_mem"); + static int __init cxl_port_init(void) { return cxl_driver_register(&cxl_port_driver); From eb57cbc51ec38886dc1e88d48c6ec5dd134a3b69 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:14 -0800 Subject: [PATCH 077/143] cxl/mem: Convert devm_cxl_add_memdev() to scope-based-cleanup In preparation for adding more setup steps, convert the current implementation to scope-based cleanup. The cxl_memdev_shutdown() is only required after cdev_device_add(). 
With that moved to a helper function it precludes the need to add scope-based-handler for that cleanup if devm_add_action_or_reset() fails. Cc: Smita Koralahalli Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-5-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 6e1d21903ff213f1384ce43daa279c0965904116) Signed-off-by: Jiandi An --- drivers/cxl/core/memdev.c | 70 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 7a4153e1c6a78..92aea95859fb6 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1050,6 +1050,45 @@ static const struct file_operations cxl_memdev_fops = { .llseek = noop_llseek, }; +/* + * Activate ioctl operations, no cxl_memdev_rwsem manipulation needed as this is + * ordered with cdev_add() publishing the device. + */ +static int cxlmd_add(struct cxl_memdev *cxlmd, struct cxl_dev_state *cxlds) +{ + int rc; + + cxlmd->cxlds = cxlds; + cxlds->cxlmd = cxlmd; + + rc = cdev_device_add(&cxlmd->cdev, &cxlmd->dev); + if (rc) { + /* + * The cdev was briefly live, shutdown any ioctl operations that + * saw that state. + */ + cxl_memdev_shutdown(&cxlmd->dev); + return rc; + } + + return 0; +} + +DEFINE_FREE(put_cxlmd, struct cxl_memdev *, + if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) + +static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) +{ + int rc; + + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, + cxlmd); + if (rc) + return ERR_PTR(rc); + + return cxlmd; +} + /* * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. 
@@ -1057,45 +1096,24 @@ static const struct file_operations cxl_memdev_fops = { struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, struct cxl_dev_state *cxlds) { - struct cxl_memdev *cxlmd; struct device *dev; - struct cdev *cdev; int rc; - cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + struct cxl_memdev *cxlmd __free(put_cxlmd) = + cxl_memdev_alloc(cxlds, &cxl_memdev_fops); if (IS_ERR(cxlmd)) return cxlmd; dev = &cxlmd->dev; rc = dev_set_name(dev, "mem%d", cxlmd->id); if (rc) - goto err; - - /* - * Activate ioctl operations, no cxl_memdev_rwsem manipulation - * needed as this is ordered with cdev_add() publishing the device. - */ - cxlmd->cxlds = cxlds; - cxlds->cxlmd = cxlmd; - - cdev = &cxlmd->cdev; - rc = cdev_device_add(cdev, dev); - if (rc) - goto err; + return ERR_PTR(rc); - rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd); + rc = cxlmd_add(cxlmd, cxlds); if (rc) return ERR_PTR(rc); - return cxlmd; -err: - /* - * The cdev was briefly live, shutdown any ioctl operations that - * saw that state. - */ - cxl_memdev_shutdown(dev); - put_device(dev); - return ERR_PTR(rc); + return cxl_memdev_autoremove(no_free_ptr(cxlmd)); } EXPORT_SYMBOL_FOR_MODULES(__devm_cxl_add_memdev, "cxl_mem"); From 18ff2a23cb3f0484f9f8efb8cc7d41af995a3d97 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:15 -0800 Subject: [PATCH 078/143] cxl/mem: Drop @host argument to devm_cxl_add_memdev() In all cases the device that created the 'struct cxl_dev_state' instance is also the device to host the devm cleanup of devm_cxl_add_memdev(). This simplifies the function prototype, and limits a degree of freedom of the API. 
Cc: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-6-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit f2546eba53bbe38c4bb950f78625ccf4b1a2cbc8) Signed-off-by: Jiandi An --- drivers/cxl/core/memdev.c | 3 +-- drivers/cxl/cxlmem.h | 6 ++---- drivers/cxl/mem.c | 9 +++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 92aea95859fb6..935a163f1527d 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1093,8 +1093,7 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. 
*/ -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { struct device *dev; int rc; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 012e68acad342..9db31c7993c48 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,10 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index d62931526fd41..677996c652724 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -165,17 +165,18 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device - * @host: devres alloc/release context and parent for the memdev * @cxlds: CXL device state to associate with the memdev * * Upon return the device will have had a chance to attach to the * cxl_mem driver, but may fail if the CXL topology is not ready * (hardware CXL link down, or software platform CXL root not attached) + * + * The parent of the resulting device and the devm context for allocations is + * @cxlds->dev. 
*/ -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { - return __devm_cxl_add_memdev(host, cxlds); + return __devm_cxl_add_memdev(cxlds); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe7..1c6fc53348069 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 176dcde570cdd..8a22b76016273 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From 14289aa0bc9c7b88982ae27af756c6fbc476f9e4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:16 -0800 Subject: [PATCH 079/143] cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation Unlike the cxl_pci class driver that opportunistically enables memory expansion with no other dependent functionality, CXL accelerator drivers have distinct PCIe-only and CXL-enhanced operation states. If CXL is available some additional coherent memory/cache operations can be enabled, otherwise traditional DMA+MMIO over PCIe/CXL.io is a fallback. This constitutes a new mode of operation where the caller of devm_cxl_add_memdev() wants to make a "go/no-go" decision about running in CXL accelerated mode or falling back to PCIe-only operation. 
Part of that decision making process likely also includes additional CXL-acceleration-specific resource setup. Encapsulate both of those requirements into 'struct cxl_memdev_attach' that provides a ->probe() callback. The probe callback runs in cxl_mem_probe() context, after the port topology is successfully attached for the given memdev. It supports a contract where, upon successful return from devm_cxl_add_memdev(), everything needed for CXL accelerated operation has been enabled. Additionally the presence of @cxlmd->attach indicates that the accelerator driver be detached when CXL operation ends. This conceptually makes a CXL link loss event mirror a PCIe link loss event which results in triggering the ->remove() callback of affected devices+drivers. A driver can re-attach to recover back to PCIe-only operation. Live recovery, i.e. without a ->remove()/->probe() cycle, is left as a future consideration. [ dj: Replace with updated commit log from Dan ] Cc: Smita Koralahalli Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-7-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 29317f8dc6ed601ec54575689c2cd55cc470bcce) Signed-off-by: Jiandi An --- drivers/cxl/core/memdev.c | 33 +++++++++++++++++++++++++++++---- drivers/cxl/cxlmem.h | 12 ++++++++++-- drivers/cxl/mem.c | 20 ++++++++++++++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 935a163f1527d..af3d0cc651387 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -641,14 +641,24 @@ static void detach_memdev(struct work_struct *work) struct cxl_memdev *cxlmd; cxlmd = container_of(work, typeof(*cxlmd), detach_work); - device_release_driver(&cxlmd->dev); + + /* + * When the creator of @cxlmd
sets ->attach it indicates CXL operation + * is required. In that case, @cxlmd detach escalates to parent device + * detach. + */ + if (cxlmd->attach) + device_release_driver(cxlmd->dev.parent); + else + device_release_driver(&cxlmd->dev); put_device(&cxlmd->dev); } static struct lock_class_key cxl_memdev_key; static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, - const struct file_operations *fops) + const struct file_operations *fops, + const struct cxl_memdev_attach *attach) { struct cxl_memdev *cxlmd; struct device *dev; @@ -664,6 +674,8 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, goto err; cxlmd->id = rc; cxlmd->depth = -1; + cxlmd->attach = attach; + cxlmd->endpoint = ERR_PTR(-ENXIO); dev = &cxlmd->dev; device_initialize(dev); @@ -1081,6 +1093,18 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) { int rc; + /* + * If @attach is provided fail if the driver is not attached upon + * return. Note that failure here could be the result of a race to + * teardown the CXL port topology. I.e. cxl_mem_probe() could have + * succeeded and then cxl_mem unbound before the lock is acquired. + */ + guard(device)(&cxlmd->dev); + if (cxlmd->attach && !cxlmd->dev.driver) { + cxl_memdev_unregister(cxlmd); + return ERR_PTR(-ENXIO); + } + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, cxlmd); if (rc) @@ -1093,13 +1117,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. 
*/ -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { struct device *dev; int rc; struct cxl_memdev *cxlmd __free(put_cxlmd) = - cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + cxl_memdev_alloc(cxlds, &cxl_memdev_fops, attach); if (IS_ERR(cxlmd)) return cxlmd; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 9db31c7993c48..ef202b34e5ea4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,6 +34,10 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -43,6 +47,7 @@ * @cxl_nvb: coordinate removal of @cxl_nvd if present * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem * @endpoint: connection to the CXL port topology for this memory device + * @attach: creator of this memdev depends on CXL link attach to operate * @id: id number of this memdev instance. 
* @depth: endpoint port depth * @scrub_cycle: current scrub cycle set for this device @@ -59,6 +64,7 @@ struct cxl_memdev { struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_nvdimm *cxl_nvd; struct cxl_port *endpoint; + const struct cxl_memdev_attach *attach; int id; int depth; u8 scrub_cycle; @@ -95,8 +101,10 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 677996c652724..333c366b69e76 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -142,6 +142,12 @@ static int cxl_mem_probe(struct device *dev) return rc; } + if (cxlmd->attach) { + rc = cxlmd->attach->probe(cxlmd); + if (rc) + return rc; + } + rc = devm_cxl_memdev_edac_register(cxlmd); if (rc) dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc); @@ -166,17 +172,23 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device * @cxlds: CXL device state to associate with the memdev + * @attach: Caller depends on CXL topology attachment * * Upon return the device will have had a chance to attach to the - * cxl_mem driver, but may fail if the CXL topology is not ready - * (hardware CXL link down, or software platform CXL root not attached) + * cxl_mem driver, but may fail to attach if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached). 
+ * + * When @attach is NULL it indicates the caller wants the memdev to remain + * registered even if it does not immediately attach to the CXL hierarchy. When + * @attach is provided a cxl_mem_probe() failure leads to failure of this routine. * * The parent of the resulting device and the devm context for allocations is * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { - return __devm_cxl_add_memdev(cxlds); + return __devm_cxl_add_memdev(cxlds, attach); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 1c6fc53348069..549368a9c868f 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8a22b76016273..cb87e8c0e63c0 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From 91c8344a5d164078eaf354646c84bda3ace826ce Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:22 -0600 Subject: [PATCH 080/143] PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h The CXL DVSECs are currently defined in drivers/cxl/cxlpci.h. These are not accessible to other subsystems. Move these to uapi/linux/pci_regs.h. The CXL DVSEC definitions will be renamed and reformatted to fit better with existing defines.
Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Signed-off-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-2-terry.bowman@amd.com Signed-off-by: Dave Jiang (backported from commit 0f7afd80d81b739c4a9a6e4e24109ba1030c9c56) [jan: Resolve minor conflict due to common anchor not existing in 6.17] Signed-off-by: Jiandi An --- drivers/cxl/cxlpci.h | 53 --------------------------------- include/uapi/linux/pci_regs.h | 56 +++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 56 deletions(-) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 1d526bea84312..cdb7cf3dbcb43 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -7,59 +7,6 @@ #define CXL_MEMORY_PROGIF 0x10 -/* - * See section 8.1 Configuration Space Registers in the CXL 2.0 - * Specification. Names are taken straight from the specification with "CXL" and - * "DVSEC" redundancies removed. When obvious, abbreviations may be used. 
- */ -#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20) - -/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE BIT(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID BIT(0) -#define CXL_DVSEC_MEM_ACTIVE BIT(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) - -#define CXL_DVSEC_RANGE_MAX 2 - -/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 - -/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT_EXTENSIONS 3 - -/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8) - -/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 - -/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */ -#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7 - -/* CXL 2.0 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16) - /* * NOTE: Currently all the functions which are enabled for CXL require their * vectors to be in the 
first 16. Use this as the default max. diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index bfa9ada355c9b..8ea78d7e08a3d 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1239,9 +1239,59 @@ #define PCI_DVSEC_CXL_PORT_CTL 0x0c #define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 -/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ +/* + * Compute Express Link (CXL r3.2, sec 8.1) + * + * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state + * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * registers on downstream link-up events. + */ +#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) + +/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ #define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_CACHE_CAPABLE BIT(0) +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_CACHE_CAPABLE _BITUL(0) +#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) +#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) + +#define CXL_DVSEC_RANGE_MAX 2 + +/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ +#define CXL_DVSEC_FUNCTION_MAP 2 + +/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ +#define CXL_DVSEC_PORT 3 +#define CXL_DVSEC_PORT_CTL 0x0c +#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 + +/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ +#define CXL_DVSEC_PORT_GPF 4 +#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) +#define 
CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) +#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) + +/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ +#define CXL_DVSEC_DEVICE_GPF 5 + +/* CXL 3.2 8.1.9: Register Locator DVSEC */ +#define CXL_DVSEC_REG_LOCATOR 8 +#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC +#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) #endif /* LINUX_PCI_REGS_H */ From ed38046d9db5cc8dc07647503d9ddf1d2482833c Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:23 -0600 Subject: [PATCH 081/143] PCI: Update CXL DVSEC definitions CXL DVSEC definitions were recently moved into uapi/pci_regs.h, but the newly added macros do not follow the file's existing naming conventions. The current format uses CXL_DVSEC_XYZ, while the new CXL entries must instead use the PCI_DVSEC_CXL_XYZ prefix to match the conventions already established in pci_regs.h. The new CXL DVSEC macros also introduce _MASK and _OFFSET suffixes, which are not used anywhere else in the file. These suffixes lengthen the identifiers and reduce readability. Remove _MASK and _OFFSET from the recently added definitions. Additionally, remove PCI_DVSEC_HEADER1_LENGTH_MASK, as it duplicates the existing PCI_DVSEC_HEADER1_LEN() macro. Update all existing references to use the new macro names. Finally, update the inline documentation to reference the latest revision of the CXL specification.
Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-3-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 6612bd9ff0b1001cff5f5d79db6ce44427d2e99c) [jan: Resolve conflict due to commit ancestor anchor missing in 6.17 and CXL_DVSEC_CACHE_CAPABLE introduced by 72bd823fb4f1] Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 58 ++++++++++----------- drivers/cxl/core/regs.c | 14 ++--- drivers/cxl/pci.c | 2 +- drivers/pci/ats.c | 6 +-- include/uapi/linux/pci_regs.h | 96 ++++++++++++++++------------------- 5 files changed, 85 insertions(+), 91 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 5b023a0178a47..077b386e0c8d6 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -86,12 +86,12 @@ static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) i = 1; do { rc = pci_read_config_dword(pdev, - d + CXL_DVSEC_RANGE_SIZE_LOW(id), + d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + valid = FIELD_GET(PCI_DVSEC_CXL_MEM_INFO_VALID, temp); if (valid) break; msleep(1000); @@ -121,11 +121,11 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) /* Check MEM ACTIVE bit, up to 60s timeout by default */ for (i = media_ready_timeout; i; i--) { rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + active = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE, temp); if (active) break; msleep(1000); @@ -154,11 +154,11 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) u16 cap; rc = pci_read_config_word(pdev, - d + CXL_DVSEC_CAP_OFFSET, &cap); + d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = 
FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); for (i = 0; i < hdm_count; i++) { rc = cxl_dvsec_mem_range_valid(cxlds, i); if (rc) @@ -186,16 +186,16 @@ static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) u16 ctrl; int rc; - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc < 0) return rc; - if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val) + if ((ctrl & PCI_DVSEC_CXL_MEM_ENABLE) == val) return 1; - ctrl &= ~CXL_DVSEC_MEM_ENABLE; + ctrl &= ~PCI_DVSEC_CXL_MEM_ENABLE; ctrl |= val; - rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl); + rc = pci_write_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, ctrl); if (rc < 0) return rc; @@ -211,7 +211,7 @@ static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds) { int rc; - rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE); + rc = cxl_set_mem_enable(cxlds, PCI_DVSEC_CXL_MEM_ENABLE); if (rc < 0) return rc; if (rc > 0) @@ -273,11 +273,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return -ENXIO; } - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { + if (!(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; } @@ -288,7 +288,7 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * driver is for a spec defined class code which must be CXL.mem * capable, there is no point in continuing to enable CXL.mem. */ - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); if (!hdm_count || hdm_count > 2) return -EINVAL; @@ -297,11 +297,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. 
*/ - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc) return rc; - info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + info->mem_enabled = FIELD_GET(PCI_DVSEC_CXL_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; @@ -314,35 +314,35 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return rc; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i), &temp); if (rc) return rc; size = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(i), &temp); if (rc) return rc; - size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + size |= temp & PCI_DVSEC_CXL_MEM_SIZE_LOW; if (!size) { continue; } rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), &temp); if (rc) return rc; base = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), &temp); if (rc) return rc; - base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + base |= temp & PCI_DVSEC_CXL_MEM_BASE_LOW; info->dvsec_range[ranges++] = (struct range) { .start = base, @@ -1068,7 +1068,7 @@ u16 cxl_gpf_get_dvsec(struct device *dev) is_port = false; dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + is_port ? PCI_DVSEC_CXL_PORT_GPF : PCI_DVSEC_CXL_DEVICE_GPF); if (!dvsec) dev_warn(dev, "%s GPF DVSEC not present\n", is_port ? 
"Port" : "Device"); @@ -1084,14 +1084,14 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) switch (phase) { case 1: - offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE; break; case 2: - offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE; break; default: return -EINVAL; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 5ca7b0eed568b..a010b32143422 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,10 +271,10 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL"); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { - u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); - int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + u8 reg_type = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo); + int bar = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo); u64 offset = ((u64)reg_hi << 32) | - (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, @@ -311,15 +311,15 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty }; regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_REG_LOCATOR); + PCI_DVSEC_CXL_REG_LOCATOR); if (!regloc) return -ENXIO; pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); - regloc_size = 
FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size); - regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; - regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + regloc += PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1; + regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8; for (i = 0; i < regblocks; i++, regloc += 8) { u32 reg_lo, reg_hi; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 549368a9c868f..d03292e7b9b99 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -933,7 +933,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) cxlds->rcd = is_cxl_restricted(pdev); cxlds->serial = pci_get_dsn(pdev); cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); + pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); if (!cxlds->cxl_dvsec) dev_warn(&pdev->dev, "Device DVSEC not present, skip CXL.mem init\n"); diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 6db45ae2cc8e3..ae3152be018a7 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -218,12 +218,12 @@ static bool pci_cxl_ats_always_on(struct pci_dev *pdev) u16 cap; offset = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_PCIE_DEVICE); + PCI_DVSEC_CXL_DEVICE); if (!offset) return false; - pci_read_config_word(pdev, offset + CXL_DVSEC_CAP_OFFSET, &cap); - if (cap & CXL_DVSEC_CACHE_CAPABLE) + pci_read_config_word(pdev, offset + PCI_DVSEC_CXL_CAP, &cap); + if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) return true; return false; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 8ea78d7e08a3d..22e22cea2c4f2 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1234,64 +1234,58 @@ /* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */ #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE -/* Compute Express Link (CXL r3.1, sec 8.1.5) */ -#define 
PCI_DVSEC_CXL_PORT 3 -#define PCI_DVSEC_CXL_PORT_CTL 0x0c -#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 - /* - * Compute Express Link (CXL r3.2, sec 8.1) + * Compute Express Link (CXL r4.0, sec 8.1) * * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state - * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * is "disconnected" (CXL r4.0, sec 9.12.3). Re-enumerate these * registers on downstream link-up events. */ -#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) - -/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_CACHE_CAPABLE _BITUL(0) -#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) -#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE _BITUL(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) -#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) + +/* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE 0 +#define PCI_DVSEC_CXL_CAP 0xA +#define PCI_DVSEC_CXL_CACHE_CAPABLE _BITUL(0) +#define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) +#define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) +#define PCI_DVSEC_CXL_CTRL 0xC +#define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) +#define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1) +#define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define 
PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) #define CXL_DVSEC_RANGE_MAX 2 -/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 - -/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT 3 -#define CXL_DVSEC_PORT_CTL 0x0c -#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 - -/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) - -/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 - -/* CXL 3.2 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) +/* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ +#define PCI_DVSEC_CXL_FUNCTION_MAP 2 + +/* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ +#define PCI_DVSEC_CXL_PORT 3 +#define PCI_DVSEC_CXL_PORT_CTL 0x0c +#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 + +/* CXL r4.0, 8.1.6: GPF DVSEC for CXL Port */ +#define PCI_DVSEC_CXL_PORT_GPF 4 +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL 0x0C +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE __GENMASK(11, 8) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL 0xE +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE __GENMASK(11, 8) + +/* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE_GPF 5 + +/* CXL r4.0, 8.1.9: 
Register Locator DVSEC */ +#define PCI_DVSEC_CXL_REG_LOCATOR 8 +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC +#define PCI_DVSEC_CXL_REG_LOCATOR_BIR __GENMASK(2, 0) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID __GENMASK(15, 8) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW __GENMASK(31, 16) #endif /* LINUX_PCI_REGS_H */ From b9b8e4556e3da9fcc8bf5d9e3658010de6634a6c Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:25 -0600 Subject: [PATCH 082/143] cxl/pci: Remove unnecessary CXL Endpoint handling helper functions The CXL driver's cxl_handle_endpoint_cor_ras()/cxl_handle_endpoint_ras() are unnecessary helper functions used only for Endpoints. Remove these functions as they are not common for all CXL devices and do not provide value for EP handling. Rename __cxl_handle_ras to cxl_handle_ras() and __cxl_handle_cor_ras() to cxl_handle_cor_ras(). Signed-off-by: Terry Bowman Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Tested-by: Joshua Hahn Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-5-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit ca3d1a53e62093d17436abd447463da9c0f4e56b) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 077b386e0c8d6..3ec7407f0c5da 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,8 +632,8 @@ void read_cdat_data(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) +static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { void __iomem *addr; u32 status; @@ -649,11 +649,6 @@ static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, } } -static void cxl_handle_endpoint_cor_ras(struct 
cxl_dev_state *cxlds) -{ - return __cxl_handle_cor_ras(cxlds, cxlds->regs.ras); -} - /* CXL spec rev3.0 8.2.4.16.1 */ static void header_log_copy(void __iomem *ras_base, u32 *log) { @@ -675,8 +670,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log) * Log the state of the RAS status registers and prepare them to log the * next error status. Return 1 if reset needed. */ -static bool __cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) +static bool cxl_handle_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { u32 hl[CXL_HEADERLOG_SIZE_U32]; void __iomem *addr; @@ -709,11 +704,6 @@ static bool __cxl_handle_ras(struct cxl_dev_state *cxlds, return true; } -static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) -{ - return __cxl_handle_ras(cxlds, cxlds->regs.ras); -} - #ifdef CONFIG_PCIEAER_CXL static void cxl_dport_map_rch_aer(struct cxl_dport *dport) @@ -792,13 +782,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) { - return __cxl_handle_cor_ras(cxlds, dport->regs.ras); + return cxl_handle_cor_ras(cxlds, dport->regs.ras); } static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) { - return __cxl_handle_ras(cxlds, dport->regs.ras); + return cxl_handle_ras(cxlds, dport->regs.ras); } /* @@ -895,7 +885,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev) if (cxlds->rcd) cxl_handle_rdport_errors(cxlds); - cxl_handle_endpoint_cor_ras(cxlds); + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); @@ -924,7 +914,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, * chance the situation is recoverable dump the status of the RAS * capability registers and bounce the active state of the memdev. 
*/ - ue = cxl_handle_endpoint_ras(cxlds); + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); } From e0c700e71cf38b7a07f510b585b97beeaba67d5f Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:26 -0600 Subject: [PATCH 083/143] cxl/pci: Remove unnecessary CXL RCH handling helper functions cxl_handle_rdport_cor_ras() and cxl_handle_rdport_ras() are specific to Restricted CXL Host (RCH) handling. Improve readability and maintainability by replacing these and instead using the common cxl_handle_cor_ras() and cxl_handle_ras() functions. Signed-off-by: Terry Bowman Reviewed-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-6-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit eb78ef4d6f0e51243c1ee117f801dbc503e886ab) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 3ec7407f0c5da..51bb0f372e40d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -779,18 +779,6 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); -static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, - struct cxl_dport *dport) -{ - return cxl_handle_cor_ras(cxlds, dport->regs.ras); -} - -static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, - struct cxl_dport *dport) -{ - return cxl_handle_ras(cxlds, dport->regs.ras); -} - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. 
Clear the @@ -860,9 +848,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) pci_print_aer(pdev, severity, &aer_regs); if (severity == AER_CORRECTABLE) - cxl_handle_rdport_cor_ras(cxlds, dport); + cxl_handle_cor_ras(cxlds, dport->regs.ras); else - cxl_handle_rdport_ras(cxlds, dport); + cxl_handle_ras(cxlds, dport->regs.ras); } #else From df7634c649cee0352c72b17948ef16dd0033de25 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jan 2026 12:20:28 -0600 Subject: [PATCH 084/143] cxl/pci: Remove CXL VH handling in CONFIG_PCIEAER_CXL conditional blocks from core/pci.c Create new config CONFIG_CXL_RAS and put all CXL RAS items behind the config. The config will depend on CPER and PCIE AER to build. Move the related VH RAS code from core/pci.c to core/ras.c. Restricted CXL host (RCH) RAS functions will be moved in a future patch. Cc: Robert Richter Reviewed-by: Joshua Hahn Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Reviewed-by: Alison Schofield Co-developed-by: Terry Bowman Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-8-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 7ff8b1d60881c5f97b5ae426e14d2822917d3b69) Signed-off-by: Jiandi An --- drivers/cxl/Kconfig | 4 + drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 31 +++++++ drivers/cxl/core/pci.c | 189 +------------------------------------- drivers/cxl/core/ras.c | 176 +++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 8 -- drivers/cxl/cxlpci.h | 16 ++++ tools/testing/cxl/Kbuild | 2 +- 8 files changed, 233 insertions(+), 195 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index f1361ed6a0d48..6b8fb3284e7e4 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -233,4 +233,8 @@ config CXL_MCE def_bool y depends on X86_MCE && MEMORY_FAILURE +config CXL_RAS + def_bool y + depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + endif diff --git 
a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 5ad8fef210b5c..b2930cc54f8ba 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,9 +14,9 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o -cxl_core-y += ras.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o +cxl_core-$(CONFIG_CXL_RAS) += ras.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b7777..bc818de87cccc 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -144,8 +144,39 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +#ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +#else +static inline int cxl_ras_init(void) +{ + return 0; +} + +static inline void cxl_ras_exit(void) +{ +} + +static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + return false; +} +static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +#endif /* CONFIG_CXL_RAS */ + +/* Restricted CXL Host specific RAS functions */ +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); +#else +static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } +static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } +static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif /* CONFIG_CXL_RAS */ + int 
cxl_gpf_port_setup(struct cxl_dport *dport); struct cxl_hdm; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 51bb0f372e40d..e132fff809792 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,81 +632,8 @@ void read_cdat_data(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - void __iomem *addr; - u32 status; - - if (!ras_base) - return; - - addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); - } -} - -/* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(void __iomem *ras_base, u32 *log) -{ - void __iomem *addr; - u32 *log_addr; - int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - - addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; - log_addr = log; - - for (i = 0; i < log_u32_size; i++) { - *log_addr = readl(addr); - log_addr++; - addr += sizeof(u32); - } -} - -/* - * Log the state of the RAS status registers and prepare them to log the - * next error status. Return 1 if reset needed. 
- */ -static bool cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - u32 hl[CXL_HEADERLOG_SIZE_U32]; - void __iomem *addr; - u32 status; - u32 fe; - - if (!ras_base) - return false; - - addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) - return false; - - /* If multiple errors, log header points to first error from ctrl reg */ - if (hweight32(status) > 1) { - void __iomem *rcc_addr = - ras_base + CXL_RAS_CAP_CONTROL_OFFSET; - - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, - readl(rcc_addr))); - } else { - fe = status; - } - - header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); - writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); - - return true; -} - -#ifdef CONFIG_PCIEAER_CXL - -static void cxl_dport_map_rch_aer(struct cxl_dport *dport) +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport) { resource_size_t aer_phys; struct device *host; @@ -721,19 +648,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) } } -static void cxl_dport_map_ras(struct cxl_dport *dport) -{ - struct cxl_register_map *map = &dport->reg_map; - struct device *dev = dport->dport_dev; - - if (!map->component_map.ras.valid) - dev_dbg(dev, "RAS registers not found\n"); - else if (cxl_map_component_regs(map, &dport->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS))) - dev_dbg(dev, "Failed to map RAS capability.\n"); -} - -static void cxl_disable_rch_root_ints(struct cxl_dport *dport) +void cxl_disable_rch_root_ints(struct cxl_dport *dport) { void __iomem *aer_base = dport->regs.dport_aer; u32 aer_cmd_mask, aer_cmd; @@ -757,28 +672,6 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); } -/** - * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport - * @dport: the cxl_dport that needs to be initialized - * @host: host device for devm 
operations - */ -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - dport->reg_map.host = host; - cxl_dport_map_ras(dport); - - if (dport->rch) { - struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); - - if (!host_bridge->native_aer) - return; - - cxl_dport_map_rch_aer(dport); - cxl_disable_rch_root_ints(dport); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. Clear the @@ -827,7 +720,7 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, return false; } -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); struct aer_capability_regs aer_regs; @@ -852,82 +745,8 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) else cxl_handle_ras(cxlds, dport->regs.ras); } - -#else -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } #endif -void cxl_cor_error_detected(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct device *dev = &cxlds->cxlmd->dev; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); - -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - bool ue; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return 
PCI_ERS_RESULT_DISCONNECT; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); - } - - - switch (state) { - case pci_channel_io_normal: - if (ue) { - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - } - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(&pdev->dev, - "%s: frozen state error detected, disable CXL.mem\n", - dev_name(dev)); - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(&pdev->dev, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 2731ba3a07993..b933030b8e1e7 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "trace.h" static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, @@ -124,3 +125,178 @@ void cxl_ras_exit(void) cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); cancel_work_sync(&cxl_cper_prot_err_work); } + +static void cxl_dport_map_ras(struct cxl_dport *dport) +{ + struct cxl_register_map *map = &dport->reg_map; + struct device *dev = dport->dport_dev; + + if (!map->component_map.ras.valid) + dev_dbg(dev, "RAS registers not found\n"); + else if (cxl_map_component_regs(map, &dport->regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(dev, "Failed to map RAS capability.\n"); +} + +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS 
report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +{ + dport->reg_map.host = host; + cxl_dport_map_ras(dport); + + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); + + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + void __iomem *addr; + u32 status; + + if (!ras_base) + return; + + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } +} + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(void __iomem *ras_base, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. 
+ */ +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!ras_base) + return false; + + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + void __iomem *rcc_addr = + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); + } else { + fe = status; + } + + header_log_copy(ras_base, hl); + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. 
On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + } + + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c796c3db36e0b..2301dd42f4f2d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -805,14 +805,6 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t rcrb); -#ifdef CONFIG_PCIEAER_CXL -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); -#else -static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, - struct device *host) { } -#endif - struct cxl_decoder *to_cxl_decoder(struct device *dev); struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index cdb7cf3dbcb43..6f9c78886fd9a 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -76,7 +76,23 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); + +#ifdef CONFIG_CXL_RAS void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t 
cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); +#else +static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } + +static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + return PCI_ERS_RESULT_NONE; +} + +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } +#endif + #endif /* __CXL_PCI_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1f..b7ea66382f3b1 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -57,12 +57,12 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o -cxl_core-y += $(CXL_CORE_SRC)/ras.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From 8891ca7c6e4937945336887a8986ff4cfd91cc0a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 14 Jan 2026 12:20:34 -0600 Subject: [PATCH 085/143] PCI/AER: Replace PCIEAER_CXL symbol with CXL_RAS One of the primary reasons for the CXL driver to exist is to perform error handling. If both PCIEAER and CXL are enabled then light up CXL error handling as well. Now that all RAS handling is moved under the CXL_RAS symbol, drop the previous PCIEAER_CXL symbol. 
Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-14-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit d18f1b7beadf1af1cd334ff789ba5a07ce285bbc) Signed-off-by: Jiandi An --- drivers/cxl/Kconfig | 2 +- drivers/pci/pcie/Kconfig | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 6b8fb3284e7e4..5b5aa941ad2fb 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -235,6 +235,6 @@ config CXL_MCE config CXL_RAS def_bool y - depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + depends on ACPI_APEI_GHES && PCIEAER && CXL_BUS endif diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 17919b99fa66a..207c2deae35ff 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -49,15 +49,6 @@ config PCIEAER_INJECT gotten from: https://github.com/intel/aer-inject.git -config PCIEAER_CXL - bool "PCI Express CXL RAS support" - default y - depends on PCIEAER && CXL_PCI - help - Enables CXL error handling. - - If unsure, say Y. - # # PCI Express ECRC # From 50a3017a3638240657dd58d047d50ce3c6623024 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:29 -0600 Subject: [PATCH 086/143] cxl/pci: Move CXL driver's RCH error handling into core/ras_rch.c Restricted CXL Host (RCH) protocol error handling uses a procedure distinct from the CXL Virtual Hierarchy (VH) handling. This is because of the differences in the RCH and VH topologies. Improve the maintainability and add ability to enable/disable RCH handling. Move and combine the RCH handling code into a single block conditionally compiled with the CONFIG_CXL_RAS kernel config.
Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260114182055.46029-9-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 0ff60f2ec3e4043a442e805f80f8a2445113ec8f) Signed-off-by: Jiandi An --- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 11 +--- drivers/cxl/core/pci.c | 115 ----------------------------------- drivers/cxl/core/ras_rch.c | 121 +++++++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 1 + 5 files changed, 126 insertions(+), 123 deletions(-) create mode 100644 drivers/cxl/core/ras_rch.c diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index b2930cc54f8ba..b37f38d502d8c 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o cxl_core-$(CONFIG_CXL_RAS) += ras.o +cxl_core-$(CONFIG_CXL_RAS) += ras_rch.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index bc818de87cccc..724361195057e 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -149,6 +149,9 @@ int cxl_ras_init(void); void cxl_ras_exit(void); bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); #else static inline int cxl_ras_init(void) { @@ -164,14 +167,6 @@ static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras return false; } static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } -#endif /* CONFIG_CXL_RAS */ - -/* Restricted CXL Host specific RAS functions */ -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct 
cxl_dport *dport); -void cxl_disable_rch_root_ints(struct cxl_dport *dport); -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); -#else static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index e132fff809792..b838c59d7a3c0 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,121 +632,6 @@ void read_cdat_data(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport) -{ - resource_size_t aer_phys; - struct device *host; - u16 aer_cap; - - aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); - if (aer_cap) { - host = dport->reg_map.host; - aer_phys = aer_cap + dport->rcrb.base; - dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys, - sizeof(struct aer_capability_regs)); - } -} - -void cxl_disable_rch_root_ints(struct cxl_dport *dport) -{ - void __iomem *aer_base = dport->regs.dport_aer; - u32 aer_cmd_mask, aer_cmd; - - if (!aer_base) - return; - - /* - * Disable RCH root port command interrupts. - * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors - * - * This sequence may not be necessary. CXL spec states disabling - * the root cmd register's interrupts is required. But, PCI spec - * shows these are disabled by default on reset. - */ - aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | - PCI_ERR_ROOT_CMD_NONFATAL_EN | - PCI_ERR_ROOT_CMD_FATAL_EN); - aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); - aer_cmd &= ~aer_cmd_mask; - writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); -} - -/* - * Copy the AER capability registers using 32 bit read accesses. - * This is necessary because RCRB AER capability is MMIO mapped. Clear the - * status after copying. 
- * - * @aer_base: base address of AER capability block in RCRB - * @aer_regs: destination for copying AER capability - */ -static bool cxl_rch_get_aer_info(void __iomem *aer_base, - struct aer_capability_regs *aer_regs) -{ - int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); - u32 *aer_regs_buf = (u32 *)aer_regs; - int n; - - if (!aer_base) - return false; - - /* Use readl() to guarantee 32-bit accesses */ - for (n = 0; n < read_cnt; n++) - aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); - - writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); - writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); - - return true; -} - -/* Get AER severity. Return false if there is no error. */ -static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, - int *severity) -{ - if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { - if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) - *severity = AER_FATAL; - else - *severity = AER_NONFATAL; - return true; - } - - if (aer_regs->cor_status & ~aer_regs->cor_mask) { - *severity = AER_CORRECTABLE; - return true; - } - - return false; -} - -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) -{ - struct pci_dev *pdev = to_pci_dev(cxlds->dev); - struct aer_capability_regs aer_regs; - struct cxl_dport *dport; - int severity; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return; - - if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) - return; - - if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) - return; - - pci_print_aer(pdev, severity, &aer_regs); - - if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); - else - cxl_handle_ras(cxlds, dport->regs.ras); -} -#endif - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c new file mode 100644 index 0000000000000..ed58afd18ecc9 --- /dev/null +++ 
b/drivers/cxl/core/ras_rch.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include +#include +#include "cxl.h" +#include "core.h" +#include "cxlmem.h" + +void cxl_dport_map_rch_aer(struct cxl_dport *dport) +{ + resource_size_t aer_phys; + struct device *host; + u16 aer_cap; + + aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); + if (aer_cap) { + host = dport->reg_map.host; + aer_phys = aer_cap + dport->rcrb.base; + dport->regs.dport_aer = + devm_cxl_iomap_block(host, aer_phys, + sizeof(struct aer_capability_regs)); + } +} + +void cxl_disable_rch_root_ints(struct cxl_dport *dport) +{ + void __iomem *aer_base = dport->regs.dport_aer; + u32 aer_cmd_mask, aer_cmd; + + if (!aer_base) + return; + + /* + * Disable RCH root port command interrupts. + * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors + * + * This sequence may not be necessary. CXL spec states disabling + * the root cmd register's interrupts is required. But, PCI spec + * shows these are disabled by default on reset. + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. 
+ * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + int severity; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + if (severity == AER_CORRECTABLE) + cxl_handle_cor_ras(cxlds, dport->regs.ras); + else + cxl_handle_ras(cxlds, dport->regs.ras); +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index b7ea66382f3b1..6eceefefb0e04 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_MCE) += 
$(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From 9228e0c331f6a8074a83eaf2a81a5e09dcee125b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 14 Jan 2026 12:20:37 -0600 Subject: [PATCH 087/143] cxl/mem: Clarify @host for devm_cxl_add_nvdimm() The convention for devm_ helpers in the CXL driver is that the first argument is the @host for the operation (locked driver::probe() context). Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-17-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit f953b7d5e19a1310dd5d92b86bafc5957847b4d6) Signed-off-by: Jiandi An --- drivers/cxl/core/pmem.c | 13 +++++++------ drivers/cxl/cxl.h | 3 ++- drivers/cxl/mem.c | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index 8853415c106a9..e7b1e6fa0ea09 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -237,12 +237,13 @@ static void cxlmd_release_nvdimm(void *_cxlmd) /** * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm - * @parent_port: parent port for the (to be added) @cxlmd endpoint port - * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations + * @host: host device for devm operations + * @port: any port in the CXL topology to find the nvdimm-bridge device + * @cxlmd: parent of the to be created cxl_nvdimm device * * Return: 0 on success negative error code on failure. 
*/ -int devm_cxl_add_nvdimm(struct cxl_port *parent_port, +int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, struct cxl_memdev *cxlmd) { struct cxl_nvdimm_bridge *cxl_nvb; @@ -250,7 +251,7 @@ int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct device *dev; int rc; - cxl_nvb = cxl_find_nvdimm_bridge(parent_port); + cxl_nvb = cxl_find_nvdimm_bridge(port); if (!cxl_nvb) return -ENODEV; @@ -270,10 +271,10 @@ int devm_cxl_add_nvdimm(struct cxl_port *parent_port, if (rc) goto err; - dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); + dev_dbg(host, "register %s\n", dev_name(dev)); /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ - return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd); + return devm_add_action_or_reset(host, cxlmd_release_nvdimm, cxlmd); err: put_device(dev); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2301dd42f4f2d..e1d47062e1d3d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -889,7 +889,8 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, struct cxl_port *port); struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); -int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct cxl_memdev *cxlmd); +int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, + struct cxl_memdev *cxlmd); struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port); #ifdef CONFIG_CXL_REGION diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 333c366b69e76..0958bea915acb 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -115,7 +115,7 @@ static int cxl_mem_probe(struct device *dev) } if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) { - rc = devm_cxl_add_nvdimm(parent_port, cxlmd); + rc = devm_cxl_add_nvdimm(dev, parent_port, cxlmd); if (rc) { if (rc == -ENODEV) dev_info(dev, "PMEM disabled by platform\n"); From e8681540c92364df59f59839dcc4caf0497c6aea Mon Sep 17 00:00:00 2001 
From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:38 -0600 Subject: [PATCH 088/143] cxl: Update RAS handler interfaces to also support CXL Ports CXL PCIe Port Protocol Error handling support will be added to the CXL drivers in the future. In preparation, update the existing interfaces to support handling all CXL PCIe Port Protocol Errors. The driver's RAS support functions currently rely on a 'struct cxl_dev_state' type parameter, which is not available for CXL Port devices. However, since the same CXL RAS capability structure [1] is needed across most CXL components and devices, a common handling approach should be adopted. To accommodate this, update the cxl_handle_cor_ras() and cxl_handle_ras() functions to use a `struct device` instead of `struct cxl_dev_state`. No functional changes are introduced. [1] CXL 3.1 Spec, 8.2.4 CXL.cache and CXL.mem Registers Signed-off-by: Terry Bowman Reviewed-by: Alejandro Lucero Reviewed-by: Ira Weiny Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-18-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 9a8920ca8ebfb99604f639e7fbc681d0d04518a0) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 14 +++++--------- drivers/cxl/core/ras.c | 12 ++++++------ drivers/cxl/core/ras_rch.c | 4 ++-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 724361195057e..422531799af2f 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -147,8 +147,8 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, #ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); -bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); -void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base);
+bool cxl_handle_ras(struct device *dev, void __iomem *ras_base); +void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base); void cxl_dport_map_rch_aer(struct cxl_dport *dport); void cxl_disable_rch_root_ints(struct cxl_dport *dport); void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); @@ -157,16 +157,12 @@ static inline int cxl_ras_init(void) { return 0; } - -static inline void cxl_ras_exit(void) -{ -} - -static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +static inline void cxl_ras_exit(void) { } +static inline bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) { return false; } -static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +static inline void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { } static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index b933030b8e1e7..72908f3ced775 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -160,7 +160,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); -void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { void __iomem *addr; u32 status; @@ -172,7 +172,7 @@ void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) status = readl(addr); if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); } } @@ -197,7 +197,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log) * Log 
the state of the RAS status registers and prepare them to log the * next error status. Return 1 if reset needed. */ -bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) { u32 hl[CXL_HEADERLOG_SIZE_U32]; void __iomem *addr; @@ -224,7 +224,7 @@ bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) } header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true; @@ -246,7 +246,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev) if (cxlds->rcd) cxl_handle_rdport_errors(cxlds); - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); @@ -275,7 +275,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, * chance the situation is recoverable dump the status of the RAS * capability registers and bounce the active state of the memdev. 
*/ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); } diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c index ed58afd18ecc9..0a8b3b9b63884 100644 --- a/drivers/cxl/core/ras_rch.c +++ b/drivers/cxl/core/ras_rch.c @@ -115,7 +115,7 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) pci_print_aer(pdev, severity, &aer_regs); if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); + cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras); else - cxl_handle_ras(cxlds, dport->regs.ras); + cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras); } From 51906b0621ccea4d4175785736f95ca53c4097b9 Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Mon, 5 Jan 2026 12:38:33 -0800 Subject: [PATCH 089/143] cxl/pci: Remove outdated FIXME comment and BUILD_BUG_ON Remove the outdated FIXME comment about switching to struct_group() and the associated BUILD_BUG_ON check. This work was already completed in commit 301e68dd9b9b ("cxl/core: Replace unions with struct_group()") which converted struct cxl_regs to use struct_group_tagged(). The BUILD_BUG_ON was checking that anonymous union layout was preserved, but since struct_group() now handles this correctly, the compile-time check is no longer necessary. 
Signed-off-by: Samasth Norway Ananda Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260105203833.1604585-1-samasth.norway.ananda@oracle.com Signed-off-by: Dave Jiang (cherry picked from commit 4dd05f02f1d618da610e7d3bd479c47a96b4fc3f) Signed-off-by: Jiandi An --- drivers/cxl/pci.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index d03292e7b9b99..1cf2322208735 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -912,13 +912,6 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) unsigned int i; bool irq_avail; - /* - * Double check the anonymous union trickery in struct cxl_regs - * FIXME switch to struct_group() - */ - BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) != - offsetof(struct cxl_regs, device_regs.memdev)); - rc = pcim_enable_device(pdev); if (rc) return rc; From 89e9eef4367bbffc5bbce069e75c85bd966323b1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 9 Jan 2026 13:29:51 +0100 Subject: [PATCH 090/143] cxl/hdm: Fix newline character in dev_err() messages The newline character is not placed at the end of the string. This causes unintended line wraps, broken log level and unterminated log messages. Fix that for all messages. Note that the messages are changed to use colons now instead of parentheses, which is more common use. 
Fixes: 24b18197184a ("cxl/hdm: Extend DVSEC range register emulation for region enumeration") Fixes: 9c57cde0dcbd ("cxl/hdm: Enumerate allocated DPA") Signed-off-by: Robert Richter Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260109122952.639231-1-rrichter@amd.com Signed-off-by: Dave Jiang (cherry picked from commit e5b1887619403c2da25a5899cad3e1ab34e7717f) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index aff166798e353..35b34b8c50763 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -966,7 +966,7 @@ static int cxl_setup_hdm_decoder_from_dvsec( rc = devm_cxl_dpa_reserve(cxled, *dpa_base, len, 0); if (rc) { dev_err(&port->dev, - "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx: %d\n", port->id, cxld->id, *dpa_base, *dpa_base + len - 1, rc); return rc; } @@ -1118,7 +1118,7 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); if (rc) { dev_err(&port->dev, - "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx: %d\n", port->id, cxld->id, *dpa_base, *dpa_base + dpa_size + skip - 1, rc); return rc; From fa29f9690432bae4e9f571eb9be1ecff1a1d9190 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Jan 2026 23:40:42 +0800 Subject: [PATCH 091/143] cxl/acpi: Remove cxl_acpi_set_cache_size() cxl_acpi_set_cache_size() returns an error only when the size of the cache range does not match the CXL address range. Almost all of the implementation for setting the cache size is in cxl_acpi_set_cache_size(); cxl_setup_extended_linear_cache() does nothing except print a warning in the above error case, but cxl_acpi_set_cache_size() also prints a warning at the same time. 
So consolidate these two functions into one, keeping the function name cxl_setup_extended_linear_cache(). Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260109154042.331296-1-ming.li@zohomail.com Signed-off-by: Dave Jiang (cherry picked from commit 99698e70148fbce4410799570adac8456204fa37) Signed-off-by: Jiandi An --- drivers/cxl/acpi.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 77ac940e30138..e65dfae42bded 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -357,7 +357,7 @@ static int add_or_reset_cxl_resource(struct resource *parent, struct resource *r return rc; } -static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) +static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) { struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct range *hpa = &cxld->hpa_range; @@ -367,12 +367,14 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) struct resource res; int nid, rc; + /* Explicitly initialize cache size to 0 at the beginning */ + cxlrd->cache_size = 0; res = DEFINE_RES_MEM(start, size); nid = phys_to_target_node(start); rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size); if (rc) - return 0; + return; /* * The cache range is expected to be within the CFMWS. @@ -384,31 +386,10 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) dev_warn(&cxld->dev, "Extended Linear Cache size %pa != CXL size %pa. No Support!", &cache_size, &size); - return -ENXIO; + return; } cxlrd->cache_size = cache_size; - - return 0; -} - -static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) -{ - int rc; - - rc = cxl_acpi_set_cache_size(cxlrd); - if (rc) { - /* - * Failing to retrieve extended linear cache region resize does not - * prevent the region from functioning. 
Only causes cxl list showing - * incorrect region size. - */ - dev_warn(cxlrd->cxlsd.cxld.dev.parent, - "Extended linear cache retrieval failed rc:%d\n", rc); - - /* Ignoring return code */ - cxlrd->cache_size = 0; - } } DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *, From 78dccb8b6ed30fff42600cd7920d8be2181f67b9 Mon Sep 17 00:00:00 2001 From: "Cheatham, Benjamin" Date: Fri, 9 Jan 2026 07:57:38 -0600 Subject: [PATCH 092/143] cxl/core: Fix cxl_dport debugfs EINJ entries Protocol error injection is only valid for CXL 2.0+ root ports and CXL 1.1 memory-mapped downstream ports as per the ACPI v6.5 spec (Table 8-31). The core code currently creates an 'einj_inject' file in CXL debugfs for all CXL 1.1 downstream ports and all PCI CXL 2.0+ downstream ports. This results in debugfs EINJ files that won't work due to platform/spec restrictions. Fix by limiting 'einj_inject' file creation to only CXL 1.1 dports and CXL 2.0+ root ports. Update the comment above the check to more accurately represent the requirements expected by the EINJ module and ACPI spec. 
Fixes: 8039804cfa73 ("cxl/core: Add CXL EINJ debugfs files") Signed-off-by: Ben Cheatham Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/6e9fb657-8264-4028-92e2-5428e2695bf1@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 4ed7952b9e87cf731ebc8251874416e60eb15230) Signed-off-by: Jiandi An --- drivers/cxl/core/port.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fef3aa0c6680c..54f72452fb062 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -822,16 +822,18 @@ DEFINE_DEBUGFS_ATTRIBUTE(cxl_einj_inject_fops, NULL, cxl_einj_inject, static void cxl_debugfs_create_dport_dir(struct cxl_dport *dport) { + struct cxl_port *parent = parent_port_of(dport->port); struct dentry *dir; if (!einj_cxl_is_initialized()) return; /* - * dport_dev needs to be a PCIe port for CXL 2.0+ ports because - * EINJ expects a dport SBDF to be specified for 2.0 error injection. + * Protocol error injection is only available for CXL 2.0+ root ports + * and CXL 1.1 downstream ports */ - if (!dport->rch && !dev_is_pci(dport->dport_dev)) + if (!dport->rch && + !(dev_is_pci(dport->dport_dev) && parent && is_cxl_root(parent))) return; dir = cxl_debugfs_create_dir(dev_name(dport->dport_dev)); From 7ab30cf2502cd8ac35744bb3a3e2b3a06211e3d2 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 15 Jan 2026 20:58:36 -0800 Subject: [PATCH 093/143] cxl/region: Translate DPA->HPA in unaligned MOD3 regions The CXL driver implementation of DPA->HPA address translation depends on a region's starting address always being aligned to Host Bridge Interleave Ways * 256MB. The driver follows the decode methods defined in the CXL Spec[1] and expanded upon in the CXL Driver Writers Guide[2], which describe bit manipulations based on power-of-2 alignment to translate a DPA to an HPA. 
With the introduction of MOD3 interleave way support, platforms may create regions at starting addresses that are not power-of-2 aligned. This allows platforms to avoid gaps in the memory map, but addresses within those regions cannot be translated using the existing bit manipulation method. Introduce an unaligned translation method for DPA->HPA that reconstructs an HPA by restoring the address first at the port level and then at the host bridge level. [1] CXL Spec 4.0 8.2.4.20.13 Implementation Note Device Decoder Logic [2] CXL Type 3 Memory Software Guide 1.1 2.13.25 DPA to HPA Translation Suggested-by: Qing Huang Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Alison Schofield Link: https://patch.msgid.link/e7c53215bf69f2ff1ae7e58bcc49ca387b7b0299.1768538962.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit e639055f1f30311db91cafb36e408cc727c7d445) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 160 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 2ef7ac530f4d8..7d5c9ee6bfc2b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3112,13 +3112,146 @@ u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig) } EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); +static int decode_pos(int region_ways, int hb_ways, int pos, int *pos_port, + int *pos_hb) +{ + int devices_per_hb; + + /* + * Decode for 3-6-12 way interleaves as defined in the CXL + * Spec 4.0 9.13.1.1 Legal Interleaving Configurations. + * Region creation should prevent invalid combinations but + * sanity check here to avoid a silent bad decode. 
+ */ + switch (hb_ways) { + case 3: + if (region_ways != 3 && region_ways != 6 && region_ways != 12) + return -EINVAL; + break; + case 6: + if (region_ways != 6 && region_ways != 12) + return -EINVAL; + break; + case 12: + if (region_ways != 12) + return -EINVAL; + break; + default: + return -EINVAL; + } + /* + * Each host bridge contributes an equal number of endpoints + * that are laid out contiguously per host bridge. Modulo + * selects the port within a host bridge and division selects + * the host bridge position. + */ + devices_per_hb = region_ways / hb_ways; + *pos_port = pos % devices_per_hb; + *pos_hb = pos / devices_per_hb; + + return 0; +} + +/* + * restore_parent() reconstruct the address in parent + * + * This math, specifically the bitmask creation 'mask = gran - 1' relies + * on the CXL Spec requirement that interleave granularity is always a + * power of two. + * + * [mask] isolate the offset with the granularity + * [addr & ~mask] remove the offset leaving the aligned portion + * [* ways] distribute across all interleave ways + * [+ (pos * gran)] add the positional offset + * [+ (addr & mask)] restore the masked offset + */ +static u64 restore_parent(u64 addr, u64 pos, u64 gran, u64 ways) +{ + u64 mask = gran - 1; + + return ((addr & ~mask) * ways) + (pos * gran) + (addr & mask); +} + +/* + * unaligned_dpa_to_hpa() translates a DPA to HPA when the region resource + * start address is not aligned at Host Bridge Interleave Ways * 256MB. + * + * Unaligned start addresses only occur with MOD3 interleaves. All power- + * of-two interleaves are guaranteed aligned. 
+ */ +static u64 unaligned_dpa_to_hpa(struct cxl_decoder *cxld, + struct cxl_region_params *p, int pos, u64 dpa) +{ + int ways_port = p->interleave_ways / cxld->interleave_ways; + int gran_port = p->interleave_granularity; + int gran_hb = cxld->interleave_granularity; + int ways_hb = cxld->interleave_ways; + int pos_port, pos_hb, gran_shift; + u64 hpa_port = 0; + + /* Decode an endpoint 'pos' into port and host-bridge components */ + if (decode_pos(p->interleave_ways, ways_hb, pos, &pos_port, &pos_hb)) { + dev_dbg(&cxld->dev, "not supported for region ways:%d\n", + p->interleave_ways); + return ULLONG_MAX; + } + + /* Restore the port parent address if needed */ + if (gran_hb != gran_port) + hpa_port = restore_parent(dpa, pos_port, gran_port, ways_port); + else + hpa_port = dpa; + + /* + * Complete the HPA reconstruction by restoring the address as if + * each HB position is a candidate. Test against expected pos_hb + * to confirm match. + */ + gran_shift = ilog2(gran_hb); + for (int position = 0; position < ways_hb; position++) { + u64 shifted, hpa; + + hpa = restore_parent(hpa_port, position, gran_hb, ways_hb); + hpa += p->res->start; + + shifted = hpa >> gran_shift; + if (do_div(shifted, ways_hb) == pos_hb) + return hpa; + } + + dev_dbg(&cxld->dev, "fail dpa:%#llx region:%pr pos:%d\n", dpa, p->res, + pos); + dev_dbg(&cxld->dev, " port-w/g/p:%d/%d/%d hb-w/g/p:%d/%d/%d\n", + ways_port, gran_port, pos_port, ways_hb, gran_hb, pos_hb); + + return ULLONG_MAX; +} + +static bool region_is_unaligned_mod3(struct cxl_region *cxlr) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p = &cxlr->params; + int hbiw = cxld->interleave_ways; + u64 rem; + + if (is_power_of_2(hbiw)) + return false; + + div64_u64_rem(p->res->start, (u64)hbiw * SZ_256M, &rem); + + return (rem != 0); +} + u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { struct 
cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; u64 dpa_offset, hpa_offset, hpa; + bool unaligned = false; u16 eig = 0; u8 eiw = 0; int pos; @@ -3132,15 +3265,32 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, if (!cxled) return ULLONG_MAX; + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + + /* Unaligned calc for MOD3 interleaves not hbiw * 256MB aligned */ + unaligned = region_is_unaligned_mod3(cxlr); + if (unaligned) { + hpa = unaligned_dpa_to_hpa(cxld, p, cxled->pos, dpa_offset); + if (hpa == ULLONG_MAX) + return ULLONG_MAX; + + goto skip_aligned; + } + /* + * Aligned calc for all power-of-2 interleaves and for MOD3 + * interleaves that are aligned at hbiw * 256MB + */ pos = cxled->pos; ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); - dpa_offset = dpa - cxl_dpa_resource_start(cxled); hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); /* Apply the hpa_offset to the region base address */ - hpa = hpa_offset + p->res->start + p->cache_size; + hpa = hpa_offset + p->res->start; + +skip_aligned: + hpa += p->cache_size; /* Root decoder translation overrides typical modulo decode */ if (cxlrd->ops.hpa_to_spa) @@ -3151,9 +3301,9 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, "Addr trans fail: hpa 0x%llx not in region\n", hpa); return ULLONG_MAX; } - - /* Simple chunk check, by pos & gran, only applies to modulo decodes */ - if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) + /* Chunk check applies to aligned modulo decodes only */ + if (!unaligned && !cxlrd->ops.hpa_to_spa && + !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) return ULLONG_MAX; return hpa; From c210b5788d5b72adcf1ea5feae4e6f49de4ec748 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 15 Jan 2026 20:58:37 -0800 
Subject: [PATCH 094/143] cxl/region: Translate HPA to DPA and memdev in unaligned regions The CXL driver supports an expert user debugfs interface to inject and clear poison by a region offset. That feature requires translating a HPA (the region address) to a DPA and a memdev to perform the poison operation. Unaligned regions do not have an algebraically invertible mapping from HPA to DPA due to the region offset skew. The region base is not aligned to a full interleave. Add a helper to perform the unaligned translations that first calculates the DPA offset and then tests it against each candidate endpoint decoder. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Alison Schofield Link: https://patch.msgid.link/f338b7aff7e4574fcc525b1a0d4f09786bfb6489.1768538962.git.alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit b51792fd9168e581e51be98e22df5f79454e22de) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 7d5c9ee6bfc2b..8bacef7a4d11b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3314,6 +3314,48 @@ struct dpa_result { u64 dpa; }; +static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, + u64 offset, + struct dpa_result *result) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p = &cxlr->params; + u64 interleave_width, interleave_index; + u64 gran, gran_offset, dpa_offset; + u64 hpa = p->res->start + offset; + + /* + * Unaligned addresses are not algebraically invertible. Calculate + * a dpa_offset independent of the target device and then enumerate + * and test that dpa_offset against each candidate endpoint decoder. 
+ */ + gran = cxld->interleave_granularity; + interleave_width = gran * cxld->interleave_ways; + interleave_index = div64_u64(offset, interleave_width); + gran_offset = div64_u64_rem(offset, gran, NULL); + + dpa_offset = interleave_index * gran + gran_offset; + + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + int pos = cxled->pos; + u64 test_hpa; + + test_hpa = unaligned_dpa_to_hpa(cxld, p, pos, dpa_offset); + if (test_hpa == hpa) { + result->cxlmd = cxled_to_memdev(cxled); + result->dpa = + cxl_dpa_resource_start(cxled) + dpa_offset; + return 0; + } + } + dev_err(&cxlr->dev, + "failed to resolve HPA %#llx in unaligned MOD3 region\n", hpa); + + return -ENXIO; +} + static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct dpa_result *result) { @@ -3343,6 +3385,10 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, hpa_offset = offset; } + if (region_is_unaligned_mod3(cxlr)) + return unaligned_region_offset_to_dpa_result(cxlr, offset, + result); + pos = cxl_calculate_position(hpa_offset, eiw, eig); if (pos < 0 || pos >= p->nr_targets) { dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", From 84743e0d681d2c796964445f6ec18c1fce157b44 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 16 Jan 2026 20:47:30 -0800 Subject: [PATCH 095/143] cxl/region: Use do_div() for 64-bit modulo operation div64_u64_rem() was the wrong choice for doing a modulo operation and it was used incorrectly, causing a kernel oops by passing NULL as the remainder parameter. Replace it with the do_div() helper that does the intended math (gran_offset = offset % gran) and is architecture safe. This bug appeared during testing of unaligned address translations. The visibility to userspace would be limited to folks doing poison injection or clear by HPA on unaligned regions. 
Fixes: 78b50b598462 ("cxl/region: Translate HPA to DPA and memdev in unaligned regions") Signed-off-by: Alison Schofield Link: https://patch.msgid.link/20260117044732.567831-1-alison.schofield@intel.com Signed-off-by: Dave Jiang (cherry picked from commit 064c098790944fa44f6aa704eb55a5c3ed65a2fa) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 8bacef7a4d11b..dee25d90b3e49 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3324,6 +3324,7 @@ static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, u64 interleave_width, interleave_index; u64 gran, gran_offset, dpa_offset; u64 hpa = p->res->start + offset; + u64 tmp = offset; /* * Unaligned addresses are not algebraically invertible. Calculate @@ -3333,7 +3334,7 @@ static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, gran = cxld->interleave_granularity; interleave_width = gran * cxld->interleave_ways; interleave_index = div64_u64(offset, interleave_width); - gran_offset = div64_u64_rem(offset, gran, NULL); + gran_offset = do_div(tmp, gran); dpa_offset = interleave_index * gran + gran_offset; From 90497a70c95764d37378914e517db51420d72d01 Mon Sep 17 00:00:00 2001 From: Yuxiong Wang Date: Thu, 29 Jan 2026 14:45:52 +0800 Subject: [PATCH 096/143] cxl: Fix premature commit_end increment on decoder commit failure In cxl_decoder_commit(), commit_end is incremented before verifying whether the commit succeeded, and the CXL_DECODER_F_ENABLE bit in cxld->flags is only set after a successful commit. As a result, if the commit fails, commit_end has been incremented and cxld->reset() has no effect since the flag is not set, so commit_end remains incorrectly incremented. The inconsistency between commit_end and CXL_DECODER_F_ENABLE causes failure during subsequent either commit or reset operations. 
Fix this by incrementing commit_end only after confirming the commit succeeded. Also, remove the ineffective cxld->reset() call. According to CXL Spec r4.0 8.2.4.20.12 Committing Decoder Programming, since cxld_await_commit() has cleared the decoder commit bit on failure, no additional reset is required. [dj: Fixed commit log 80 char wrapping. ] [dj: Fix "Fixes" tag to correct hash length. ] [dj: Change spec to r4.0. ] Fixes: 176baefb2eb5 ("cxl/hdm: Commit decoder state to hardware") Signed-off-by: Yuxiong Wang Acked-by: Huang Ying Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260129064552.31180-1-yuxiong.wang@linux.alibaba.com Signed-off-by: Dave Jiang (cherry picked from commit 7b6f9d9b1ea05c9c22570126547c780e8c6c3f62) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 35b34b8c50763..061f364cc9a00 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -844,14 +844,13 @@ static int cxl_decoder_commit(struct cxl_decoder *cxld) scoped_guard(rwsem_read, &cxl_rwsem.dpa) setup_hw_decoder(cxld, hdm); - port->commit_end++; rc = cxld_await_commit(hdm, cxld->id); if (rc) { dev_dbg(&port->dev, "%s: error %d committing decoder\n", dev_name(&cxld->dev), rc); - cxld->reset(cxld); return rc; } + port->commit_end++; cxld->flags |= CXL_DECODER_F_ENABLE; return 0; From c46d21a21708aaa521f496b225649c1ee867ccc3 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:24 -0600 Subject: [PATCH 097/143] PCI: Introduce pcie_is_cxl() CXL is a protocol that runs on top of PCIe electricals. Its error model also runs on top of the PCIe AER error model by standardizing "internal" errors as "CXL" errors. Linux has historically ignored internal errors. 
CXL protocol error handling is then a task of enhancing the PCIe AER core to understand that PCIe ports (upstream and downstream) and endpoints may throw internal errors that represent standard CXL protocol errors. The proposed method to make that determination is to teach 'struct pci_dev' to cache when its link has trained the CXL.mem and/or CXL.cache protocols and then treat all internal errors as CXL errors. A design goal is to not burden the PCIe AER core with CXL knowledge beyond just enough to forward error notifications to the CXL RAS core. The forwarded notification looks up a 'struct cxl_port' or 'struct cxl_dport' companion device to the PCI device. Introduce set_pcie_cxl() with logic checking for CXL.mem or CXL.cache status in the CXL Flex Bus DVSEC status register. The CXL Flex Bus DVSEC presence is used because it is required for all the CXL PCIe devices.[1] [1] CXL 3.1 Spec, 8.1.1 PCIe Designated Vendor-Specific Extended Capability (DVSEC) ID Assignment, Table 8-2 Signed-off-by: Terry Bowman Reviewed-by: Ira Weiny Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-4-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 7c29ba02210c6e4570cdce53813a1ae68fb6d049) Signed-off-by: Jiandi An --- drivers/pci/probe.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ include/uapi/linux/pci_regs.h | 6 ++++++ 3 files changed, 43 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index eb084877bb043..06cb9081d4ac4 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1692,6 +1692,35 @@ static void set_pcie_thunderbolt(struct pci_dev *dev) dev->is_thunderbolt = 1; } +static void set_pcie_cxl(struct pci_dev *dev) +{ + struct pci_dev *bridge; + u16 dvsec, cap; + + if 
(!pci_is_pcie(dev)) + return; + + /* + * Update parent's CXL state because alternate protocol training + * may have changed + */ + bridge = pci_upstream_bridge(dev); + if (bridge) + set_pcie_cxl(bridge); + + dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_FLEXBUS_PORT); + if (!dvsec) + return; + + pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS, + &cap); + + dev->is_cxl = FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE, cap) || + FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM, cap); + +} + static void set_pcie_untrusted(struct pci_dev *dev) { struct pci_dev *parent = pci_upstream_bridge(dev); @@ -2022,6 +2051,8 @@ int pci_setup_device(struct pci_dev *dev) /* Need to have dev->cfg_size ready */ set_pcie_thunderbolt(dev); + set_pcie_cxl(dev); + set_pcie_untrusted(dev); if (pci_is_pcie(dev)) diff --git a/include/linux/pci.h b/include/linux/pci.h index 1bdfd152eb1f8..a03cdd8c96122 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -466,6 +466,7 @@ struct pci_dev { unsigned int is_pciehp:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ + unsigned int is_cxl:1; /* Compute Express Link (CXL) */ /* * Devices marked being untrusted are the ones that can potentially * execute DMA attacks and similar. 
They are typically connected @@ -773,6 +774,11 @@ static inline bool pci_is_display(struct pci_dev *pdev) return (pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY; } +static inline bool pcie_is_cxl(struct pci_dev *pci_dev) +{ + return pci_dev->is_cxl; +} + #define for_each_pci_bridge(dev, bus) \ list_for_each_entry(dev, &bus->devices, bus_list) \ if (!pci_is_bridge(dev)) {} else diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 22e22cea2c4f2..49848c6765270 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1281,6 +1281,12 @@ /* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ #define PCI_DVSEC_CXL_DEVICE_GPF 5 +/* CXL r4.0, 8.1.8: Flex Bus DVSEC */ +#define PCI_DVSEC_CXL_FLEXBUS_PORT 7 +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS 0xE +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE _BITUL(0) +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM _BITUL(2) + /* CXL r4.0, 8.1.9: Register Locator DVSEC */ #define PCI_DVSEC_CXL_REG_LOCATOR 8 #define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC From 55cdd3eb04e827b4aedeabc00ef364f82312119c Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:27 -0600 Subject: [PATCH 098/143] PCI: Replace cxl_error_is_native() with pcie_aer_is_native() The AER driver includes a CXL support function cxl_error_is_native(). This function adds no additional value from pcie_aer_is_native(). Simplify the codebase by removing cxl_error_is_native() and replace occurrences of cxl_error_is_native() with pcie_aer_is_native(). 
Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-7-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit bcfa289932a703dd189466ea5947212e8dddd399) Signed-off-by: Jiandi An --- drivers/pci/pcie/aer.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 3dba9c0c6ae11..9f4985fba50b9 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1140,13 +1140,6 @@ static bool is_cxl_mem_dev(struct pci_dev *dev) return true; } -static bool cxl_error_is_native(struct pci_dev *dev) -{ - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - - return (pcie_ports_native || host->native_aer); -} - static bool is_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) @@ -1160,7 +1153,7 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) struct aer_err_info *info = (struct aer_err_info *)data; const struct pci_error_handlers *err_handler; - if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) + if (!is_cxl_mem_dev(dev) || !pcie_aer_is_native(dev)) return 0; /* Protect dev->driver */ @@ -1201,7 +1194,7 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data) bool *handles_cxl = data; if (!*handles_cxl) - *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); + *handles_cxl = is_cxl_mem_dev(dev) && pcie_aer_is_native(dev); /* Non-zero terminates iteration */ return *handles_cxl; From bf964029c3486009a54ef9d50329360b73c73732 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:30 -0600 Subject: [PATCH 099/143] PCI/AER: Export pci_aer_unmask_internal_errors() Internal PCIe errors are not enabled by default during initialization because their behavior is too device-specific and there is no standard way to reason about 
them. However, for CXL an internal error is the standard mechanism for conveying CXL protocol errors. Export pci_aer_unmask_internal_errors() for CXL, but make it clear that it is only meant for CXL and that the status quo of leaving internal errors masked for PCIe in general remains. Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-10-terry.bowman@amd.com Co-developed-by: Dan Williams Signed-off-by: Dan Williams Acked-by: Bjorn Helgaas Signed-off-by: Dave Jiang (cherry picked from commit 6dc5fe212e74e6880a1da0093f627387d0a658bb) Signed-off-by: Jiandi An --- drivers/pci/pcie/aer.c | 11 ++++++++--- include/linux/aer.h | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 9f4985fba50b9..1ec0193ab1cc9 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1094,8 +1094,6 @@ static bool find_source_device(struct pci_dev *parent, return true; } -#ifdef CONFIG_PCIEAER_CXL - /** * pci_aer_unmask_internal_errors - unmask internal errors * @dev: pointer to the pci_dev data structure @@ -1106,7 +1104,7 @@ static bool find_source_device(struct pci_dev *parent, * Note: AER must be enabled and supported by the device which must be * checked in advance, e.g. with pcie_aer_is_native(). */ -static void pci_aer_unmask_internal_errors(struct pci_dev *dev) +void pci_aer_unmask_internal_errors(struct pci_dev *dev) { int aer = dev->aer_cap; u32 mask; @@ -1120,6 +1118,13 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev) pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); } +/* + * Internal errors are too device-specific to enable generally, however for CXL + * their behavior is standardized for conveying CXL protocol errors.
+ */ +EXPORT_SYMBOL_FOR_MODULES(pci_aer_unmask_internal_errors, "cxl_core"); + +#ifdef CONFIG_PCIEAER_CXL static bool is_cxl_mem_dev(struct pci_dev *dev) { /* diff --git a/include/linux/aer.h b/include/linux/aer.h index 02940be66324e..df0f5c382286f 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -56,12 +56,14 @@ struct aer_capability_regs { #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); +void pci_aer_unmask_internal_errors(struct pci_dev *dev); #else static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { return -EINVAL; } static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } +static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { } #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, From 7d34b727c1a1c9b9e0bf9e876fb09ce29fcfac69 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:31 -0600 Subject: [PATCH 100/143] PCI/AER: Update is_internal_error() to be non-static is_aer_internal_error() The AER driver includes significant logic for handling CXL protocol errors. The AER driver will be updated in the future to separate the AER and CXL logic. Rename the is_internal_error() function to is_aer_internal_error() as it gives a more precise indication of the purpose. Make is_aer_internal_error() non-static to allow for the 2 different CXL topology error model implementations (RCH and VH) to share this helper. 
Signed-off-by: Terry Bowman Link: https://patch.msgid.link/20260114182055.46029-11-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 51ce56b1a5d6f7263739d4766ae445463c74b689) Signed-off-by: Jiandi An --- drivers/pci/pcie/aer.c | 4 ++-- drivers/pci/pcie/portdrv.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 1ec0193ab1cc9..8bb894f9b152c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1145,7 +1145,7 @@ static bool is_cxl_mem_dev(struct pci_dev *dev) return true; } -static bool is_internal_error(struct aer_err_info *info) +bool is_aer_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) return info->status & PCI_ERR_COR_INTERNAL; @@ -1190,7 +1190,7 @@ static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) * device driver. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && - is_internal_error(info)) + is_aer_internal_error(info)) pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index bd29d1cc7b8bd..e7a0a2cffea93 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -123,4 +123,13 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); + +struct aer_err_info; + +#ifdef CONFIG_PCIEAER_CXL +bool is_aer_internal_error(struct aer_err_info *info); +#else +static inline bool is_aer_internal_error(struct aer_err_info *info) { return false; } +#endif /* CONFIG_PCIEAER_CXL */ + #endif /* _PORTDRV_H_ */ From 115b1e171dbe7847829c4ce3059ad7566d3e887e Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:32 -0600 Subject: [PATCH 101/143] PCI/AER: Move CXL RCH error handling to aer_cxl_rch.c The Restricted CXL 
Host (RCH) AER error handling logic currently resides in the AER driver file, aer.c. CXL specific changes are conditionally compiled using #ifdefs. Improve the AER driver maintainability by separating the RCH specific logic from the AER driver's core functionality and removing the ifdefs. Introduce drivers/pci/pcie/aer_cxl_rch.c to hold the RCH AER logic. Conditionally compile the file using the CONFIG_CXL_RAS Kconfig. Move the CXL logic into the new file but leave CXL helper function is_aer_internal_error() in aer.c for now as it will be moved in a future patch for CXL Virtual Hierarchy handling. To maintain compilation after the move other changes are required. Change cxl_rch_handle_error() and cxl_rch_enable_rcec() to be non-static so that they can be called from the AER driver; is_aer_internal_error() is already non-static and is now guarded by CONFIG_CXL_RAS. Update the new file with the SPDX and 2023 AMD copyright notations because the RCH bits were initially contributed in 2023 by AMD. See commit: commit 0a867568bb0d ("PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler") Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-12-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 59010029faf27c82d1e786dfd1fb83b09f478d1b) Signed-off-by: Jiandi An --- drivers/pci/pcie/Makefile | 1 + drivers/pci/pcie/aer.c | 99 +----------------------------- drivers/pci/pcie/aer_cxl_rch.c | 106 +++++++++++++++++++++++++++++++++ drivers/pci/pcie/portdrv.h | 9 ++- 4 files changed, 114 insertions(+), 101 deletions(-) create mode 100644 drivers/pci/pcie/aer_cxl_rch.c diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 173829aa02e60..b0b43a18c304b 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o obj-y += aspm.o obj-$(CONFIG_PCIEAER)
+= aer.o err.o tlp.o +obj-$(CONFIG_CXL_RAS) += aer_cxl_rch.o obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 8bb894f9b152c..95a829b6c0889 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1124,27 +1124,7 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev) */ EXPORT_SYMBOL_FOR_MODULES(pci_aer_unmask_internal_errors, "cxl_core"); -#ifdef CONFIG_PCIEAER_CXL -static bool is_cxl_mem_dev(struct pci_dev *dev) -{ - /* - * The capability, status, and control fields in Device 0, - * Function 0 DVSEC control the CXL functionality of the - * entire device (CXL 3.0, 8.1.3). - */ - if (dev->devfn != PCI_DEVFN(0, 0)) - return false; - - /* - * CXL Memory Devices must have the 502h class code set (CXL - * 3.0, 8.1.12.1). - */ - if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) - return false; - - return true; -} - +#ifdef CONFIG_CXL_RAS bool is_aer_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) @@ -1152,83 +1132,6 @@ bool is_aer_internal_error(struct aer_err_info *info) return info->status & PCI_ERR_UNC_INTN; } - -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) -{ - struct aer_err_info *info = (struct aer_err_info *)data; - const struct pci_error_handlers *err_handler; - - if (!is_cxl_mem_dev(dev) || !pcie_aer_is_native(dev)) - return 0; - - /* Protect dev->driver */ - device_lock(&dev->dev); - - err_handler = dev->driver ? 
dev->driver->err_handler : NULL; - if (!err_handler) - goto out; - - if (info->severity == AER_CORRECTABLE) { - if (err_handler->cor_error_detected) - err_handler->cor_error_detected(dev); - } else if (err_handler->error_detected) { - if (info->severity == AER_NONFATAL) - err_handler->error_detected(dev, pci_channel_io_normal); - else if (info->severity == AER_FATAL) - err_handler->error_detected(dev, pci_channel_io_frozen); - } -out: - device_unlock(&dev->dev); - return 0; -} - -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) -{ - /* - * Internal errors of an RCEC indicate an AER error in an - * RCH's downstream port. Check and handle them in the CXL.mem - * device driver. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && - is_aer_internal_error(info)) - pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); -} - -static int handles_cxl_error_iter(struct pci_dev *dev, void *data) -{ - bool *handles_cxl = data; - - if (!*handles_cxl) - *handles_cxl = is_cxl_mem_dev(dev) && pcie_aer_is_native(dev); - - /* Non-zero terminates iteration */ - return *handles_cxl; -} - -static bool handles_cxl_errors(struct pci_dev *rcec) -{ - bool handles_cxl = false; - - if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && - pcie_aer_is_native(rcec)) - pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); - - return handles_cxl; -} - -static void cxl_rch_enable_rcec(struct pci_dev *rcec) -{ - if (!handles_cxl_errors(rcec)) - return; - - pci_aer_unmask_internal_errors(rcec); - pci_info(rcec, "CXL: Internal errors unmasked"); -} - -#else -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } -static inline void cxl_rch_handle_error(struct pci_dev *dev, - struct aer_err_info *info) { } #endif /** diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c new file mode 100644 index 0000000000000..6b515edb12c15 --- /dev/null +++ b/drivers/pci/pcie/aer_cxl_rch.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* 
Copyright(c) 2023 AMD Corporation. All rights reserved. */ + +#include +#include +#include +#include "../pci.h" +#include "portdrv.h" + +static bool is_cxl_mem_dev(struct pci_dev *dev) +{ + /* + * The capability, status, and control fields in Device 0, + * Function 0 DVSEC control the CXL functionality of the + * entire device (CXL 3.0, 8.1.3). + */ + if (dev->devfn != PCI_DEVFN(0, 0)) + return false; + + /* + * CXL Memory Devices must have the 502h class code set (CXL + * 3.0, 8.1.12.1). + */ + if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) + return false; + + return true; +} + +static bool cxl_error_is_native(struct pci_dev *dev) +{ + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); + + return (pcie_ports_native || host->native_aer); +} + +static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) +{ + struct aer_err_info *info = (struct aer_err_info *)data; + const struct pci_error_handlers *err_handler; + + if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) + return 0; + + device_lock(&dev->dev); + + err_handler = dev->driver ? dev->driver->err_handler : NULL; + if (!err_handler) + goto out; + + if (info->severity == AER_CORRECTABLE) { + if (err_handler->cor_error_detected) + err_handler->cor_error_detected(dev); + } else if (err_handler->error_detected) { + if (info->severity == AER_NONFATAL) + err_handler->error_detected(dev, pci_channel_io_normal); + else if (info->severity == AER_FATAL) + err_handler->error_detected(dev, pci_channel_io_frozen); + } +out: + device_unlock(&dev->dev); + return 0; +} + +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) +{ + /* + * Internal errors of an RCEC indicate an AER error in an + * RCH's downstream port. Check and handle them in the CXL.mem + * device driver. 
+ */ + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && + is_aer_internal_error(info)) + pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); +} + +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) +{ + bool *handles_cxl = data; + + if (!*handles_cxl) + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); + + /* Non-zero terminates iteration */ + return *handles_cxl; +} + +static bool handles_cxl_errors(struct pci_dev *rcec) +{ + bool handles_cxl = false; + + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && + pcie_aer_is_native(rcec)) + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); + + return handles_cxl; +} + +void cxl_rch_enable_rcec(struct pci_dev *rcec) +{ + if (!handles_cxl_errors(rcec)) + return; + + pci_aer_unmask_internal_errors(rcec); + pci_info(rcec, "CXL: Internal errors unmasked"); +} diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index e7a0a2cffea93..cc58bf2f2c844 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -126,10 +126,13 @@ struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); struct aer_err_info; -#ifdef CONFIG_PCIEAER_CXL +#ifdef CONFIG_CXL_RAS bool is_aer_internal_error(struct aer_err_info *info); +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info); +void cxl_rch_enable_rcec(struct pci_dev *rcec); #else static inline bool is_aer_internal_error(struct aer_err_info *info) { return false; } -#endif /* CONFIG_PCIEAER_CXL */ - +static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } +static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { } +#endif /* CONFIG_CXL_RAS */ #endif /* _PORTDRV_H_ */ From fa00d6812c94691961f0b52143748f536649d340 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:33 -0600 Subject: [PATCH 102/143] PCI/AER: Use guard() in cxl_rch_handle_error_iter() cxl_rch_handle_error_iter() includes a call to device_lock() using a goto for 
multiple return paths. Improve readability and maintainability by using the guard() lock variant. Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-13-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit da71bd360ded15626dabd59dd1d6939de38cab39) Signed-off-by: Jiandi An --- drivers/pci/pcie/aer_cxl_rch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c index 6b515edb12c15..e471eefec9c40 100644 --- a/drivers/pci/pcie/aer_cxl_rch.c +++ b/drivers/pci/pcie/aer_cxl_rch.c @@ -42,11 +42,11 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) return 0; - device_lock(&dev->dev); + guard(device)(&dev->dev); err_handler = dev->driver ? dev->driver->err_handler : NULL; if (!err_handler) - goto out; + return 0; if (info->severity == AER_CORRECTABLE) { if (err_handler->cor_error_detected) @@ -57,8 +57,6 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) else if (info->severity == AER_FATAL) err_handler->error_detected(dev, pci_channel_io_frozen); } -out: - device_unlock(&dev->dev); return 0; } From 4c19ee278cd408d47446812388d75d905372a3d1 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:35 -0600 Subject: [PATCH 103/143] PCI/AER: Report CXL or PCIe bus type in AER trace logging The AER service driver and aer_event tracing currently log 'PCIe Bus Type' for all errors. Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL device errors. This requires that AER can identify and distinguish between PCIe errors and CXL errors. Introduce boolean 'is_cxl' to 'struct aer_err_info'. Add assignment in aer_get_device_error_info() and pci_print_aer(). 
Update the aer_event trace routine to accept a bus type string parameter. Signed-off-by: Terry Bowman Co-developed-by: Dan Williams Acked-by: Bjorn Helgaas Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-15-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit 83cba5b31e6b0aeb32f41b9c954fe97b60db2817) Signed-off-by: Jiandi An --- drivers/pci/pci.h | 8 +++++++- drivers/pci/pcie/aer.c | 20 +++++++++++++------- include/ras/ras_event.h | 12 ++++++++---- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ee7b515125826..7ed929cfd45dd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -621,7 +621,8 @@ struct aer_err_info { unsigned int multi_error_valid:1; unsigned int first_error:5; - unsigned int __pad2:2; + unsigned int __pad2:1; + unsigned int is_cxl:1; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ @@ -632,6 +633,11 @@ struct aer_err_info { int aer_get_device_error_info(struct aer_err_info *info, int i); void aer_print_error(struct aer_err_info *info, int i); +static inline const char *aer_err_bus(struct aer_err_info *info) +{ + return info->is_cxl ? 
"CXL" : "PCIe"; +} + int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, unsigned int tlp_len, bool flit, struct pcie_tlp_log *log); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 95a829b6c0889..5331a1c908375 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -844,6 +844,7 @@ void aer_print_error(struct aer_err_info *info, int i) struct pci_dev *dev; int layer, agent, id; const char *level = info->level; + const char *bus_type = aer_err_bus(info); if (WARN_ON_ONCE(i >= AER_MAX_MULTI_ERR_DEVICES)) return; @@ -853,22 +854,22 @@ void aer_print_error(struct aer_err_info *info, int i) pci_dev_aer_stats_incr(dev, info); trace_aer_event(pci_name(dev), (info->status & ~info->mask), - info->severity, info->tlp_header_valid, &info->tlp); + info->severity, info->tlp_header_valid, &info->tlp, bus_type); if (!info->ratelimit_print[i]) return; if (!info->status) { - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", - aer_error_severity_string[info->severity]); + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", + bus_type, aer_error_severity_string[info->severity]); goto out; } layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", - aer_error_severity_string[info->severity], + aer_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n", + bus_type, aer_error_severity_string[info->severity], aer_error_layer[layer], aer_agent_string[agent]); aer_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", @@ -902,6 +903,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer); void pci_print_aer(struct pci_dev *dev, int aer_severity, struct aer_capability_regs *aer) { + const char *bus_type; int layer, agent, tlp_header_valid = 0; u32 status, mask; struct aer_err_info info = { @@ -922,10 +924,13 @@ void 
pci_print_aer(struct pci_dev *dev, int aer_severity, info.status = status; info.mask = mask; + info.is_cxl = pcie_is_cxl(dev); + + bus_type = aer_err_bus(&info); pci_dev_aer_stats_incr(dev, &info); - trace_aer_event(pci_name(dev), (status & ~mask), - aer_severity, tlp_header_valid, &aer->header_log); + trace_aer_event(pci_name(dev), (status & ~mask), aer_severity, + tlp_header_valid, &aer->header_log, bus_type); if (!aer_ratelimit(dev, info.severity)) return; @@ -1280,6 +1285,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i) /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; + info->is_cxl = pcie_is_cxl(dev); /* The device might not support AER */ if (!aer) diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index fecfeb7c8be7f..3523cc8597612 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -301,9 +301,11 @@ TRACE_EVENT(aer_event, const u32 status, const u8 severity, const u8 tlp_header_valid, - struct pcie_tlp_log *tlp), + struct pcie_tlp_log *tlp, + const char *bus_type), - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), + + TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp, bus_type), TP_STRUCT__entry( __string( dev_name, dev_name ) @@ -311,10 +313,12 @@ TRACE_EVENT(aer_event, __field( u8, severity ) __field( u8, tlp_header_valid) __array( u32, tlp_header, PCIE_STD_MAX_TLP_HEADERLOG) + __string( bus_type, bus_type ) ), TP_fast_assign( __assign_str(dev_name); + __assign_str(bus_type); __entry->status = status; __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; @@ -326,8 +330,8 @@ TRACE_EVENT(aer_event, } ), - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", - __get_str(dev_name), + TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n", + __get_str(dev_name), __get_str(bus_type), __entry->severity == AER_CORRECTABLE ? "Corrected" : __entry->severity == AER_FATAL ? 
"Fatal" : "Uncorrected, non-fatal", From 6b3be6b7a150f84a77bfffed07bdeba4b5558436 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Mon, 19 Jan 2026 18:40:58 -0800 Subject: [PATCH 104/143] PCI/AER: Update struct aer_err_info with kernel-doc formatting Update the existing 'struct aer_err_info' definition to use kernel-doc formatting. Remove the inline comments to reduce noise and do not introduce functional changes. This will improve readability and maintainability. Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-16-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang (cherry picked from commit fda78d848178fb2b4eea74d96218c6c98fbe8562) Signed-off-by: Jiandi An --- drivers/pci/pci.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 7ed929cfd45dd..d4ae4eef89975 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -607,16 +607,35 @@ static inline bool pci_dev_binding_disallowed(struct pci_dev *dev) #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ +/** + * struct aer_err_info - AER Error Information + * @dev: Devices reporting error + * @ratelimit_print: Flag to log or not log the devices' error. 0=NotLog/1=Log + * @__pad1: Padding for alignment + * @error_dev_num: Number of devices reporting an error + * @level: printk level to use in logging + * @id: Value from register PCI_ERR_ROOT_ERR_SRC + * @severity: AER severity, 0-UNCOR Non-fatal, 1-UNCOR fatal, 2-COR + * @root_ratelimit_print: Flag to log or not log the root's error. 
0=NotLog/1=Log + * @multi_error_valid: If multiple errors are reported + * @first_error: First reported error + * @__pad2: Padding for alignment + * @is_cxl: Bus type error: 0-PCI Bus error, 1-CXL Bus error + * @tlp_header_valid: Indicates if TLP field contains error information + * @status: COR/UNCOR error status + * @mask: COR/UNCOR mask + * @tlp: Transaction packet information + */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; - const char *level; /* printk level */ + const char *level; unsigned int id:16; - unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ - unsigned int root_ratelimit_print:1; /* 0=skip, 1=print */ + unsigned int severity:2; + unsigned int root_ratelimit_print:1; unsigned int __pad1:4; unsigned int multi_error_valid:1; @@ -625,9 +644,9 @@ struct aer_err_info { unsigned int is_cxl:1; unsigned int tlp_header_valid:1; - unsigned int status; /* COR/UNCOR Error Status */ - unsigned int mask; /* COR/UNCOR Error Mask */ - struct pcie_tlp_log tlp; /* TLP Header */ + unsigned int status; + unsigned int mask; + struct pcie_tlp_log tlp; }; int aer_get_device_error_info(struct aer_err_info *info, int i); From dd9ca3b59da228165eb8383119a54603261fb837 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:55 +0000 Subject: [PATCH 105/143] NVIDIA: VR: SAUCE: cxl/region: Skip decoder reset on detach for autodiscovered regions __cxl_decoder_detach() currently resets decoder programming whenever a region is detached if cxl_config_state is beyond CXL_CONFIG_ACTIVE. For autodiscovered regions, this can incorrectly tear down decoder state that may be relied upon by other consumers or by subsequent ownership decisions. Skip cxl_region_decode_reset() during detach when CXL_REGION_F_AUTO is set. 
Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Tested-by: Tomasz Wolski (backported from https://lore.kernel.org/linux-cxl/20260210064501.157591-4-Smita.KoralahalliChannabasappa@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index dee25d90b3e49..50df9afac20ff 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2178,7 +2178,9 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled->part = -1; if (p->state > CXL_CONFIG_ACTIVE) { - cxl_region_decode_reset(cxlr, p->interleave_ways); + if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } From adc3833ba27c3f26e88f1427a92f6b138a452552 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:17 +0000 Subject: [PATCH 106/143] NVIDIA: VR: SAUCE: cxl: Add type2 device basic support Differentiate CXL memory expanders (type 3) from CXL device accelerators (type 2) with a new function for initializing cxl_dev_state and a macro for helping accel drivers to embed cxl_dev_state inside a private struct. Move structs to include/cxl because an accel driver whose private struct embeds cxl_dev_state needs to know the size of that struct. Use the same new initialization with the type3 pci driver.
Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/mbox.c | 12 +- drivers/cxl/core/memdev.c | 32 +++++ drivers/cxl/cxl.h | 97 +-------------- drivers/cxl/cxlmem.h | 86 +------------ drivers/cxl/pci.c | 14 +-- include/cxl/cxl.h | 226 +++++++++++++++++++++++++++++++++++ tools/testing/cxl/test/mem.c | 3 +- 7 files changed, 274 insertions(+), 196 deletions(-) create mode 100644 include/cxl/cxl.h diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index fa6dd0c94656f..bee84d0101d1a 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1514,23 +1514,21 @@ int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec) { struct cxl_memdev_state *mds; int rc; - mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); + mds = devm_cxl_dev_state_create(dev, CXL_DEVTYPE_CLASSMEM, serial, + dvsec, struct cxl_memdev_state, cxlds, + true); if (!mds) { dev_err(dev, "No memory available\n"); return ERR_PTR(-ENOMEM); } mutex_init(&mds->event.log_lock); - mds->cxlds.dev = dev; - mds->cxlds.reg_map.host = dev; - mds->cxlds.cxl_mbox.host = dev; - mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; - mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); if (rc == -EOPNOTSUPP) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index af3d0cc651387..22d156f25305d 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -656,6 +656,38 @@ static void detach_memdev(struct work_struct *work) static struct lock_class_key cxl_memdev_key; 
+static void cxl_dev_state_init(struct cxl_dev_state *cxlds, struct device *dev, + enum cxl_devtype type, u64 serial, u16 dvsec, + bool has_mbox) +{ + *cxlds = (struct cxl_dev_state) { + .dev = dev, + .type = type, + .serial = serial, + .cxl_dvsec = dvsec, + .reg_map.host = dev, + .reg_map.resource = CXL_RESOURCE_NONE, + }; + + if (has_mbox) + cxlds->cxl_mbox.host = dev; +} + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox) +{ + struct cxl_dev_state *cxlds = devm_kzalloc(dev, size, GFP_KERNEL); + + if (!cxlds) + return NULL; + + cxl_dev_state_init(cxlds, dev, type, serial, dvsec, has_mbox); + return cxlds; +} +EXPORT_SYMBOL_NS_GPL(_devm_cxl_dev_state_create, "CXL"); + static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, const struct file_operations *fops, const struct cxl_memdev_attach *attach) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index e1d47062e1d3d..3eaa353e430b8 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -12,6 +12,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -201,97 +202,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -/* - * Using struct_group() allows for per register-block-type helper routines, - * without requiring block-type agnostic code to include the prefix. 
- */ -struct cxl_regs { - /* - * Common set of CXL Component register block base pointers - * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure - * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure - */ - struct_group_tagged(cxl_component_regs, component, - void __iomem *hdm_decoder; - void __iomem *ras; - ); - /* - * Common set of CXL Device register block base pointers - * @status: CXL 2.0 8.2.8.3 Device Status Registers - * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers - * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers - */ - struct_group_tagged(cxl_device_regs, device_regs, - void __iomem *status, *mbox, *memdev; - ); - - struct_group_tagged(cxl_pmu_regs, pmu_regs, - void __iomem *pmu; - ); - - /* - * RCH downstream port specific RAS register - * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB - */ - struct_group_tagged(cxl_rch_regs, rch_regs, - void __iomem *dport_aer; - ); - - /* - * RCD upstream port specific PCIe cap register - * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB - */ - struct_group_tagged(cxl_rcd_regs, rcd_regs, - void __iomem *rcd_pcie_cap; - ); -}; - -struct cxl_reg_map { - bool valid; - int id; - unsigned long offset; - unsigned long size; -}; - -struct cxl_component_reg_map { - struct cxl_reg_map hdm_decoder; - struct cxl_reg_map ras; -}; - -struct cxl_device_reg_map { - struct cxl_reg_map status; - struct cxl_reg_map mbox; - struct cxl_reg_map memdev; -}; - -struct cxl_pmu_reg_map { - struct cxl_reg_map pmu; -}; - -/** - * struct cxl_register_map - DVSEC harvested register block mapping parameters - * @host: device for devm operations and logging - * @base: virtual base of the register-block-BAR + @block_offset - * @resource: physical resource base of the register block - * @max_size: maximum mapping size to perform register search - * @reg_type: see enum cxl_regloc_type - * @component_map: cxl_reg_map for component registers - * @device_map: cxl_reg_maps for device registers - * @pmu_map: cxl_reg_maps for CXL 
Performance Monitoring Units - */ -struct cxl_register_map { - struct device *host; - void __iomem *base; - resource_size_t resource; - resource_size_t max_size; - u8 reg_type; - union { - struct cxl_component_reg_map component_map; - struct cxl_device_reg_map device_map; - struct cxl_pmu_reg_map pmu_map; - }; -}; - void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, @@ -497,11 +407,6 @@ struct cxl_region_params { resource_size_t cache_size; }; -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index ef202b34e5ea4..281546de426e4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -113,8 +113,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -#define CXL_NR_PARTITIONS_MAX 2 - struct cxl_dpa_info { u64 size; struct cxl_dpa_part_info { @@ -373,87 +371,6 @@ struct cxl_security_state { struct kernfs_node *sanitize_node; }; -/* - * enum cxl_devtype - delineate type-2 from a generic type-3 device - * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or - * HDM-DB, no requirement that this device implements a - * mailbox, or other memory-device-standard manageability - * flows. - * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with - * HDM-H and class-mandatory memory device registers - */ -enum cxl_devtype { - CXL_DEVTYPE_DEVMEM, - CXL_DEVTYPE_CLASSMEM, -}; - -/** - * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range: range for DPA address - * @coord: QoS performance data (i.e. 
latency, bandwidth) - * @cdat_coord: raw QoS performance data from CDAT - * @qos_class: QoS Class cookies - */ -struct cxl_dpa_perf { - struct range dpa_range; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; - int qos_class; -}; - -/** - * struct cxl_dpa_partition - DPA partition descriptor - * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) - * @perf: performance attributes of the partition from CDAT - * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... - */ -struct cxl_dpa_partition { - struct resource res; - struct cxl_dpa_perf perf; - enum cxl_partition_mode mode; -}; - -/** - * struct cxl_dev_state - The driver device state - * - * cxl_dev_state represents the CXL driver/device state. It provides an - * interface to mailbox commands as well as some cached data about the device. - * Currently only memory devices are represented. - * - * @dev: The device associated with this CXL state - * @cxlmd: The device representing the CXL.mem capabilities of @dev - * @reg_map: component and ras register mapping parameters - * @regs: Parsed register blocks - * @cxl_dvsec: Offset to the PCIe device DVSEC - * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) - * @media_ready: Indicate whether the device media is usable - * @dpa_res: Overall DPA resource tree for the device - * @part: DPA partition array - * @nr_partitions: Number of DPA partitions - * @serial: PCIe Device Serial Number - * @type: Generic Memory Class device or Vendor Specific Memory device - * @cxl_mbox: CXL mailbox context - * @cxlfs: CXL features context - */ -struct cxl_dev_state { - struct device *dev; - struct cxl_memdev *cxlmd; - struct cxl_register_map reg_map; - struct cxl_regs regs; - int cxl_dvsec; - bool rcd; - bool media_ready; - struct resource dpa_res; - struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; - unsigned int nr_partitions; - u64 serial; - 
enum cxl_devtype type; - struct cxl_mailbox cxl_mbox; -#ifdef CONFIG_CXL_FEATURES - struct cxl_features_state *cxlfs; -#endif -}; - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { /* @@ -858,7 +775,8 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 1cf2322208735..24179cc702bfc 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -911,25 +911,25 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) int rc, pmu_count; unsigned int i; bool irq_avail; + u16 dvsec; rc = pcim_enable_device(pdev); if (rc) return rc; pci_set_master(pdev); - mds = cxl_memdev_state_create(&pdev->dev); + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + pci_warn(pdev, "Device DVSEC not present, skip CXL.mem init\n"); + + mds = cxl_memdev_state_create(&pdev->dev, pci_get_dsn(pdev), dvsec); if (IS_ERR(mds)) return PTR_ERR(mds); cxlds = &mds->cxlds; pci_set_drvdata(pdev, cxlds); cxlds->rcd = is_cxl_restricted(pdev); - cxlds->serial = pci_get_dsn(pdev); - cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); - if (!cxlds->cxl_dvsec) - dev_warn(&pdev->dev, - "Device DVSEC not present, skip CXL.mem init\n"); rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 0000000000000..13d448686189c --- 
/dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ +/* Copyright(c) 2025 Advanced Micro Devices, Inc. */ + +#ifndef __CXL_CXL_H__ +#define __CXL_CXL_H__ + +#include +#include +#include + +/** + * enum cxl_devtype - delineate type-2 from a generic type-3 device + * @CXL_DEVTYPE_DEVMEM: Vendor specific CXL Type-2 device implementing HDM-D or + * HDM-DB, no requirement that this device implements a + * mailbox, or other memory-device-standard manageability + * flows. + * @CXL_DEVTYPE_CLASSMEM: Common class definition of a CXL Type-3 device with + * HDM-H and class-mandatory memory device registers + */ +enum cxl_devtype { + CXL_DEVTYPE_DEVMEM, + CXL_DEVTYPE_CLASSMEM, +}; + +struct device; + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. + */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + void __iomem *ras; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); + + struct_group_tagged(cxl_pmu_regs, pmu_regs, + void __iomem *pmu; + ); + + /* + * RCH downstream port specific RAS register + * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *dport_aer; + ); + + /* + * RCD upstream port specific PCIe cap register + * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB + */ + struct_group_tagged(cxl_rcd_regs, 
rcd_regs, + void __iomem *rcd_pcie_cap; + ); +}; + +struct cxl_reg_map { + bool valid; + int id; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct cxl_reg_map hdm_decoder; + struct cxl_reg_map ras; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +struct cxl_pmu_reg_map { + struct cxl_reg_map pmu; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @host: device for devm operations and logging + * @base: virtual base of the register-block-BAR + @block_offset + * @resource: physical resource base of the register block + * @max_size: maximum mapping size to perform register search + * @reg_type: see enum cxl_regloc_type + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + */ +struct cxl_register_map { + struct device *host; + void __iomem *base; + resource_size_t resource; + resource_size_t max_size; + u8 reg_type; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + struct cxl_pmu_reg_map pmu_map; + }; +}; + +/** + * struct cxl_dpa_perf - DPA performance property entry + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. 
latency, bandwidth) + * @cdat_coord: raw QoS performance data from CDAT + * @qos_class: QoS Class cookies + */ +struct cxl_dpa_perf { + struct range dpa_range; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int qos_class; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... + */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. 
+ * + * @dev: The device associated with this CXL state + * @cxlmd: The device representing the CXL.mem capabilities of @dev + * @reg_map: component and ras register mapping parameters + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable + * @dpa_res: Overall DPA resource tree for the device + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions + * @serial: PCIe Device Serial Number + * @type: Generic Memory Class device or Vendor Specific Memory device + * @cxl_mbox: CXL mailbox context + * @cxlfs: CXL features context + */ +struct cxl_dev_state { + /* public for Type2 drivers */ + struct device *dev; + struct cxl_memdev *cxlmd; + + /* private for Type2 drivers */ + struct cxl_register_map reg_map; + struct cxl_regs regs; + int cxl_dvsec; + bool rcd; + bool media_ready; + struct resource dpa_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; + u64 serial; + enum cxl_devtype type; + struct cxl_mailbox cxl_mbox; +#ifdef CONFIG_CXL_FEATURES + struct cxl_features_state *cxlfs; +#endif +}; + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. + * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: drv_struct member as cxl_dev_state + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. 
+ */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) +#endif /* __CXL_CXL_H__ */ diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index cb87e8c0e63c0..79f42f4474d47 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1716,7 +1716,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - mds = cxl_memdev_state_create(dev); + mds = cxl_memdev_state_create(dev, pdev->id + 1, 0); if (IS_ERR(mds)) return PTR_ERR(mds); @@ -1732,7 +1732,6 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf; INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work); - cxlds->serial = pdev->id + 1; if (is_rcd(pdev)) cxlds->rcd = true; From 25de561ceb1c37778055869229a1227a7949945b Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:18 +0000 Subject: [PATCH 107/143] NVIDIA: VR: SAUCE: sfc: add cxl support Add CXL initialization based on new CXL API for accel drivers and make it dependent on kernel CXL configuration. 
Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Acked-by: Edward Cree Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/Kconfig | 9 +++++ drivers/net/ethernet/sfc/Makefile | 1 + drivers/net/ethernet/sfc/efx.c | 15 ++++++- drivers/net/ethernet/sfc/efx_cxl.c | 56 +++++++++++++++++++++++++++ drivers/net/ethernet/sfc/efx_cxl.h | 40 +++++++++++++++++++ drivers/net/ethernet/sfc/net_driver.h | 10 +++++ 6 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig index c4c43434f3143..979f2801e2a8e 100644 --- a/drivers/net/ethernet/sfc/Kconfig +++ b/drivers/net/ethernet/sfc/Kconfig @@ -66,6 +66,15 @@ config SFC_MCDI_LOGGING Driver-Interface) commands and responses, allowing debugging of driver/firmware interaction. The tracing is actually enabled by a sysfs file 'mcdi_logging' under the PCI device. +config SFC_CXL + bool "Solarflare SFC9100-family CXL support" + depends on SFC && CXL_BUS >= SFC + default SFC + help + This enables SFC CXL support if the kernel is configuring CXL for + using CTPIO with CXL.mem. The SFC device with CXL support and + with a CXL-aware firmware can be used for minimizing latencies + when sending through CTPIO. 
source "drivers/net/ethernet/sfc/falcon/Kconfig" source "drivers/net/ethernet/sfc/siena/Kconfig" diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile index d99039ec468d6..bb0f1891cde65 100644 --- a/drivers/net/ethernet/sfc/Makefile +++ b/drivers/net/ethernet/sfc/Makefile @@ -13,6 +13,7 @@ sfc-$(CONFIG_SFC_SRIOV) += sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \ mae.o tc.o tc_bindings.o tc_counters.o \ tc_encap_actions.o tc_conntrack.o +sfc-$(CONFIG_SFC_CXL) += efx_cxl.o obj-$(CONFIG_SFC) += sfc.o obj-$(CONFIG_SFC_FALCON) += falcon/ diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 112e55b98ed3b..537668278375b 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -34,6 +34,7 @@ #include "selftest.h" #include "sriov.h" #include "efx_devlink.h" +#include "efx_cxl.h" #include "mcdi_port_common.h" #include "mcdi_pcol.h" @@ -981,12 +982,15 @@ static void efx_pci_remove(struct pci_dev *pci_dev) efx_pci_remove_main(efx); efx_fini_io(efx); + + probe_data = container_of(efx, struct efx_probe_data, efx); + efx_cxl_exit(probe_data); + pci_dbg(efx->pci_dev, "shutdown successful\n"); efx_fini_devlink_and_unlock(efx); efx_fini_struct(efx); free_netdev(efx->net_dev); - probe_data = container_of(efx, struct efx_probe_data, efx); kfree(probe_data); }; @@ -1190,6 +1194,15 @@ static int efx_pci_probe(struct pci_dev *pci_dev, if (rc) goto fail2; + /* A successful cxl initialization implies a CXL region created to be + * used for PIO buffers. If there is no CXL support, or initialization + * fails, efx_cxl_pio_initialised will be false and legacy PIO buffers + * defined at specific PCI BAR regions will be used. + */ + rc = efx_cxl_init(probe_data); + if (rc) + pci_err(pci_dev, "CXL initialization failed with error %d\n", rc); + rc = efx_pci_probe_post_io(efx); if (rc) { /* On failure, retry once immediately. 
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c new file mode 100644 index 0000000000000..8e0481d8dced6 --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include + +#include "net_driver.h" +#include "efx_cxl.h" + +#define EFX_CTPIO_BUFFER_SIZE SZ_256M + +int efx_cxl_init(struct efx_probe_data *probe_data) +{ + struct efx_nic *efx = &probe_data->efx; + struct pci_dev *pci_dev = efx->pci_dev; + struct efx_cxl *cxl; + u16 dvsec; + + probe_data->cxl_pio_initialised = false; + + /* Is the device configured with and using CXL? */ + if (!pcie_is_cxl(pci_dev)) + return 0; + + dvsec = pci_find_dvsec_capability(pci_dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) { + pci_err(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability not found\n"); + return 0; + } + + pci_dbg(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability found\n"); + + /* Create a cxl_dev_state embedded in the cxl struct using cxl core api + * specifying no mbox available. + */ + cxl = devm_cxl_dev_state_create(&pci_dev->dev, CXL_DEVTYPE_DEVMEM, + pci_dev->dev.id, dvsec, struct efx_cxl, + cxlds, false); + + if (!cxl) + return -ENOMEM; + + probe_data->cxl = cxl; + + return 0; +} + +void efx_cxl_exit(struct efx_probe_data *probe_data) +{ +} + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h new file mode 100644 index 0000000000000..961639cef692e --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_CXL_H +#define EFX_CXL_H + +#ifdef CONFIG_SFC_CXL + +#include + +struct cxl_root_decoder; +struct cxl_port; +struct cxl_endpoint_decoder; +struct cxl_region; +struct efx_probe_data; + +struct efx_cxl { + struct cxl_dev_state cxlds; + struct cxl_memdev *cxlmd; + struct cxl_root_decoder *cxlrd; + struct cxl_port *endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *efx_region; + void __iomem *ctpio_cxl; +}; + +int efx_cxl_init(struct efx_probe_data *probe_data); +void efx_cxl_exit(struct efx_probe_data *probe_data); +#else +static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; } +static inline void efx_cxl_exit(struct efx_probe_data *probe_data) {} +#endif +#endif diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index b98c259f672db..3964b2c56609c 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1197,14 +1197,24 @@ struct efx_nic { atomic_t n_rx_noskb_drops; }; +#ifdef CONFIG_SFC_CXL +struct efx_cxl; +#endif + /** * struct efx_probe_data - State after hardware probe * @pci_dev: The PCI device * @efx: Efx NIC details + * @cxl: details of related cxl objects + * @cxl_pio_initialised: cxl initialization outcome. 
*/ struct efx_probe_data { struct pci_dev *pci_dev; struct efx_nic efx; +#ifdef CONFIG_SFC_CXL + struct efx_cxl *cxl; + bool cxl_pio_initialised; +#endif }; static inline struct efx_nic *efx_netdev_priv(struct net_device *dev) From 98d5c84f1b8c77896e4e39dff2c973e8015a2902 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:19 +0000 Subject: [PATCH 108/143] NVIDIA: VR: SAUCE: cxl: Move pci generic code Inside cxl/core/pci.c there are helpers for CXL PCIe initialization, while cxl/pci.c implements the functionality for Type3 device initialization. Move helper functions from cxl/pci.c to cxl/core/pci.c so that they can be exported and shared with CXL Type2 device initialization. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Reviewed-by: Fan Ni Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dan Williams (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 3 +- drivers/cxl/core/pci.c | 62 ++++++++++++++++++++++++++++++++++++ drivers/cxl/core/regs.c | 1 - drivers/cxl/cxl.h | 2 -- drivers/cxl/cxlpci.h | 13 ++++++++ drivers/cxl/pci.c | 70 ----------------------------------------- 6 files changed, 77 insertions(+), 74 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 422531799af2f..256799d393616 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -187,5 +187,6 @@ int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, size_t feat_data_size, u32 feat_flag, u16 offset, u16 *return_code); #endif - +resource_size_t cxl_rcd_component_reg_phys(struct device *dev, + struct cxl_dport *dport); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index b838c59d7a3c0..6b7e50858d56d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -696,6 +696,68 @@ bool
cxl_endpoint_decoder_reset_detected(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL"); +static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, + struct cxl_register_map *map, + struct cxl_dport *dport) +{ + resource_size_t component_reg_phys; + + *map = (struct cxl_register_map) { + .host = &pdev->dev, + .resource = CXL_RESOURCE_NONE, + }; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); + if (component_reg_phys == CXL_RESOURCE_NONE) + return -ENXIO; + + map->resource = component_reg_phys; + map->reg_type = CXL_REGLOC_RBI_COMPONENT; + map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + return 0; +} + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + int rc; + + rc = cxl_find_regblock(pdev, type, map); + + /* + * If the Register Locator DVSEC does not exist, check if it + * is an RCH and try to extract the Component Registers from + * an RCRB. 
+ */ + if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { + struct cxl_dport *dport; + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + rc = cxl_rcrb_get_comp_regs(pdev, map, dport); + if (rc) + return rc; + + rc = cxl_dport_map_rcd_linkcap(pdev, dport); + if (rc) + return rc; + + } else if (rc) { + return rc; + } + + return cxl_setup_regs(map); +} +EXPORT_SYMBOL_NS_GPL(cxl_pci_setup_regs, "CXL"); + int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) { int speed, bw; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index a010b32143422..93710cf4f0a69 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -641,4 +641,3 @@ resource_size_t cxl_rcd_component_reg_phys(struct device *dev, return CXL_RESOURCE_NONE; return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM); } -EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 3eaa353e430b8..5d111980d879d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -222,8 +222,6 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; -resource_size_t cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport); int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_RESOURCE_NONE ((resource_size_t) -1) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 6f9c78886fd9a..d879120b27800 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -74,6 +74,17 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & PCI_EXP_LNKSTA2_FLIT; } +/* + * Assume that the caller has already validated that @pdev has CXL + * capabilities, any RCiEP with CXL capabilities is treated as a + * Restricted CXL Device (RCD) and finds 
upstream port and endpoint + * registers in a Root Complex Register Block (RCRB). + */ +static inline bool is_cxl_restricted(struct pci_dev *pdev) +{ + return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; +} + struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); @@ -95,4 +106,6 @@ static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { } #endif +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 24179cc702bfc..668d44eb1bf5c 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -465,76 +465,6 @@ static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail) return 0; } -/* - * Assume that any RCIEP that emits the CXL memory expander class code - * is an RCD - */ -static bool is_cxl_restricted(struct pci_dev *pdev) -{ - return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; -} - -static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, - struct cxl_register_map *map, - struct cxl_dport *dport) -{ - resource_size_t component_reg_phys; - - *map = (struct cxl_register_map) { - .host = &pdev->dev, - .resource = CXL_RESOURCE_NONE, - }; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); - if (component_reg_phys == CXL_RESOURCE_NONE) - return -ENXIO; - - map->resource = component_reg_phys; - map->reg_type = CXL_REGLOC_RBI_COMPONENT; - map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; - - return 0; -} - -static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map) -{ - int rc; - - rc = cxl_find_regblock(pdev, type, map); - - /* - * If the Register Locator DVSEC does not exist, check if it - * is an RCH and try to extract the Component Registers from - * an RCRB. 
- */ - if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { - struct cxl_dport *dport; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - rc = cxl_rcrb_get_comp_regs(pdev, map, dport); - if (rc) - return rc; - - rc = cxl_dport_map_rcd_linkcap(pdev, dport); - if (rc) - return rc; - - } else if (rc) { - return rc; - } - - return cxl_setup_regs(map); -} - static int cxl_pci_ras_unmask(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); From 17c0f738323147a3fcac101b00a62cee8a9c9ad8 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:20 +0000 Subject: [PATCH 109/143] NVIDIA: VR: SAUCE: cxl/sfc: Map cxl component regs Export cxl core functions so that a Type2 driver can discover and map the device component registers. Use them in the sfc driver's CXL initialization. Signed-off-by: Alejandro Lucero Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 1 + drivers/cxl/core/port.c | 1 + drivers/cxl/core/regs.c | 1 + drivers/cxl/cxl.h | 7 ------ drivers/cxl/cxlpci.h | 12 ---------- drivers/cxl/pci.c | 1 + drivers/net/ethernet/sfc/efx_cxl.c | 35 ++++++++++++++++++++++++++++++ include/cxl/cxl.h | 19 ++++++++++++++++ include/cxl/pci.h | 21 ++++++++++++++++++ 9 files changed, 79 insertions(+), 19 deletions(-) create mode 100644 include/cxl/pci.h diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 6b7e50858d56d..ba2d393c540af 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 54f72452fb062..385588b8b30b5 100644 --- a/drivers/cxl/core/port.c +++
b/drivers/cxl/core/port.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 93710cf4f0a69..20c2d9fbcfe7d 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 5d111980d879d..944c5d1cccebe 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -39,10 +39,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; #define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) #define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) -#define CXL_CM_CAP_CAP_ID_RAS 0x2 -#define CXL_CM_CAP_CAP_ID_HDM 0x5 -#define CXL_CM_CAP_CAP_HDM_VERSION 1 - /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ #define CXL_HDM_DECODER_CAP_OFFSET 0x0 #define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) @@ -206,9 +202,6 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, struct cxl_device_reg_map *map); -int cxl_map_component_regs(const struct cxl_register_map *map, - struct cxl_component_regs *regs, - unsigned long map_mask); int cxl_map_device_regs(const struct cxl_register_map *map, struct cxl_device_regs *regs); int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index d879120b27800..93df1b1fa3268 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -13,16 +13,6 @@ */ #define CXL_PCI_DEFAULT_MAX_VECTORS 16 -/* Register Block Identifier (RBI) */ -enum cxl_regloc_type { - CXL_REGLOC_RBI_EMPTY = 0, - CXL_REGLOC_RBI_COMPONENT, - CXL_REGLOC_RBI_VIRT, - CXL_REGLOC_RBI_MEMDEV, - CXL_REGLOC_RBI_PMU, - CXL_REGLOC_RBI_TYPES -}; - /* * Table Access DOE, CDAT Read Entry Response * @@ -106,6 +96,4 @@ static inline void 
cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { } #endif -int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 668d44eb1bf5c..7b4699fb88709 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "cxlmem.h" #include "cxlpci.h" diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 8e0481d8dced6..34126bc4826c8 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -7,6 +7,8 @@ #include +#include +#include #include "net_driver.h" #include "efx_cxl.h" @@ -18,6 +20,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data) struct pci_dev *pci_dev = efx->pci_dev; struct efx_cxl *cxl; u16 dvsec; + int rc; probe_data->cxl_pio_initialised = false; @@ -44,6 +47,38 @@ int efx_cxl_init(struct efx_probe_data *probe_data) if (!cxl) return -ENOMEM; + rc = cxl_pci_setup_regs(pci_dev, CXL_REGLOC_RBI_COMPONENT, + &cxl->cxlds.reg_map); + if (rc) { + pci_err(pci_dev, "No component registers\n"); + return rc; + } + + if (!cxl->cxlds.reg_map.component_map.hdm_decoder.valid) { + pci_err(pci_dev, "Expected HDM component register not found\n"); + return -ENODEV; + } + + if (!cxl->cxlds.reg_map.component_map.ras.valid) { + pci_err(pci_dev, "Expected RAS component register not found\n"); + return -ENODEV; + } + + rc = cxl_map_component_regs(&cxl->cxlds.reg_map, + &cxl->cxlds.regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS)); + if (rc) { + pci_err(pci_dev, "Failed to map RAS capability.\n"); + return rc; + } + + /* + * Set media ready explicitly as there are neither mailbox for checking + * this state nor the CXL register involved, both not mandatory for + * type2. 
+ */ + cxl->cxlds.media_ready = true; + probe_data->cxl = cxl; return 0; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 13d448686189c..7f2e23bce1f78 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -70,6 +70,10 @@ struct cxl_regs { ); }; +#define CXL_CM_CAP_CAP_ID_RAS 0x2 +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + struct cxl_reg_map { bool valid; int id; @@ -223,4 +227,19 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ sizeof(drv_struct), mbox); \ }) + +/** + * cxl_map_component_regs - map cxl component registers + * + * @map: cxl register map to update with the mappings + * @regs: cxl component registers to work with + * @map_mask: cxl component regs to map + * + * Returns integer: success (0) or error (-ENOMEM) + * + * Made public for Type2 driver support. + */ +int cxl_map_component_regs(const struct cxl_register_map *map, + struct cxl_component_regs *regs, + unsigned long map_mask); #endif /* __CXL_CXL_H__ */ diff --git a/include/cxl/pci.h b/include/cxl/pci.h new file mode 100644 index 0000000000000..a172439f08c60 --- /dev/null +++ b/include/cxl/pci.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. All rights reserved. 
*/ + +#ifndef __CXL_CXL_PCI_H__ +#define __CXL_CXL_PCI_H__ + +/* Register Block Identifier (RBI) */ +enum cxl_regloc_type { + CXL_REGLOC_RBI_EMPTY = 0, + CXL_REGLOC_RBI_COMPONENT, + CXL_REGLOC_RBI_VIRT, + CXL_REGLOC_RBI_MEMDEV, + CXL_REGLOC_RBI_PMU, + CXL_REGLOC_RBI_TYPES +}; + +struct cxl_register_map; + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +#endif From ac5bfc99d6b8a2792c6870ed591dc69b258a9294 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:21 +0000 Subject: [PATCH 110/143] NVIDIA: VR: SAUCE: cxl/sfc: Initialize dpa without a mailbox Type3 relies on mailbox CXL_MBOX_OP_IDENTIFY command for initializing memdev state params which end up being used for DPA initialization. Allow a Type2 driver to initialize DPA simply by giving the size of its volatile hardware partition. Move related functions to memdev. Add sfc driver as the client. Signed-off-by: Alejandro Lucero Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 2 + drivers/cxl/core/mbox.c | 51 +---------------------- drivers/cxl/core/memdev.c | 66 ++++++++++++++++++++++++++++++ drivers/net/ethernet/sfc/efx_cxl.c | 5 +++ include/cxl/cxl.h | 1 + 5 files changed, 75 insertions(+), 50 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 256799d393616..e3c85ceda2485 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -89,6 +89,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, struct dentry *cxl_debugfs_create_dir(const char *dir); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode); +struct cxl_memdev_state; +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds); int cxl_dpa_alloc(struct 
cxl_endpoint_decoder *cxled, u64 size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index bee84d0101d1a..d57a0c2d39fb6 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1144,7 +1144,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL"); * * See CXL @8.2.9.5.2.1 Get Partition Info */ -static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; struct cxl_mbox_get_partition_info pi; @@ -1300,55 +1300,6 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) return -EBUSY; } -static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) -{ - int i = info->nr_partitions; - - if (size == 0) - return; - - info->part[i].range = (struct range) { - .start = start, - .end = start + size - 1, - }; - info->part[i].mode = mode; - info->nr_partitions++; -} - -int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) -{ - struct cxl_dev_state *cxlds = &mds->cxlds; - struct device *dev = cxlds->dev; - int rc; - - if (!cxlds->media_ready) { - info->size = 0; - return 0; - } - - info->size = mds->total_bytes; - - if (mds->partition_align_bytes == 0) { - add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); - add_part(info, mds->volatile_only_bytes, - mds->persistent_only_bytes, CXL_PARTMODE_PMEM); - return 0; - } - - rc = cxl_mem_get_partition_info(mds); - if (rc) { - dev_err(dev, "Failed to query partition information\n"); - return rc; - } - - add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); - add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, - CXL_PARTMODE_PMEM); - - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); - int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 
*count) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 22d156f25305d..2c5dd72f43ca4 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -582,6 +582,72 @@ bool is_cxl_memdev(const struct device *dev) } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) +{ + int i = info->nr_partitions; + + if (size == 0) + return; + + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; +} + +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) +{ + struct cxl_dev_state *cxlds = &mds->cxlds; + struct device *dev = cxlds->dev; + int rc; + + if (!cxlds->media_ready) { + info->size = 0; + return 0; + } + + info->size = mds->total_bytes; + + if (mds->partition_align_bytes == 0) { + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; + } + + rc = cxl_mem_get_partition_info(mds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); + +/** + * cxl_set_capacity: initialize dpa by a driver without a mailbox. 
+ * + * @cxlds: pointer to cxl_dev_state + * @capacity: device volatile memory size + */ +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity) +{ + struct cxl_dpa_info range_info = { + .size = capacity, + }; + + add_part(&range_info, 0, capacity, CXL_PARTMODE_RAM); + return cxl_dpa_setup(cxlds, &range_info); +} +EXPORT_SYMBOL_NS_GPL(cxl_set_capacity, "CXL"); + /** * set_exclusive_cxl_commands() - atomically disable user cxl commands * @mds: The device state to operate on diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 34126bc4826c8..0b10a2e6aceb6 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -79,6 +79,11 @@ int efx_cxl_init(struct efx_probe_data *probe_data) */ cxl->cxlds.media_ready = true; + if (cxl_set_capacity(&cxl->cxlds, EFX_CTPIO_BUFFER_SIZE)) { + pci_err(pci_dev, "dpa capacity setup failed\n"); + return -ENODEV; + } + probe_data->cxl = cxl; return 0; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 7f2e23bce1f78..fb2f8f2395d50 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -242,4 +242,5 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, int cxl_map_component_regs(const struct cxl_register_map *map, struct cxl_component_regs *regs, unsigned long map_mask); +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); #endif /* __CXL_CXL_H__ */ From 34564978cea4888a1627260f4eb564ad865e34d4 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:22 +0000 Subject: [PATCH 111/143] NVIDIA: VR: SAUCE: cxl: Prepare memdev creation for type2 Current cxl core is relying on a CXL_DEVTYPE_CLASSMEM type device when creating a memdev leading to problems when obtaining cxl_memdev_state references from a CXL_DEVTYPE_DEVMEM type. Modify check for obtaining cxl_memdev_state adding CXL_DEVTYPE_DEVMEM support. Make devm_cxl_add_memdev accessible from an accel driver. 
Signed-off-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Dan Williams (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/memdev.c | 15 +++++++++++-- drivers/cxl/cxlmem.h | 6 ------ drivers/cxl/mem.c | 45 +++++++++++++++++++++++++++++---------- include/cxl/cxl.h | 6 ++++++ 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 2c5dd72f43ca4..1b43763b8e20e 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "trace.h" #include "core.h" @@ -576,9 +577,16 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +static const struct device_type cxl_accel_memdev_type = { + .name = "cxl_accel_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, +}; + bool is_cxl_memdev(const struct device *dev) { - return dev->type == &cxl_memdev_type; + return (dev->type == &cxl_memdev_type || + dev->type == &cxl_accel_memdev_type); } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); @@ -781,7 +789,10 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->parent = cxlds->dev; dev->bus = &cxl_bus_type; dev->devt = MKDEV(cxl_mem_major, cxlmd->id); - dev->type = &cxl_memdev_type; + if (cxlds->type == CXL_DEVTYPE_DEVMEM) + dev->type = &cxl_accel_memdev_type; + else + dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); INIT_WORK(&cxlmd->detach_work, detach_memdev); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 281546de426e4..c98db6f18aa29 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,10 +34,6 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) -struct cxl_memdev_attach { - int 
(*probe)(struct cxl_memdev *cxlmd); -}; - /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -103,8 +99,6 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, - const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 0958bea915acb..39687baedd1a9 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -65,6 +65,26 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static void cxl_memdev_poison_enable(struct cxl_memdev_state *mds, + struct cxl_memdev *cxlmd, + struct dentry *dentry) +{ + /* + * Avoid poison debugfs for DEVMEM aka accelerators as they rely on + * cxl_memdev_state. 
+ */ + if (!mds) + return; + + if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) + debugfs_create_file("inject_poison", 0200, dentry, cxlmd, + &cxl_poison_inject_fops); + + if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) + debugfs_create_file("clear_poison", 0200, dentry, cxlmd, + &cxl_poison_clear_fops); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -92,12 +112,7 @@ static int cxl_mem_probe(struct device *dev) dentry = cxl_debugfs_create_dir(dev_name(dev)); debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); - if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) - debugfs_create_file("inject_poison", 0200, dentry, cxlmd, - &cxl_poison_inject_fops); - if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) - debugfs_create_file("clear_poison", 0200, dentry, cxlmd, - &cxl_poison_clear_fops); + cxl_memdev_poison_enable(mds, cxlmd, dentry); rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); if (rc) @@ -208,16 +223,24 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +static bool cxl_poison_attr_visible(struct kobject *kobj, struct attribute *a) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); - if (a == &dev_attr_trigger_poison_list.attr) - if (!test_bit(CXL_POISON_ENABLED_LIST, - mds->poison.enabled_cmds)) - return 0; + if (!mds || + !test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return false; + + return true; +} + +static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_trigger_poison_list.attr && + !cxl_poison_attr_visible(kobj, a)) + return 0; return a->mode; } diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 
fb2f8f2395d50..6f8d365067af7 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -153,6 +153,10 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_dev_state - The driver device state * @@ -243,4 +247,6 @@ int cxl_map_component_regs(const struct cxl_register_map *map, struct cxl_component_regs *regs, unsigned long map_mask); int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); #endif /* __CXL_CXL_H__ */ From e3e4d2429cc1de170daba4de15209b665512ee32 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:23 +0000 Subject: [PATCH 112/143] NVIDIA: VR: SAUCE: sfc: create type2 cxl memdev Use cxl API for creating a cxl memory device using the type2 cxl_dev_state struct. Signed-off-by: Alejandro Lucero Reviewed-by: Martin Habets Reviewed-by: Fan Ni Acked-by: Edward Cree Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/efx_cxl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 0b10a2e6aceb6..a77ef4783fcb8 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -84,6 +84,12 @@ int efx_cxl_init(struct efx_probe_data *probe_data) return -ENODEV; } + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) { + pci_err(pci_dev, "CXL accel memdev creation failed"); + return PTR_ERR(cxl->cxlmd); + } + probe_data->cxl = cxl; return 0; From 7222243b64891c1bb8f482fc877026ebe79bbd18 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:24 +0000 Subject: [PATCH 113/143] NVIDIA: VR: SAUCE: cxl/hdm: Add support 
for getting region from committed decoder A Type2 device configured by the BIOS can already have its HDM committed. Add a cxl_get_committed_decoder() function for checking this after memdev creation. A CXL region should have been created during memdev initialization, therefore a Type2 driver can ask for such a region for working with the HPA. If the HDM is not committed, a Type2 driver will create the region after obtaining proper HPA and DPA space. Signed-off-by: Alejandro Lucero (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 39 +++++++++++++++++++++++++++++++++++++++ include/cxl/cxl.h | 3 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 061f364cc9a00..b4bd3d91f1cfb 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -686,6 +686,45 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } +static int find_committed_endpoint_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == port->hdm_end; +} + +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct device *cxled_dev; + + if (!endpoint) + return NULL; + + guard(rwsem_read)(&cxl_rwsem.dpa); + cxled_dev = device_find_child(&endpoint->dev, NULL, + find_committed_endpoint_decoder); + + if (!cxled_dev) + return NULL; + + cxled = to_cxl_endpoint_decoder(cxled_dev); + *cxlr = cxled->cxld.region; + + put_device(cxled_dev); + return cxled; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_committed_decoder, "CXL"); + static 
void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) { u16 eig; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 6f8d365067af7..928276dba9526 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -249,4 +249,7 @@ int cxl_map_component_regs(const struct cxl_register_map *map, int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); +struct cxl_region; +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr); #endif /* __CXL_CXL_H__ */ From 8834e8542c5cc5453936864a68d664bbe633ae43 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:25 +0000 Subject: [PATCH 114/143] NVIDIA: VR: SAUCE: cxl: Add function for obtaining region range A CXL region struct contains the physical address to work with. Type2 drivers can create a CXL region but do not have access to the related struct as it is defined as private by the kernel CXL core. Add a function for getting the cxl region range to be used for mapping such memory range by a Type2 driver. 
Signed-off-by: Alejandro Lucero Reviewed-by: Zhi Wang Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 23 +++++++++++++++++++++++ include/cxl/cxl.h | 2 ++ 2 files changed, 25 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 50df9afac20ff..faac07bb80c83 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2623,6 +2623,29 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, return ERR_PTR(rc); } +/** + * cxl_get_region_range - obtain range linked to a CXL region + * + * @region: a pointer to struct cxl_region + * @range: a pointer to a struct range to be set + * + * Returns 0 or error. + */ +int cxl_get_region_range(struct cxl_region *region, struct range *range) +{ + if (WARN_ON_ONCE(!region)) + return -ENODEV; + + if (!region->params.res) + return -ENOSPC; + + range->start = region->params.res->start; + range->end = region->params.res->end; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_region_range, "CXL"); + static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf) { return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id)); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 928276dba9526..906065e0d2a69 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -252,4 +252,6 @@ struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, struct cxl_region; struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, struct cxl_region **cxlr); +struct range; +int cxl_get_region_range(struct cxl_region *region, struct range *range); #endif /* __CXL_CXL_H__ */ From 564c150f82c94a2b3c2a413011395a227a64818a Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:26 +0000 Subject: [PATCH 115/143] NVIDIA: VR: SAUCE: cxl: Export 
function for unwinding cxl by accelerators Add cxl_unregister_region() to the accelerator driver API for a clean exit. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 17 ++++++++++++----- include/cxl/cxl.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index faac07bb80c83..b145b69e70bb6 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2440,9 +2440,8 @@ static struct cxl_region *to_cxl_region(struct device *dev) return container_of(dev, struct cxl_region, dev); } -static void unregister_region(void *_cxlr) +void cxl_unregister_region(struct cxl_region *cxlr) { - struct cxl_region *cxlr = _cxlr; struct cxl_region_params *p = &cxlr->params; int i; @@ -2459,6 +2458,14 @@ static void unregister_region(void *_cxlr) cxl_region_iomem_release(cxlr); put_device(&cxlr->dev); } +EXPORT_SYMBOL_NS_GPL(cxl_unregister_region, "CXL"); + +static void __unregister_region(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + + return cxl_unregister_region(cxlr); +} static struct lock_class_key cxl_region_key; @@ -2610,7 +2617,7 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + rc = devm_add_action_or_reset(port->uport_dev, __unregister_region, cxlr); if (rc) return ERR_PTR(rc); @@ -2764,7 +2771,7 @@ static ssize_t delete_region_store(struct device *dev, if (IS_ERR(cxlr)) return PTR_ERR(cxlr); - devm_release_action(port->uport_dev, unregister_region, cxlr); + devm_release_action(port->uport_dev, __unregister_region, cxlr); put_device(&cxlr->dev); return len; @@ -3888,7 +3895,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = __construct_region(cxlr, cxlrd, 
cxled); if (rc) { - devm_release_action(port->uport_dev, unregister_region, cxlr); + devm_release_action(port->uport_dev, __unregister_region, cxlr); return ERR_PTR(rc); } diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 906065e0d2a69..92880c26b2d52 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -254,4 +254,5 @@ struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, struct cxl_region **cxlr); struct range; int cxl_get_region_range(struct cxl_region *region, struct range *range); +void cxl_unregister_region(struct cxl_region *cxlr); #endif /* __CXL_CXL_H__ */ From 5b1e9dc8ea4afbc302237f51f9701ce172c8f50f Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:27 +0000 Subject: [PATCH 116/143] NVIDIA: VR: SAUCE: sfc: obtain decoder and region if committed by firmware Check if device HDM is already committed during firmware/BIOS initialization. A CXL region should exist if so after memdev allocation/initialization. Get HPA from region and map it. 
Signed-off-by: Alejandro Lucero (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/efx_cxl.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index a77ef4783fcb8..3536eccf1b2aa 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -19,6 +19,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data) struct efx_nic *efx = &probe_data->efx; struct pci_dev *pci_dev = efx->pci_dev; struct efx_cxl *cxl; + struct range range; u16 dvsec; int rc; @@ -90,13 +91,38 @@ int efx_cxl_init(struct efx_probe_data *probe_data) return PTR_ERR(cxl->cxlmd); } - probe_data->cxl = cxl; + cxl->cxled = cxl_get_committed_decoder(cxl->cxlmd, &cxl->efx_region); + if (cxl->cxled) { + if (!cxl->efx_region) { + pci_err(pci_dev, "CXL found committed decoder without a region"); + return -ENODEV; + } + rc = cxl_get_region_range(cxl->efx_region, &range); + if (rc) { + pci_err(pci_dev, + "CXL getting regions params from a committed decoder failed"); + return rc; + } + + cxl->ctpio_cxl = ioremap(range.start, range.end - range.start + 1); + if (!cxl->ctpio_cxl) { + pci_err(pci_dev, "CXL ioremap region (%pra) failed", &range); + return -ENOMEM; + } + + probe_data->cxl = cxl; + } return 0; } void efx_cxl_exit(struct efx_probe_data *probe_data) { + if (!probe_data->cxl) + return; + + iounmap(probe_data->cxl->ctpio_cxl); + cxl_unregister_region(probe_data->cxl->efx_region); } MODULE_IMPORT_NS("CXL"); From 2cdf3a5021c1cc5747197c37705eec37c21e55fc Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:28 +0000 Subject: [PATCH 117/143] NVIDIA: VR: SAUCE: cxl: Define a driver interface for HPA free space enumeration CXL region creation involves allocating capacity from Device Physical Address (DPA) and assigning it to decode a 
given Host Physical Address (HPA). Before determining how much DPA to allocate the amount of available HPA must be determined. Also, not all HPA is created equal, some HPA targets RAM, some targets PMEM, some is prepared for device-memory flows like HDM-D and HDM-DB, and some is HDM-H (host-only). In order to support Type2 CXL devices, wrap all of those concerns into an API that retrieves a root decoder (platform CXL window) that fits the specified constraints and the capacity available for a new region. Add a complementary function for releasing the reference to such root decoder. Based on https://lore.kernel.org/linux-cxl/168592159290.1948938.13522227102445462976.stgit@dwillia2-xfh.jf.intel.com/ Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 164 ++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 3 + include/cxl/cxl.h | 6 ++ 3 files changed, 173 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b145b69e70bb6..400e5cadc3135 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -705,6 +705,170 @@ static int free_hpa(struct cxl_region *cxlr) return 0; } +struct cxlrd_max_context { + struct device * const *host_bridges; + int interleave_ways; + unsigned long flags; + resource_size_t max_hpa; + struct cxl_root_decoder *cxlrd; +}; + +static int find_max_hpa(struct device *dev, void *data) +{ + struct cxlrd_max_context *ctx = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_root_decoder *cxlrd; + struct resource *res, *prev; + struct cxl_decoder *cxld; + resource_size_t free = 0; + resource_size_t max; + int found = 0; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + cxlsd = &cxlrd->cxlsd; + cxld = &cxlsd->cxld; + + if ((cxld->flags & ctx->flags) != ctx->flags) { + dev_dbg(dev, "flags not 
matching: %08lx vs %08lx\n", + cxld->flags, ctx->flags); + return 0; + } + + for (int i = 0; i < ctx->interleave_ways; i++) { + for (int j = 0; j < ctx->interleave_ways; j++) { + if (ctx->host_bridges[i] == cxlsd->target[j]->dport_dev) { + found++; + break; + } + } + } + + if (found != ctx->interleave_ways) { + dev_dbg(dev, + "Not enough host bridges. Found %d for %d interleave ways requested\n", + found, ctx->interleave_ways); + return 0; + } + + /* + * Walk the root decoder resource range relying on cxl_rwsem.region to + * preclude sibling arrival/departure and find the largest free space + * gap. + */ + lockdep_assert_held_read(&cxl_rwsem.region); + res = cxlrd->res->child; + + /* With no resource child the whole parent resource is available */ + if (!res) + max = resource_size(cxlrd->res); + else + max = 0; + + for (prev = NULL; res; prev = res, res = res->sibling) { + if (!prev && res->start == cxlrd->res->start && + res->end == cxlrd->res->end) { + max = resource_size(cxlrd->res); + break; + } + /* + * Sanity check for preventing arithmetic problems below as a + * resource with size 0 could imply using the end field below + * when set to unsigned zero - 1 or all f in hex. 
+ */ + if (prev && !resource_size(prev)) + continue; + + if (!prev && res->start > cxlrd->res->start) { + free = res->start - cxlrd->res->start; + max = max(free, max); + } + if (prev && res->start > prev->end + 1) { + free = res->start - prev->end + 1; + max = max(free, max); + } + } + + if (prev && prev->end + 1 < cxlrd->res->end + 1) { + free = cxlrd->res->end + 1 - prev->end + 1; + max = max(free, max); + } + + dev_dbg(cxlrd_dev(cxlrd), "found %pa bytes of free space\n", &max); + if (max > ctx->max_hpa) { + if (ctx->cxlrd) + put_device(cxlrd_dev(ctx->cxlrd)); + get_device(cxlrd_dev(cxlrd)); + ctx->cxlrd = cxlrd; + ctx->max_hpa = max; + } + return 0; +} + +/** + * cxl_get_hpa_freespace - find a root decoder with free capacity per constraints + * @cxlmd: the mem device requiring the HPA + * @interleave_ways: number of entries in @host_bridges + * @flags: CXL_DECODER_F flags for selecting RAM vs PMEM, and Type2 device + * @max_avail_contig: output parameter of max contiguous bytes available in the + * returned decoder + * + * Returns a pointer to a struct cxl_root_decoder + * + * The return tuple of a 'struct cxl_root_decoder' and 'bytes available given + * in (@max_avail_contig))' is a point in time snapshot. If by the time the + * caller goes to use this decoder and its capacity is reduced then caller needs + * to loop and retry. + * + * The returned root decoder has an elevated reference count that needs to be + * put with cxl_put_root_decoder(cxlrd). 
+ */ +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max_avail_contig) +{ + struct cxlrd_max_context ctx = { + .flags = flags, + .interleave_ways = interleave_ways, + }; + struct cxl_port *root_port; + struct cxl_port *endpoint; + + endpoint = cxlmd->endpoint; + if (!endpoint) { + dev_dbg(&cxlmd->dev, "endpoint not linked to memdev\n"); + return ERR_PTR(-ENXIO); + } + + ctx.host_bridges = &endpoint->host_bridge; + + struct cxl_root *root __free(put_cxl_root) = find_cxl_root(endpoint); + if (!root) { + dev_dbg(&endpoint->dev, "endpoint is not related to a root port\n"); + return ERR_PTR(-ENXIO); + } + + root_port = &root->port; + scoped_guard(rwsem_read, &cxl_rwsem.region) + device_for_each_child(&root_port->dev, &ctx, find_max_hpa); + + if (!ctx.cxlrd) + return ERR_PTR(-ENOMEM); + + *max_avail_contig = ctx.max_hpa; + return ctx.cxlrd; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hpa_freespace, "CXL"); + +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd) +{ + put_device(cxlrd_dev(cxlrd)); +} +EXPORT_SYMBOL_NS_GPL(cxl_put_root_decoder, "CXL"); + static ssize_t size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 944c5d1cccebe..c7d9b2c2908f8 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -706,6 +706,9 @@ struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev); bool is_root_decoder(struct device *dev); + +#define cxlrd_dev(cxlrd) (&(cxlrd)->cxlsd.cxld.dev) + bool is_switch_decoder(struct device *dev); bool is_endpoint_decoder(struct device *dev); struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 92880c26b2d52..834dc7e789347 100644 --- 
a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -255,4 +255,10 @@ struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, struct range; int cxl_get_region_range(struct cxl_region *region, struct range *range); void cxl_unregister_region(struct cxl_region *cxlr); +struct cxl_port; +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max); +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd); #endif /* __CXL_CXL_H__ */ From a4b6f622bf3ac889d558add2d4350104d82222d0 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:29 +0000 Subject: [PATCH 118/143] NVIDIA: VR: SAUCE: sfc: get root decoder Use cxl api for getting HPA (Host Physical Address) to use from a CXL root decoder. Signed-off-by: Alejandro Lucero Reviewed-by: Martin Habets Acked-by: Edward Cree Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/cxl.h | 15 --------------- drivers/net/ethernet/sfc/Kconfig | 1 + drivers/net/ethernet/sfc/efx_cxl.c | 26 +++++++++++++++++++++++--- drivers/net/ethernet/sfc/efx_cxl.h | 1 + include/cxl/cxl.h | 15 +++++++++++++++ 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c7d9b2c2908f8..d1b010e5e1d07 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -220,21 +220,6 @@ int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_RESOURCE_NONE ((resource_size_t) -1) #define CXL_TARGET_STRLEN 20 -/* - * cxl_decoder flags that define the type of memory / devices this - * decoder supports as well as configuration lock status See "CXL 2.0 - * 8.2.5.12.7 CXL HDM Decoder 0 Control Register" for details. 
- * Additionally indicate whether decoder settings were autodetected, - * user customized. - */ -#define CXL_DECODER_F_RAM BIT(0) -#define CXL_DECODER_F_PMEM BIT(1) -#define CXL_DECODER_F_TYPE2 BIT(2) -#define CXL_DECODER_F_TYPE3 BIT(3) -#define CXL_DECODER_F_LOCK BIT(4) -#define CXL_DECODER_F_ENABLE BIT(5) -#define CXL_DECODER_F_MASK GENMASK(5, 0) - enum cxl_decoder_type { CXL_DECODER_DEVMEM = 2, CXL_DECODER_HOSTONLYMEM = 3, diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig index 979f2801e2a8e..e959d9b4f4cef 100644 --- a/drivers/net/ethernet/sfc/Kconfig +++ b/drivers/net/ethernet/sfc/Kconfig @@ -69,6 +69,7 @@ config SFC_MCDI_LOGGING config SFC_CXL bool "Solarflare SFC9100-family CXL support" depends on SFC && CXL_BUS >= SFC + depends on CXL_REGION default SFC help This enables SFC CXL support if the kernel is configuring CXL for diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 3536eccf1b2aa..1a4c1097c3152 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -18,6 +18,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data) { struct efx_nic *efx = &probe_data->efx; struct pci_dev *pci_dev = efx->pci_dev; + resource_size_t max_size; struct efx_cxl *cxl; struct range range; u16 dvsec; @@ -110,9 +111,24 @@ int efx_cxl_init(struct efx_probe_data *probe_data) return -ENOMEM; } - probe_data->cxl = cxl; + cxl->hdm_was_committed = true; + } else { + cxl->cxlrd = cxl_get_hpa_freespace(cxl->cxlmd, 1, CXL_DECODER_F_RAM | + CXL_DECODER_F_TYPE2, &max_size); + if (IS_ERR(cxl->cxlrd)) { + dev_err(&pci_dev->dev, "cxl_get_hpa_freespace failed\n"); + return PTR_ERR(cxl->cxlrd); + } + + if (max_size < EFX_CTPIO_BUFFER_SIZE) { + dev_err(&pci_dev->dev, "%s: not enough free HPA space %pap < %u\n", + __func__, &max_size, EFX_CTPIO_BUFFER_SIZE); + cxl_put_root_decoder(cxl->cxlrd); + return -ENOSPC; + } } + probe_data->cxl = cxl; return 0; } @@ -121,8 +137,12 @@ void 
efx_cxl_exit(struct efx_probe_data *probe_data) if (!probe_data->cxl) return; - iounmap(probe_data->cxl->ctpio_cxl); - cxl_unregister_region(probe_data->cxl->efx_region); + if (probe_data->cxl->hdm_was_committed) { + iounmap(probe_data->cxl->ctpio_cxl); + cxl_unregister_region(probe_data->cxl->efx_region); + } else { + cxl_put_root_decoder(probe_data->cxl->cxlrd); + } } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h index 961639cef692e..9a92e386695bb 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.h +++ b/drivers/net/ethernet/sfc/efx_cxl.h @@ -27,6 +27,7 @@ struct efx_cxl { struct cxl_root_decoder *cxlrd; struct cxl_port *endpoint; struct cxl_endpoint_decoder *cxled; + bool hdm_was_committed; struct cxl_region *efx_region; void __iomem *ctpio_cxl; }; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 834dc7e789347..783ad570a6ebe 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -153,6 +153,21 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +/* + * cxl_decoder flags that define the type of memory / devices this + * decoder supports as well as configuration lock status See "CXL 2.0 + * 8.2.5.12.7 CXL HDM Decoder 0 Control Register" for details. + * Additionally indicate whether decoder settings were autodetected, + * user customized. 
+ */ +#define CXL_DECODER_F_RAM BIT(0) +#define CXL_DECODER_F_PMEM BIT(1) +#define CXL_DECODER_F_TYPE2 BIT(2) +#define CXL_DECODER_F_TYPE3 BIT(3) +#define CXL_DECODER_F_LOCK BIT(4) +#define CXL_DECODER_F_ENABLE BIT(5) +#define CXL_DECODER_F_MASK GENMASK(5, 0) + struct cxl_memdev_attach { int (*probe)(struct cxl_memdev *cxlmd); }; From 50ebf24971214621627b609f429dd80d2cc7eaaa Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:30 +0000 Subject: [PATCH 119/143] NVIDIA: VR: SAUCE: cxl: Define a driver interface for DPA allocation Region creation involves finding available DPA (device-physical-address) capacity to map into HPA (host-physical-address) space. In order to support CXL Type2 devices, define an API, cxl_request_dpa(), that tries to allocate the DPA memory the driver requires to operate. The memory requested should not be bigger than the max available HPA obtained previously with cxl_get_hpa_freespace(). Based on https://lore.kernel.org/linux-cxl/168592158743.1948938.7622563891193802610.stgit@dwillia2-xfh.jf.intel.com/ Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/hdm.c | 84 ++++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 1 + include/cxl/cxl.h | 5 +++ 3 files changed, 90 insertions(+) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index b4bd3d91f1cfb..70da3daac3178 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "cxlmem.h" #include "core.h" @@ -546,6 +547,12 @@ bool cxl_resource_contains_addr(const struct resource *res, const resource_size_ return resource_contains(res, &_addr); } +/** + * cxl_dpa_free - release DPA (Device Physical Address) + * @cxled: endpoint decoder linked to the DPA + * + * Returns 0 or
error. + */ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) { struct cxl_port *port = cxled_to_port(cxled); @@ -572,6 +579,7 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) devm_cxl_dpa_release(cxled); return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dpa_free, "CXL"); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode) @@ -603,6 +611,82 @@ int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, return 0; } +static int find_free_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == (port->hdm_end + 1); +} + +static struct cxl_endpoint_decoder * +cxl_find_free_decoder(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct device *dev; + + guard(rwsem_read)(&cxl_rwsem.dpa); + dev = device_find_child(&endpoint->dev, NULL, + find_free_decoder); + if (!dev) + return NULL; + + return to_cxl_endpoint_decoder(dev); +} + +/** + * cxl_request_dpa - search and reserve DPA given input constraints + * @cxlmd: memdev with an endpoint port with available decoders + * @mode: CXL partition mode (ram vs pmem) + * @alloc: dpa size required + * + * Returns a pointer to a 'struct cxl_endpoint_decoder' on success or + * an errno encoded pointer on failure. + * + * Given that a region needs to allocate from limited HPA capacity it + * may be the case that a device has more mappable DPA capacity than + * available HPA. The expectation is that @alloc is a driver known + * value based on the device capacity but which could not be fully + * available due to HPA constraints. + * + * Returns a pinned cxl_decoder with at least @alloc bytes of capacity + * reserved, or an error pointer. 
The caller is also expected to own the + * lifetime of the memdev registration associated with the endpoint to + * pin the decoder registered as well. + */ +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc) +{ + int rc; + + if (!IS_ALIGNED(alloc, SZ_256M)) + return ERR_PTR(-EINVAL); + + struct cxl_endpoint_decoder *cxled __free(put_cxled) = + cxl_find_free_decoder(cxlmd); + + if (!cxled) + return ERR_PTR(-ENODEV); + + rc = cxl_dpa_set_part(cxled, mode); + if (rc) + return ERR_PTR(rc); + + rc = cxl_dpa_alloc(cxled, alloc); + if (rc) + return ERR_PTR(rc); + + return no_free_ptr(cxled); +} +EXPORT_SYMBOL_NS_GPL(cxl_request_dpa, "CXL"); + static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index d1b010e5e1d07..2b1f7d687a0e8 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -667,6 +667,7 @@ struct cxl_root *find_cxl_root(struct cxl_port *port); DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) +DEFINE_FREE(put_cxled, struct cxl_endpoint_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxld.dev)) DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev)) DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 783ad570a6ebe..4802371db00e0 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -7,6 +7,7 @@ #include #include +#include #include /** @@ -276,4 +277,8 @@ struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, unsigned long flags, resource_size_t *max); void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd); +struct cxl_endpoint_decoder 
*cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); #endif /* __CXL_CXL_H__ */ From f5deab37ee296dd3c28b9755adb3a92e674819a1 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:31 +0000 Subject: [PATCH 120/143] NVIDIA: VR: SAUCE: sfc: get endpoint decoder Use cxl api for getting DPA (Device Physical Address) to use through an endpoint decoder. Signed-off-by: Alejandro Lucero Reviewed-by: Martin Habets Acked-by: Edward Cree Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/efx_cxl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 1a4c1097c3152..2cfd0a46225f9 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -126,6 +126,14 @@ int efx_cxl_init(struct efx_probe_data *probe_data) cxl_put_root_decoder(cxl->cxlrd); return -ENOSPC; } + + cxl->cxled = cxl_request_dpa(cxl->cxlmd, CXL_PARTMODE_RAM, + EFX_CTPIO_BUFFER_SIZE); + if (IS_ERR(cxl->cxled)) { + pci_err(pci_dev, "CXL accel request DPA failed"); + cxl_put_root_decoder(cxl->cxlrd); + return PTR_ERR(cxl->cxled); + } } probe_data->cxl = cxl; @@ -141,6 +149,7 @@ void efx_cxl_exit(struct efx_probe_data *probe_data) iounmap(probe_data->cxl->ctpio_cxl); cxl_unregister_region(probe_data->cxl->efx_region); } else { + cxl_dpa_free(probe_data->cxl->cxled); cxl_put_root_decoder(probe_data->cxl->cxlrd); } } From 26af67c48ae74bf40b0b3cf63675ca6386bdef75 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:32 +0000 Subject: [PATCH 121/143] NVIDIA: VR: SAUCE: cxl: Make region type based on endpoint type Current code is expecting Type3 or CXL_DECODER_HOSTONLYMEM devices 
only. Support for Type2 implies region type needs to be based on the endpoint type HDM-D[B] instead. Signed-off-by: Alejandro Lucero Reviewed-by: Zhi Wang Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Alison Schofield Reviewed-by: Davidlohr Bueso Reviewed-by: Gregory Price Reviewed-by: Davidlohr Bueso (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 400e5cadc3135..f72988b355ec9 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2835,7 +2835,8 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_partition_mode mode, int id) + enum cxl_partition_mode mode, int id, + enum cxl_decoder_type target_type) { int rc; @@ -2857,7 +2858,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); + return devm_cxl_add_region(cxlrd, id, mode, target_type); } static ssize_t create_region_store(struct device *dev, const char *buf, @@ -2871,7 +2872,7 @@ static ssize_t create_region_store(struct device *dev, const char *buf, if (rc != 1) return -EINVAL; - cxlr = __create_region(cxlrd, mode, id); + cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); @@ -4046,7 +4047,8 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, - atomic_read(&cxlrd->region_id)); + atomic_read(&cxlrd->region_id), + cxled->cxld.target_type); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); if (IS_ERR(cxlr)) { From 
d86e720921aa53b83a3f9da2caa938b022060e89 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:33 +0000 Subject: [PATCH 122/143] NVIDIA: VR: SAUCE: cxl/region: Factor out interleave ways setup Region creation based on Type3 devices is triggered from user space allowing memory combination through interleaving. In preparation for kernel driven region creation, that is Type2 drivers triggering region creation backed with its advertised CXL memory, factor out a common helper from the user-sysfs region setup for interleave ways. Signed-off-by: Alejandro Lucero Reviewed-by: Zhi Wang Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Alison Schofield (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index f72988b355ec9..637914e320c7a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -485,22 +485,14 @@ static ssize_t interleave_ways_show(struct device *dev, static const struct attribute_group *get_cxl_region_target_group(void); -static ssize_t interleave_ways_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static int set_interleave_ways(struct cxl_region *cxlr, int val) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; - struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_region_params *p = &cxlr->params; - unsigned int val, save; - int rc; + int save, rc; u8 iw; - rc = kstrtouint(buf, 0, &val); - if (rc) - return rc; - rc = ways_to_eiw(val, &iw); if (rc) return rc; @@ -515,9 +507,7 @@ static ssize_t interleave_ways_store(struct 
device *dev, return -EINVAL; } - ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) - return rc; + lockdep_assert_held_write(&cxl_rwsem.region); if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) return -EBUSY; @@ -525,10 +515,31 @@ static ssize_t interleave_ways_store(struct device *dev, save = p->interleave_ways; p->interleave_ways = val; rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); - if (rc) { + if (rc) p->interleave_ways = save; + + return rc; +} + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + rc = set_interleave_ways(cxlr, val); + if (rc) return rc; - } return len; } From e738dac445cc8e3ef0070976ffacd2d0cd5bcc71 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:34 +0000 Subject: [PATCH 123/143] NVIDIA: VR: SAUCE: cxl/region: Factor out interleave granularity setup Region creation based on Type3 devices is triggered from user space allowing memory combination through interleaving. In preparation for kernel driven region creation, that is Type2 drivers triggering region creation backed with its advertised CXL memory, factor out a common helper from the user-sysfs region setup for interleave granularity.
Signed-off-by: Alejandro Lucero Reviewed-by: Zhi Wang Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Alison Schofield (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 637914e320c7a..a1003994a5564 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -559,21 +559,14 @@ static ssize_t interleave_granularity_show(struct device *dev, return sysfs_emit(buf, "%d\n", p->interleave_granularity); } -static ssize_t interleave_granularity_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static int set_interleave_granularity(struct cxl_region *cxlr, int val) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; - struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_region_params *p = &cxlr->params; - int rc, val; + int rc; u16 ig; - rc = kstrtoint(buf, 0, &val); - if (rc) - return rc; - rc = granularity_to_eig(val, &ig); if (rc) return rc; @@ -589,14 +582,32 @@ static ssize_t interleave_granularity_store(struct device *dev, if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity) return -EINVAL; - ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) - return rc; - + lockdep_assert_held_write(&cxl_rwsem.region); if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) return -EBUSY; p->interleave_granularity = val; + return 0; +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = 
to_cxl_region(dev); + int rc, val; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + rc = set_interleave_granularity(cxlr, val); + if (rc) + return rc; return len; } From 5fff05702af15078130e886097b63c1e606c4ec7 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:35 +0000 Subject: [PATCH 124/143] NVIDIA: VR: SAUCE: cxl: Allow region creation by type2 drivers Creating a CXL region requires userspace intervention through the cxl sysfs files. Type2 support should allow accelerator drivers to create such cxl region from kernel code. Adding that functionality and integrating it with current support for memory expanders. Based on https://lore.kernel.org/linux-cxl/168592159835.1948938.1647215579839222774.stgit@dwillia2-xfh.jf.intel.com/ Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) [jan: Resolve minor conflict due to code lines shift] Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 131 ++++++++++++++++++++++++++++++++++++-- include/cxl/cxl.h | 3 + 2 files changed, 127 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index a1003994a5564..b8450895e8d15 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2946,6 +2946,14 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) return to_cxl_region(region_dev); } +static void drop_region(struct cxl_region *cxlr) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_port *port = cxlrd_to_port(cxlrd); + + devm_release_action(port->uport_dev, __unregister_region, cxlr); +} + static ssize_t delete_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ 
-4049,14 +4057,12 @@ static int __construct_region(struct cxl_region *cxlr, return 0; } -/* Establish an empty region covering the given HPA range */ -static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, - struct cxl_endpoint_decoder *cxled) +static struct cxl_region *construct_region_begin(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; - int rc, part = READ_ONCE(cxled->part); + int part = READ_ONCE(cxled->part); struct cxl_region *cxlr; if (part < 0 || part >= cxlds->nr_partitions) { @@ -4073,13 +4079,26 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, cxled->cxld.target_type); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); - if (IS_ERR(cxlr)) { + if (IS_ERR(cxlr)) dev_err(cxlmd->dev.parent, "%s:%s: %s failed assign region: %ld\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__, PTR_ERR(cxlr)); + + return cxlr; +} + +/* Establish an empty region covering the given HPA range */ +static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_region *cxlr; + int rc; + + cxlr = construct_region_begin(cxlrd, cxled); + if (IS_ERR(cxlr)) return cxlr; - } rc = __construct_region(cxlr, cxlrd, cxled); if (rc) { @@ -4090,6 +4109,104 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, return cxlr; } +DEFINE_FREE(cxl_region_drop, struct cxl_region *, if (_T) drop_region(_T)) + +static struct cxl_region * +__construct_new_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, int ways) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled[0]); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p; + resource_size_t size = 0; + int rc, i; + + struct cxl_region 
*cxlr __free(cxl_region_drop) = + construct_region_begin(cxlrd, cxled[0]); + if (IS_ERR(cxlr)) + return cxlr; + + guard(rwsem_write)(&cxl_rwsem.region); + + /* + * Sanity check. This should not happen with an accel driver handling + * the region creation. + */ + p = &cxlr->params; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s unexpected region state\n", + dev_name(&cxlmd->dev), dev_name(&cxled[0]->cxld.dev), + __func__); + return ERR_PTR(-EBUSY); + } + + rc = set_interleave_ways(cxlr, ways); + if (rc) + return ERR_PTR(rc); + + rc = set_interleave_granularity(cxlr, cxld->interleave_granularity); + if (rc) + return ERR_PTR(rc); + + scoped_guard(rwsem_read, &cxl_rwsem.dpa) { + for (i = 0; i < ways; i++) { + if (!cxled[i]->dpa_res) + return ERR_PTR(-EINVAL); + size += resource_size(cxled[i]->dpa_res); + } + + rc = alloc_hpa(cxlr, size); + if (rc) + return ERR_PTR(rc); + + for (i = 0; i < ways; i++) { + rc = cxl_region_attach(cxlr, cxled[i], 0); + if (rc) + return ERR_PTR(rc); + } + } + + rc = cxl_region_decode_commit(cxlr); + if (rc) + return ERR_PTR(rc); + + p->state = CXL_CONFIG_COMMIT; + + return no_free_ptr(cxlr); +} + +/** + * cxl_create_region - Establish a region given an endpoint decoder + * @cxlrd: root decoder to allocate HPA + * @cxled: endpoint decoders with reserved DPA capacity + * @ways: interleave ways required + * + * Returns a fully formed region in the commit state and attached to the + * cxl_region driver. 
+ */ +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways) +{ + struct cxl_region *cxlr; + + mutex_lock(&cxlrd->range_lock); + cxlr = __construct_new_region(cxlrd, cxled, ways); + mutex_unlock(&cxlrd->range_lock); + if (IS_ERR(cxlr)) + return cxlr; + + if (device_attach(&cxlr->dev) <= 0) { + dev_err(&cxlr->dev, "failed to create region\n"); + drop_region(cxlr); + return ERR_PTR(-ENODEV); + } + + return cxlr; +} +EXPORT_SYMBOL_NS_GPL(cxl_create_region, "CXL"); + static struct cxl_region * cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa) { diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 4802371db00e0..50acbd13bcf85 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -281,4 +281,7 @@ struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, enum cxl_partition_mode mode, resource_size_t alloc); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways); #endif /* __CXL_CXL_H__ */ From 6750cb7eb68749baa3fed1c64c5c1b5e1dad89d0 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:36 +0000 Subject: [PATCH 125/143] NVIDIA: VR: SAUCE: cxl: Avoid dax creation for accelerators By definition a type2 cxl device will use the host managed memory for specific functionality, therefore it should not be available to other uses. 
Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b8450895e8d15..5a9857ec226d1 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -4451,6 +4451,13 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* + * HDM-D[B] (device-memory) regions have accelerator specific usage. + * Skip device-dax registration. + */ + if (cxlr->type == CXL_DECODER_DEVMEM) + return 0; + /* * From this point on any path that changes the region's state away from * CXL_CONFIG_COMMIT is also responsible for releasing the driver. From 0231daf04c229c2d6a2ebdcc33b21b13d9d5de01 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:37 +0000 Subject: [PATCH 126/143] NVIDIA: VR: SAUCE: sfc: create cxl region Use cxl api for creating a region using the endpoint decoder related to a DPA range. 
Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/efx_cxl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 2cfd0a46225f9..4d5f3974e51dc 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -134,6 +134,14 @@ int efx_cxl_init(struct efx_probe_data *probe_data) cxl_put_root_decoder(cxl->cxlrd); return PTR_ERR(cxl->cxled); } + + cxl->efx_region = cxl_create_region(cxl->cxlrd, &cxl->cxled, 1); + if (IS_ERR(cxl->efx_region)) { + pci_err(pci_dev, "CXL accel create region failed"); + cxl_put_root_decoder(cxl->cxlrd); + cxl_dpa_free(cxl->cxled); + return PTR_ERR(cxl->efx_region); + } } probe_data->cxl = cxl; @@ -147,11 +155,11 @@ void efx_cxl_exit(struct efx_probe_data *probe_data) if (probe_data->cxl->hdm_was_committed) { iounmap(probe_data->cxl->ctpio_cxl); - cxl_unregister_region(probe_data->cxl->efx_region); } else { cxl_dpa_free(probe_data->cxl->cxled); cxl_put_root_decoder(probe_data->cxl->cxlrd); } + cxl_unregister_region(probe_data->cxl->efx_region); } MODULE_IMPORT_NS("CXL"); From 36d7e3e41b178fb95d612397ff954c353131e128 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sun, 1 Feb 2026 15:54:38 +0000 Subject: [PATCH 127/143] NVIDIA: VR: SAUCE: sfc: support pio mapping based on cxl A PIO buffer is a region of device memory to which the driver can write a packet for TX, with the device handling the transmit doorbell without requiring a DMA for getting the packet data, which helps reducing latency in certain exchanges. With CXL mem protocol this latency can be lowered further. With a device supporting CXL and successfully initialised, use the cxl region to map the memory range and use this mapping for PIO buffers. 
Add the disabling of those CXL-based PIO buffers if the callback for potential cxl endpoint removal by the CXL code happens. Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260201155438.2664640-1-alejandro.lucero-palau@amd.com/) Signed-off-by: Jiandi An --- drivers/net/ethernet/sfc/ef10.c | 50 +++++++++++++++++++++++---- drivers/net/ethernet/sfc/efx_cxl.c | 33 ++++++++++++++---- drivers/net/ethernet/sfc/net_driver.h | 2 ++ drivers/net/ethernet/sfc/nic.h | 3 ++ 4 files changed, 75 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index fcec81f862ec5..2bb6d3136c7c3 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -24,6 +24,7 @@ #include #include #include +#include "efx_cxl.h" /* Hardware control for EF10 architecture including 'Huntington'. */ @@ -106,7 +107,7 @@ static int efx_ef10_get_vf_index(struct efx_nic *efx) static int efx_ef10_init_datapath_caps(struct efx_nic *efx) { - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V4_OUT_LEN); + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V7_OUT_LEN); struct efx_ef10_nic_data *nic_data = efx->nic_data; size_t outlen; int rc; @@ -177,6 +178,12 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx) efx->num_mac_stats); } + if (outlen < MC_CMD_GET_CAPABILITIES_V7_OUT_LEN) + nic_data->datapath_caps3 = 0; + else + nic_data->datapath_caps3 = MCDI_DWORD(outbuf, + GET_CAPABILITIES_V7_OUT_FLAGS3); + return 0; } @@ -919,6 +926,9 @@ static void efx_ef10_forget_old_piobufs(struct efx_nic *efx) static void efx_ef10_remove(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; +#ifdef CONFIG_SFC_CXL + struct efx_probe_data *probe_data; +#endif int rc; #ifdef CONFIG_SFC_SRIOV @@ -949,7 +959,12 @@ static void efx_ef10_remove(struct efx_nic *efx) efx_mcdi_rx_free_indir_table(efx); +#ifdef CONFIG_SFC_CXL + probe_data = 
container_of(efx, struct efx_probe_data, efx); + if (nic_data->wc_membase && !probe_data->cxl_pio_in_use) +#else if (nic_data->wc_membase) +#endif iounmap(nic_data->wc_membase); rc = efx_mcdi_free_vis(efx); @@ -1140,6 +1155,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) unsigned int channel_vis, pio_write_vi_base, max_vis; struct efx_ef10_nic_data *nic_data = efx->nic_data; unsigned int uc_mem_map_size, wc_mem_map_size; +#ifdef CONFIG_SFC_CXL + struct efx_probe_data *probe_data; +#endif void __iomem *membase; int rc; @@ -1263,8 +1281,25 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) iounmap(efx->membase); efx->membase = membase; - /* Set up the WC mapping if needed */ - if (wc_mem_map_size) { + if (!wc_mem_map_size) + goto skip_pio; + + /* Set up the WC mapping */ + +#ifdef CONFIG_SFC_CXL + probe_data = container_of(efx, struct efx_probe_data, efx); + if ((nic_data->datapath_caps3 & + (1 << MC_CMD_GET_CAPABILITIES_V7_OUT_CXL_CONFIG_ENABLE_LBN)) && + probe_data->cxl_pio_initialised) { + /* Using PIO through CXL mapping? 
*/ + nic_data->pio_write_base = probe_data->cxl->ctpio_cxl + + (pio_write_vi_base * efx->vi_stride + + ER_DZ_TX_PIOBUF - uc_mem_map_size); + probe_data->cxl_pio_in_use = true; + } else +#endif + { + /* Using legacy PIO BAR mapping */ nic_data->wc_membase = ioremap_wc(efx->membase_phys + uc_mem_map_size, wc_mem_map_size); @@ -1279,12 +1314,13 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) nic_data->wc_membase + (pio_write_vi_base * efx->vi_stride + ER_DZ_TX_PIOBUF - uc_mem_map_size); - - rc = efx_ef10_link_piobufs(efx); - if (rc) - efx_ef10_free_piobufs(efx); } + rc = efx_ef10_link_piobufs(efx); + if (rc) + efx_ef10_free_piobufs(efx); + +skip_pio: netif_dbg(efx, probe, efx->net_dev, "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n", &efx->membase_phys, efx->membase, uc_mem_map_size, diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 4d5f3974e51dc..c13e1f2bf7eaf 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -11,6 +11,7 @@ #include #include "net_driver.h" #include "efx_cxl.h" +#include "efx.h" #define EFX_CTPIO_BUFFER_SIZE SZ_256M @@ -138,14 +139,34 @@ int efx_cxl_init(struct efx_probe_data *probe_data) cxl->efx_region = cxl_create_region(cxl->cxlrd, &cxl->cxled, 1); if (IS_ERR(cxl->efx_region)) { pci_err(pci_dev, "CXL accel create region failed"); - cxl_put_root_decoder(cxl->cxlrd); - cxl_dpa_free(cxl->cxled); - return PTR_ERR(cxl->efx_region); + rc = PTR_ERR(cxl->efx_region); + goto err_region; + } + + rc = cxl_get_region_range(cxl->efx_region, &range); + if (rc) { + pci_err(pci_dev, "CXL getting regions params failed"); + goto err_map; + } + + cxl->ctpio_cxl = ioremap(range.start, range.end - range.start + 1); + if (!cxl->ctpio_cxl) { + pci_err(pci_dev, "CXL ioremap region (%pra) failed", &range); + rc = -ENOMEM; + goto err_map; } } probe_data->cxl = cxl; + probe_data->cxl_pio_initialised = true; return 0; + +err_map: + cxl_unregister_region(cxl->efx_region); 
+err_region: + cxl_put_root_decoder(cxl->cxlrd); + cxl_dpa_free(cxl->cxled); + return rc; } void efx_cxl_exit(struct efx_probe_data *probe_data) @@ -153,9 +174,9 @@ void efx_cxl_exit(struct efx_probe_data *probe_data) if (!probe_data->cxl) return; - if (probe_data->cxl->hdm_was_committed) { - iounmap(probe_data->cxl->ctpio_cxl); - } else { + iounmap(probe_data->cxl->ctpio_cxl); + + if (!probe_data->cxl->hdm_was_committed) { cxl_dpa_free(probe_data->cxl->cxled); cxl_put_root_decoder(probe_data->cxl->cxlrd); } diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 3964b2c56609c..bea4eecdf842d 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1207,6 +1207,7 @@ struct efx_cxl; * @efx: Efx NIC details * @cxl: details of related cxl objects * @cxl_pio_initialised: cxl initialization outcome. + * @cxl_pio_in_use: PIO using CXL mapping */ struct efx_probe_data { struct pci_dev *pci_dev; @@ -1214,6 +1215,7 @@ struct efx_probe_data { #ifdef CONFIG_SFC_CXL struct efx_cxl *cxl; bool cxl_pio_initialised; + bool cxl_pio_in_use; #endif }; diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index 9fa5c4c713abd..c87cc9214690b 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -152,6 +152,8 @@ enum { * %MC_CMD_GET_CAPABILITIES response) * @datapath_caps2: Further Capabilities of datapath firmware (FLAGS2 field of * %MC_CMD_GET_CAPABILITIES response) + * @datapath_caps3: Further Capabilities of datapath firmware (FLAGS3 field of + * %MC_CMD_GET_CAPABILITIES response) * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot @@ -186,6 +188,7 @@ struct efx_ef10_nic_data { bool must_check_datapath_caps; u32 datapath_caps; u32 datapath_caps2; + u32 datapath_caps3; unsigned int rx_dpcpu_fw_id; unsigned int tx_dpcpu_fw_id; bool 
must_probe_vswitching; From 7460ed22b18dd9f22512081460045b518aa88a08 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 28 Oct 2025 10:47:53 +0100 Subject: [PATCH 128/143] NVIDIA: VR: SAUCE: cxl/region: Support multi-level interleaving with smaller granularities for lower levels The CXL specification supports multi-level interleaving "as long as all the levels use different, but consecutive, HPA bits to select the target and no Interleave Set has more than 8 devices" (from 3.2). Currently the kernel expects that a decoder's "interleave granularity is a multiple of @parent_port granularity". That is, the granularity of a lower level is bigger than those of the parent and uses the outer HPA bits as selector. It works e.g. for the following 8-way config: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 256 granularity * Selector: HPA[8:9] * sub-link (CXL Host bridge config of the HDM): * 2-way * 1024 granularity * Selector: HPA[10] Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way config could look like this: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 512 granularity * Selector: HPA[9:10] * sub-link (CXL Host bridge config of the HDM): * 2-way * 256 granularity * Selector: HPA[8] The enumeration of decoders for this configuration fails then with following error: cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200] cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff cxl_port endpoint12: failed to attach decoder12.0 to region0: -6 Note that this happens only if firmware is setting up the decoders (CXL_REGION_F_AUTO). For userspace region assembly the granularities are chosen to increase from root down to the lower levels. That is, outer HPA bits are always used for lower interleaving levels. 
Rework the implementation to also support multi-level interleaving with smaller granularities for lower levels. Determine the interleave set of autodetected decoders. Check that it is a subset of the root interleave. The HPA selector bits are extracted for all decoders of the set and checked that there is no overlap and bits are consecutive. All decoders can be programmed now to use any bit range within the region's target selector. Signed-off-by: Robert Richter (backported from https://lore.kernel.org/all/20251028094754.72816-1-rrichter@amd.com/) [jan: Resolved minor conflicts] Signed-off-by: Jiandi An --- drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 93 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 5a9857ec226d1..b66d663deb8fd 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1555,57 +1555,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) return 0; } +static inline u64 get_selector(u64 ways, u64 gran) +{ + if (!is_power_of_2(ways)) + ways /= 3; + + if (!is_power_of_2(ways) || !is_power_of_2(gran)) + return 0; + + return (ways - 1) * gran; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); - int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_ep *ep = cxl_ep_load(port, cxlmd); struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; - struct cxl_switch_decoder *cxlsd; + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev); struct cxl_port *iter = port; - u16 eig, peig; - u8 eiw, peiw; + int ig, iw = cxl_rr->nr_targets, rc, pos = 
cxled->pos; + int distance, parent_distance; + u64 selector, cxlr_sel; + u16 eig; + u8 eiw; /* * While root level decoders support x3, x6, x12, switch level * decoders only support powers of 2 up to x16. */ - if (!is_power_of_2(cxl_rr->nr_targets)) { + if (!is_power_of_2(iw)) { dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - cxl_rr->nr_targets); + dev_name(port->uport_dev), dev_name(&port->dev), iw); return -EINVAL; } - cxlsd = to_cxl_switch_decoder(&cxld->dev); - if (cxl_rr->nr_targets_set) { - int i, distance = 1; - struct cxl_region_ref *cxl_rr_iter; + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } - /* - * The "distance" between peer downstream ports represents which - * endpoint positions in the region interleave a given port can - * host. - * - * For example, at the root of a hierarchy the distance is - * always 1 as every index targets a different host-bridge. At - * each subsequent switch level those ports map every Nth region - * position where N is the width of the switch == distance. - */ - do { - cxl_rr_iter = cxl_rr_load(iter, cxlr); - distance *= cxl_rr_iter->nr_targets; - iter = to_cxl_port(iter->dev.parent); - } while (!is_cxl_root(iter)); - distance *= cxlrd->cxlsd.cxld.interleave_ways; + /* + * Calculate the effective granularity and ways to determine + * HPA bits used as target selectors of the interleave set. + * Use this to check if the root decoder and all subsequent + * HDM decoders only use bits from that range as selectors. + * + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. 
At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. + */ + + /* Start with the root decoder's selector and distance. */ + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways, + cxlrd->cxlsd.cxld.interleave_granularity); + distance = cxlrd->cxlsd.cxld.interleave_ways; + if (!is_power_of_2(distance)) + distance /= 3; + + for (iter = parent_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr); + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder; + u64 cxld_sel; + + if (cxld_iter->interleave_ways == 1) + continue; + + cxld_sel = get_selector(cxld_iter->interleave_ways, + cxld_iter->interleave_granularity); + + if (cxld_sel & selector) { + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxld_sel, selector); + return -ENXIO; + } - for (i = 0; i < cxl_rr->nr_targets_set; i++) + selector |= cxld_sel; + distance *= cxl_rr_iter->nr_targets; + } + + parent_distance = distance; + distance *= iw; + + /* The combined selector bits must fit the region selector. */ + cxlr_sel = get_selector(p->interleave_ways, + p->interleave_granularity); + + if ((cxlr_sel & selector) != selector) { + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxlr_sel, selector); + return -ENXIO; + } + + /* Calculate remaining selector bits available for use.
*/ + selector = cxlr_sel & ~selector; + + if (cxl_rr->nr_targets_set) { + for (int i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, distance); @@ -1616,87 +1678,40 @@ static int cxl_port_setup_targets(struct cxl_port *port, goto add_target; } - if (is_cxl_root(parent_port)) { + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + ig = cxld->interleave_granularity; + else /* + * Set the interleave granularity with each interleave + * level to a multiple of its parent port interleave + * ways. Beginning with the granularity of the root + * decoder set to the region granularity (starting + * with the inner selector bits of the HPA), the + * granularity is increased with each level. Calculate + * this using the parent distance and region + * granularity. + * * Root decoder IG is always set to value in CFMWS which * may be different than this region's IG. We can use the * region's IG here since interleave_granularity_store() * does not allow interleaved host-bridges with * root IG != region IG. */ - parent_ig = p->interleave_granularity; - parent_iw = cxlrd->cxlsd.cxld.interleave_ways; - /* - * For purposes of address bit routing, use power-of-2 math for - * switch ports.
- */ - if (!is_power_of_2(parent_iw)) - parent_iw /= 3; - } else { - struct cxl_region_ref *parent_rr; - struct cxl_decoder *parent_cxld; - - parent_rr = cxl_rr_load(parent_port, cxlr); - parent_cxld = parent_rr->decoder; - parent_ig = parent_cxld->interleave_granularity; - parent_iw = parent_cxld->interleave_ways; - } - - rc = granularity_to_eig(parent_ig, &peig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_ig); - return rc; - } - - rc = ways_to_eiw(parent_iw, &peiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_iw); - return rc; - } + ig = p->interleave_granularity * parent_distance; - iw = cxl_rr->nr_targets; rc = ways_to_eiw(iw, &eiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), iw); - return rc; - } - - /* - * Interleave granularity is a multiple of @parent_port granularity. - * Multiplier is the parent port interleave ways. 
- */ - rc = granularity_to_eig(parent_ig * parent_iw, &eig); - if (rc) { - dev_dbg(&cxlr->dev, - "%s: invalid granularity calculation (%d * %d)\n", - dev_name(&parent_port->dev), parent_ig, parent_iw); - return rc; - } - - rc = eig_to_granularity(eig, &ig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - 256 << eig); - return rc; - } + if (!rc) + rc = granularity_to_eig(ig, &eig); - if (iw > 8 || iw > cxlsd->nr_targets) { - dev_dbg(&cxlr->dev, - "%s:%s:%s: ways: %d overflows targets: %d\n", + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n", dev_name(port->uport_dev), dev_name(&port->dev), - dev_name(&cxld->dev), iw, cxlsd->nr_targets); + iw, ig, selector); return -ENXIO; } if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - (iw > 1 && cxld->interleave_granularity != ig) || !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, From 2ed6a4b939e31cf0dea4c2dd535a46098c2eedfd Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:15 +0000 Subject: [PATCH 129/143] NVIDIA: VR: SAUCE: PCI: Add CXL DVSEC control, lock, and range register definitions PCI: Add CXL DVSEC control, lock, and range register definitions Add register offset and field definitions for CXL DVSEC registers needed by CXL state save/restore across resets: - CTRL2 (offset 0x10) and LOCK (offset 0x14) registers - CONFIG_LOCK bit in the LOCK register - RWL (read-write-when-locked) field masks for CTRL and range base registers. 
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- include/uapi/linux/pci_regs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 49848c6765270..68aec848510d5 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1250,14 +1250,20 @@ #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_CTRL_RWL 0x5FED +#define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_LOCK 0x14 +#define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) #define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1) #define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) #define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_HI_RWL 0xFFFFFFFF #define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_LO_RWL 0xF0000000 #define CXL_DVSEC_RANGE_MAX 2 From 004d5d97613cad6e06d5a1facb1058114cc93b09 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:16 +0000 Subject: [PATCH 130/143] NVIDIA: VR: SAUCE: cxl: Move HDM decoder and register map definitions to include/cxl/cxl.h Move CXL HDM decoder register defines, register map structs (cxl_reg_map, cxl_component_reg_map, cxl_device_reg_map, cxl_pmu_reg_map, cxl_register_map), cxl_hdm_decoder_count(), enum cxl_regloc_type, and cxl_find_regblock()/cxl_setup_regs() declarations from internal CXL headers to include/cxl/pci.h. This makes them accessible to code outside the CXL subsystem, in particular the PCI core CXL state save/restore support added in a subsequent patch. 
No functional change. Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) [jan: Resolve conflicts by moving certain definitions to include/cxl/cxl.h instead of to include/cxl/pci.h to align with its dependency of Alejandro's series] Signed-off-by: Jiandi An --- drivers/cxl/cxl.h | 57 ----------------------------------------------- include/cxl/cxl.h | 54 ++++++++++++++++++++++++++++++++++++++++++++ include/cxl/pci.h | 4 ++++ 3 files changed, 58 insertions(+), 57 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2b1f7d687a0e8..f84910ba7fa2b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -24,59 +24,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; * (port-driver, region-driver, nvdimm object-drivers... etc). */ -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * 
(i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - -/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ -#define CXL_DECODER_MIN_GRANULARITY 256 -#define CXL_DECODER_MAX_ENCODED_IG 6 - -static inline int cxl_hdm_decoder_count(u32 cap_hdr) -{ - int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - - return val ? 
val * 2 : 1; -} - /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ static inline int eig_to_granularity(u16 eig, unsigned int *granularity) { @@ -207,13 +154,9 @@ int cxl_map_device_regs(const struct cxl_register_map *map, int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); #define CXL_INSTANCES_COUNT -1 -enum cxl_regloc_type; int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); -int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); -int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 50acbd13bcf85..7d0b09ff57681 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -5,6 +5,7 @@ #ifndef __CXL_CXL_H__ #define __CXL_CXL_H__ +#include #include #include #include @@ -75,6 +76,59 @@ struct cxl_regs { #define CXL_CM_CAP_CAP_ID_HDM 0x5 #define CXL_CM_CAP_CAP_HDM_VERSION 1 +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + +/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) + +/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 
4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ +#define CXL_DECODER_MIN_GRANULARITY 256 +#define CXL_DECODER_MAX_ENCODED_IG 6 + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return val ? 
val * 2 : 1; +} + struct cxl_reg_map { bool valid; int id; diff --git a/include/cxl/pci.h b/include/cxl/pci.h index a172439f08c60..edbf980c283f1 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -14,8 +14,12 @@ enum cxl_regloc_type { CXL_REGLOC_RBI_TYPES }; +struct pci_dev; struct cxl_register_map; int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +int cxl_setup_regs(struct cxl_register_map *map); #endif From a0c071f697b1f08b5d707080c8662f75717c5d29 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:17 +0000 Subject: [PATCH 131/143] NVIDIA: VR: SAUCE: PCI: Add virtual extended cap save buffer for CXL state Add pci_add_virtual_ext_cap_save_buffer() to allocate save buffers using virtual cap IDs (above PCI_EXT_CAP_ID_MAX) that don't require a real capability in config space. The existing pci_add_ext_cap_save_buffer() cannot be used for CXL DVSEC state because it calls pci_find_saved_ext_cap() which searches for a matching capability in PCI config space. The CXL state saved here is a synthetic snapshot (DVSEC+HDM) and should not be tied to a real extended-cap instance. A virtual extended-cap save buffer API (cap IDs above PCI_EXT_CAP_ID_MAX) allows PCI to track this state without a backing config space capability. 
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- drivers/pci/pci.c | 20 ++++++++++++++++++++ drivers/pci/pci.h | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 372de7961d2a6..81733831e248a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3528,6 +3528,26 @@ int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size) return _pci_add_cap_save_buffer(dev, cap, true, size); } +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size) +{ + struct pci_cap_saved_state *save_state; + + if (cap <= PCI_EXT_CAP_ID_MAX) + return -EINVAL; + + save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); + if (!save_state) + return -ENOMEM; + + save_state->cap.cap_nr = cap; + save_state->cap.cap_extended = true; + save_state->cap.size = size; + pci_add_saved_cap(dev, save_state); + + return 0; +} + /** * pci_allocate_cap_save_buffers - allocate buffers for saving capabilities * @dev: the PCI device diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d4ae4eef89975..6167e0e204ade 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -119,15 +119,33 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +/* + * Virtual extended cap ID for CXL DVSEC state in the cap save chain. 
+ */ +#define PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL 0xFFFF +static_assert(PCI_EXT_CAP_ID_MAX < PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); +#ifdef CONFIG_PCI_CXL +void pci_allocate_cxl_save_buffer(struct pci_dev *dev); +void pci_save_cxl_state(struct pci_dev *dev); +void pci_restore_cxl_state(struct pci_dev *dev); +#else +static inline void pci_allocate_cxl_save_buffer(struct pci_dev *dev) { } +static inline void pci_save_cxl_state(struct pci_dev *dev) { } +static inline void pci_restore_cxl_state(struct pci_dev *dev) { } +#endif + #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ From 0ce70d718d954531e8197201594da6787ab709fa Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:18 +0000 Subject: [PATCH 132/143] NVIDIA: VR: SAUCE: PCI: Add cxl DVSEC state save/restore across resets Save and restore CXL DVSEC control registers (CTRL, CTRL2), range base registers, and lock state across PCI resets. When the DVSEC CONFIG_LOCK bit is set, certain DVSEC fields become read-only and hardware may have updated them. Blindly restoring saved values would be silently ignored or conflict with hardware state. Instead, a read-merge-write approach is used: current hardware values are read for the RWL (read-write-when-locked) fields and merged with saved state, so only writable bits are restored while locked bits retain their hardware values. 
Hooked into pci_save_state()/pci_restore_state() so all PCI reset paths automatically preserve CXL DVSEC configuration. Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) [jan: Resolve minor conflict in drivers/pci/Makefile due to code line shifts ] Signed-off-by: Jiandi An --- drivers/pci/Kconfig | 4 + drivers/pci/Makefile | 1 + drivers/pci/cxl.c | 177 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.c | 3 + 4 files changed, 185 insertions(+) create mode 100644 drivers/pci/cxl.c diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 9a249c65aedcd..d094f9532b74f 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -119,6 +119,10 @@ config XEN_PCIDEV_FRONTEND The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. +config PCI_CXL + bool + default y if CXL_BUS + config PCI_ATS bool diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 67647f1880fb8..8d39e070c6ec0 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_PCI_DOE) += doe.o obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o +obj-$(CONFIG_PCI_CXL) += cxl.o # Endpoint library must be initialized before its users obj-$(CONFIG_PCI_ENDPOINT) += endpoint/ diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c new file mode 100644 index 0000000000000..abcf70de91715 --- /dev/null +++ b/drivers/pci/cxl.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CXL PCI state save/restore support. + * + * Saves and restores CXL DVSEC registers across PCI resets and link + * disable/enable transitions. Hooked into pci_save_state() / + * pci_restore_state() via the PCI capability save chain. 
+ */ +#include +#include +#include "pci.h" + +struct cxl_pci_state { + u16 dvsec; + u16 dvsec_ctrl; + u16 dvsec_ctrl2; + u32 range_base_hi[CXL_DVSEC_RANGE_MAX]; + u32 range_base_lo[CXL_DVSEC_RANGE_MAX]; + u16 dvsec_lock; + bool dvsec_valid; +}; + +static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state) +{ + int rc_ctrl, rc_ctrl2; + u16 dvsec; + int i; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + state->dvsec = dvsec; + rc_ctrl = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL, + &state->dvsec_ctrl); + rc_ctrl2 = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &state->dvsec_ctrl2); + if (rc_ctrl || rc_ctrl2) { + pci_warn(pdev, + "CXL: DVSEC read failed (ctrl rc=%d, ctrl2 rc=%d)\n", + rc_ctrl, rc_ctrl2); + return; + } + + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &state->range_base_hi[i]); + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &state->range_base_lo[i]); + } + + pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_LOCK, + &state->dvsec_lock); + + state->dvsec_valid = true; +} + +static u32 cxl_merge_rwl(u32 saved, u32 current_hw, u32 rwl_mask) +{ + return (current_hw & rwl_mask) | (saved & ~rwl_mask); +} + +static void cxl_restore_dvsec(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + u16 lock_reg = 0; + int i; + + if (!state->dvsec_valid) + return; + + pci_read_config_word(pdev, state->dvsec + PCI_DVSEC_CXL_LOCK, + &lock_reg); + + if (lock_reg & PCI_DVSEC_CXL_LOCK_CONFIG) { + u16 hw_ctrl; + u32 hw_range_hi, hw_range_lo; + + pci_read_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + &hw_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + cxl_merge_rwl(state->dvsec_ctrl, hw_ctrl, + PCI_DVSEC_CXL_CTRL_RWL)); + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + + for 
(i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &hw_range_hi); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + cxl_merge_rwl(state->range_base_hi[i], + hw_range_hi, + PCI_DVSEC_CXL_RANGE_BASE_HI_RWL)); + + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &hw_range_lo); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + cxl_merge_rwl(state->range_base_lo[i], + hw_range_lo, + PCI_DVSEC_CXL_RANGE_BASE_LO_RWL)); + } + } else { + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + state->dvsec_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + state->range_base_hi[i]); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + state->range_base_lo[i]); + } + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_LOCK, + state->dvsec_lock); + } +} + +void pci_allocate_cxl_save_buffer(struct pci_dev *dev) +{ + if (!pcie_is_cxl(dev)) + return; + + if (pci_add_virtual_ext_cap_save_buffer(dev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL, + sizeof(struct cxl_pci_state))) + pci_err(dev, "unable to allocate CXL save buffer\n"); +} + +void pci_save_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if (!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + state->dvsec_valid = false; + + cxl_save_dvsec(pdev, state); +} + +void pci_restore_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if 
(!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + if (!state->dvsec_valid) + return; + + cxl_restore_dvsec(pdev, state); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 81733831e248a..193bbb6347566 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1817,6 +1817,7 @@ int pci_save_state(struct pci_dev *dev) pci_save_aer_state(dev); pci_save_ptm_state(dev); pci_save_tph_state(dev); + pci_save_cxl_state(dev); return pci_save_vc_state(dev); } EXPORT_SYMBOL(pci_save_state); @@ -1928,6 +1929,7 @@ void pci_restore_state(struct pci_dev *dev) pci_restore_aer_state(dev); pci_restore_config_space(dev); + pci_restore_cxl_state(dev); pci_restore_pcix_state(dev); pci_restore_msi_state(dev); @@ -3571,6 +3573,7 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); pci_allocate_vc_save_buffers(dev); + pci_allocate_cxl_save_buffer(dev); } void pci_free_cap_save_buffers(struct pci_dev *dev) From 2eb6dee0c0e67a32c0c9ac7b60e766712e6e5914 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:19 +0000 Subject: [PATCH 133/143] NVIDIA: VR: SAUCE: PCI: Add HDM decoder state save/restore Save and restore CXL HDM decoder registers (global control, per-decoder base/size/target-list, and commit state) across PCI resets. On restore, decoders that were committed are reprogrammed and recommitted with a 10ms timeout. Locked decoders that are already committed are skipped, since their state is protected by hardware and reprogramming them would fail. The Register Locator DVSEC is parsed directly via PCI config space reads rather than calling cxl_find_regblock()/cxl_setup_regs(), since this code lives in the PCI core and must not depend on CXL module symbols. MSE is temporarily enabled during save/restore to allow MMIO access to the HDM decoder register block. 
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) [jan: Include in drivers/pci/cxl.c due to conflict resolution in "4acbc27592b8 NVIDIA: VR: SAUCE: cxl: Move HDM decoder and register map definitions to include/cxl/cxl.h"] Signed-off-by: Jiandi An --- drivers/pci/cxl.c | 298 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 3 deletions(-) diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c index abcf70de91715..900d0316f38d0 100644 --- a/drivers/pci/cxl.c +++ b/drivers/pci/cxl.c @@ -2,15 +2,32 @@ /* * CXL PCI state save/restore support. * - * Saves and restores CXL DVSEC registers across PCI resets and link - * disable/enable transitions. Hooked into pci_save_state() / + * Saves and restores CXL DVSEC and HDM decoder registers across PCI resets + * and link disable/enable transitions. Hooked into pci_save_state() / * pci_restore_state() via the PCI capability save chain. */ #include +#include +#include +#include +#include #include #include "pci.h" +#define CXL_HDM_MAX_DECODERS 32 + +struct cxl_hdm_decoder_snapshot { + u32 base_lo; + u32 base_hi; + u32 size_lo; + u32 size_hi; + u32 ctrl; + u32 tl_lo; + u32 tl_hi; +}; + struct cxl_pci_state { + /* DVSEC saved state */ u16 dvsec; u16 dvsec_ctrl; u16 dvsec_ctrl2; @@ -18,6 +35,15 @@ struct cxl_pci_state { u32 range_base_lo[CXL_DVSEC_RANGE_MAX]; u16 dvsec_lock; bool dvsec_valid; + + /* HDM decoder saved state */ + int hdm_bar; + unsigned long hdm_bar_offset; + unsigned long hdm_map_size; + u32 hdm_global_ctrl; + int hdm_count; + struct cxl_hdm_decoder_snapshot decoders[CXL_HDM_MAX_DECODERS]; + bool hdm_valid; }; static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state) @@ -132,6 +158,269 @@ static void cxl_restore_dvsec(struct pci_dev *pdev, } } +struct pci_cmd_saved { + struct pci_dev *pdev; + u16 cmd; +}; + +DEFINE_FREE(restore_pci_cmd, struct pci_cmd_saved, + if (!(_T.cmd & 
PCI_COMMAND_MEMORY))
+		pci_write_config_word(_T.pdev, PCI_COMMAND, _T.cmd))
+
+/**
+ * cxl_find_component_regblock - Find the Component Register Block via
+ * the Register Locator DVSEC
+ * @pdev: PCI device to scan
+ * @bir: output BAR index
+ * @offset: output offset within the BAR
+ *
+ * Parses the Register Locator DVSEC (ID 8) directly via PCI config space
+ * reads. No dependency on CXL module symbols.
+ *
+ * Return: 0 on success, -ENODEV if not found.
+ */
+static int cxl_find_component_regblock(struct pci_dev *pdev,
+				       int *bir, u64 *offset)
+{
+	u32 regloc_size, regblocks;
+	u16 regloc;
+	int i;
+
+	regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					   PCI_DVSEC_CXL_REG_LOCATOR);
+	if (!regloc)
+		return -ENODEV;
+
+	pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
+	regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size);
+	regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8;
+
+	for (i = 0; i < regblocks; i++) {
+		u32 reg_lo, reg_hi;
+		unsigned int off;
+
+		off = regloc + PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 + i * 8;
+		pci_read_config_dword(pdev, off, &reg_lo);
+		pci_read_config_dword(pdev, off + 4, &reg_hi);
+
+		if (FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo) !=
+		    CXL_REGLOC_RBI_COMPONENT)
+			continue;
+
+		*bir = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo);
+		*offset = ((u64)reg_hi << 32) |
+			  (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Discover and map HDM decoder registers.
+ * Caller must pci_iounmap() the returned pointer.
+ */
+static void __iomem *cxl_hdm_map(struct pci_dev *pdev, int *bar_out,
+				 unsigned long *offset_out,
+				 unsigned long *size_out)
+{
+	int bir;
+	u64 reg_offset;
+	void __iomem *comp_base, *cm_base;
+	u32 cap_hdr;
+	int cap, cap_count;
+	unsigned long hdm_offset = 0, hdm_size = 0;
+	void __iomem *hdm;
+
+	if (cxl_find_component_regblock(pdev, &bir, &reg_offset))
+		return NULL;
+
+	comp_base = pci_iomap_range(pdev, bir, reg_offset,
+				    CXL_CM_OFFSET + SZ_4K);
+	if (!comp_base)
+		return NULL;
+
+	cm_base = comp_base + CXL_CM_OFFSET;
+	cap_hdr = readl(cm_base);
+
+	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_hdr) != CM_CAP_HDR_CAP_ID) {
+		pci_iounmap(pdev, comp_base);
+		return NULL;
+	}
+
+	cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
+
+	for (cap = 1; cap <= cap_count; cap++) {
+		u32 hdr = readl(cm_base + cap * 4);
+		u16 cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr);
+		u32 cap_off = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr);
+
+		if (cap_id != CXL_CM_CAP_CAP_ID_HDM)
+			continue;
+
+		hdr = readl(cm_base + cap_off);
+		hdm_offset = CXL_CM_OFFSET + cap_off;
+		hdm_size = 0x20 * cxl_hdm_decoder_count(hdr) + 0x10;
+		break;
+	}
+
+	pci_iounmap(pdev, comp_base);
+
+	if (!hdm_size)
+		return NULL;
+
+	hdm = pci_iomap_range(pdev, bir, reg_offset + hdm_offset, hdm_size);
+	if (!hdm)
+		return NULL;
+
+	*bar_out = bir;
+	*offset_out = reg_offset + hdm_offset;
+	*size_out = hdm_size;
+	return hdm;
+}
+
+static void cxl_save_hdm(struct pci_dev *pdev, void __iomem *hdm,
+			 struct cxl_pci_state *state, int count)
+{
+	int i;
+
+	state->hdm_count = min_t(int, count, CXL_HDM_MAX_DECODERS);
+	state->hdm_global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+
+	for (i = 0; i < state->hdm_count; i++) {
+		struct cxl_hdm_decoder_snapshot *d = &state->decoders[i];
+
+		d->base_lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i));
+		d->base_hi = readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i));
+		d->size_lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i));
+		d->size_hi = readl(hdm +
CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + d->ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + d->tl_lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(i)); + d->tl_hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + } +} + +static void cxl_restore_hdm(struct pci_dev *pdev, void __iomem *hdm, + const struct cxl_pci_state *state) +{ + int i; + + writel(state->hdm_global_ctrl, hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + for (i = 0; i < state->hdm_count; i++) { + const struct cxl_hdm_decoder_snapshot *d = &state->decoders[i]; + unsigned long timeout; + u32 ctrl; + + if (!(d->ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if ((ctrl & CXL_HDM_DECODER0_CTRL_LOCK) && + (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + } + + writel(d->base_lo, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)); + writel(d->base_hi, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)); + writel(d->size_lo, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)); + writel(d->size_hi, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + writel(d->tl_lo, hdm + CXL_HDM_DECODER0_TL_LOW(i)); + writel(d->tl_hi, hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + + wmb(); + + ctrl = d->ctrl & ~(CXL_HDM_DECODER0_CTRL_COMMITTED | + CXL_HDM_DECODER0_CTRL_COMMIT_ERROR); + ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + + timeout = jiffies + msecs_to_jiffies(10); + for (;;) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + break; + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMIT_ERROR) { + pci_warn(pdev, + "HDM decoder %d commit error on restore\n", + i); + break; + } + if (time_after(jiffies, timeout)) { + pci_warn(pdev, + "HDM decoder %d commit timeout on restore\n", + i); + break; + } + cpu_relax(); + } + } +} + +static void cxl_save_hdm_decoders(struct 
pci_dev *pdev, + struct cxl_pci_state *state) +{ + int hdm_bar; + unsigned long hdm_bar_offset, hdm_map_size; + void __iomem *hdm; + u16 cmd; + u32 cap; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = cxl_hdm_map(pdev, &hdm_bar, &hdm_bar_offset, &hdm_map_size); + if (!hdm) + return; + + cap = readl(hdm + CXL_HDM_DECODER_CAP_OFFSET); + cxl_save_hdm(pdev, hdm, state, cxl_hdm_decoder_count(cap)); + state->hdm_bar = hdm_bar; + state->hdm_bar_offset = hdm_bar_offset; + state->hdm_map_size = hdm_map_size; + state->hdm_valid = true; + pci_iounmap(pdev, hdm); +} + +static void cxl_restore_hdm_decoders(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + void __iomem *hdm; + u16 cmd; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + if (!state->hdm_valid) + return; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = pci_iomap_range(pdev, state->hdm_bar, state->hdm_bar_offset, + state->hdm_map_size); + if (!hdm) { + pci_warn(pdev, "CXL: failed to map HDM for restore\n"); + return; + } + + cxl_restore_hdm(pdev, hdm, state); + pci_iounmap(pdev, hdm); +} + void pci_allocate_cxl_save_buffer(struct pci_dev *dev) { if (!pcie_is_cxl(dev)) @@ -155,8 +444,10 @@ void pci_save_cxl_state(struct pci_dev *pdev) state = (struct cxl_pci_state *)save_state->cap.data; state->dvsec_valid = false; + state->hdm_valid = false; cxl_save_dvsec(pdev, state); + cxl_save_hdm_decoders(pdev, state); } void pci_restore_cxl_state(struct pci_dev *pdev) @@ -170,8 +461,9 @@ void pci_restore_cxl_state(struct pci_dev *pdev) return; state = (struct cxl_pci_state 
*)save_state->cap.data; - if (!state->dvsec_valid) + if (!state->dvsec_valid && !state->hdm_valid) return; cxl_restore_dvsec(pdev, state); + cxl_restore_hdm_decoders(pdev, state); } From 8c8c7156d239adfd020db7845de93fb0601e17f8 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:16 +0000 Subject: [PATCH 134/143] NVIDIA: VR: SAUCE: PCI: Add CXL DVSEC reset and capability register definitions Add CXL DVSEC register definitions needed for CXL device reset per CXL r3.2 section 8.1.3.1: - Capability bits: RST_CAPABLE, CACHE_CAPABLE, CACHE_WBI_CAPABLE, RST_TIMEOUT, RST_MEM_CLR_CAPABLE - Control2 register: DISABLE_CACHING, INIT_CACHE_WBI, INIT_CXL_RST, RST_MEM_CLR_EN - Status2 register: CACHE_INV, RST_DONE, RST_ERR - Non-CXL Function Map DVSEC register offset Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) [jan: Resolve conflicts where PCI_DVSEC_CXL_CACHE_CAPABLE is already added by "72bd823fb4f1 NVIDIA: VR: SAUCE: PCI: Allow ATS to be always on for CXL.cache capable devices"] Signed-off-by: Jiandi An --- include/uapi/linux/pci_regs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 68aec848510d5..0b16aa7864f42 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1248,10 +1248,22 @@ #define PCI_DVSEC_CXL_CACHE_CAPABLE _BITUL(0) #define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) +#define PCI_DVSEC_CXL_CACHE_WBI_CAPABLE _BITUL(6) +#define PCI_DVSEC_CXL_RST_CAPABLE _BITUL(7) +#define PCI_DVSEC_CXL_RST_TIMEOUT __GENMASK(10, 8) +#define PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE _BITUL(11) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) #define PCI_DVSEC_CXL_CTRL_RWL 0x5FED #define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) +#define PCI_DVSEC_CXL_INIT_CACHE_WBI 
_BITUL(1) +#define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) +#define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) +#define PCI_DVSEC_CXL_STATUS2 0x12 +#define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) +#define PCI_DVSEC_CXL_RST_DONE _BITUL(1) +#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) #define PCI_DVSEC_CXL_LOCK 0x14 #define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) @@ -1269,6 +1281,7 @@ /* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ #define PCI_DVSEC_CXL_FUNCTION_MAP 2 +#define PCI_DVSEC_CXL_FUNCTION_MAP_REG 0x0C /* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ #define PCI_DVSEC_CXL_PORT 3 From c0f8ddc053543ab2943a78ce12dbe8f14a5c4648 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:17 +0000 Subject: [PATCH 135/143] NVIDIA: VR: SAUCE: PCI: Export pci_dev_save_and_disable() and pci_dev_restore() Export pci_dev_save_and_disable() and pci_dev_restore() so that subsystems performing non-standard reset sequences (e.g. CXL) can reuse the PCI core standard pre/post reset lifecycle: driver reset_prepare/reset_done callbacks, PCI config space save/restore, and device disable/re-enable. 
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- drivers/pci/pci.c | 21 +++++++++++++++++++-- include/linux/pci.h | 3 +++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 193bbb6347566..f12e6eca601c3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5184,7 +5184,15 @@ void pci_dev_unlock(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_dev_unlock); -static void pci_dev_save_and_disable(struct pci_dev *dev) +/** + * pci_dev_save_and_disable - Save device state and disable it + * @dev: PCI device to save and disable + * + * Save the PCI configuration state, invoke the driver's reset_prepare + * callback (if any), and disable the device by clearing the Command register. + * The device lock must be held by the caller. + */ +void pci_dev_save_and_disable(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; @@ -5216,8 +5224,16 @@ static void pci_dev_save_and_disable(struct pci_dev *dev) */ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); } +EXPORT_SYMBOL_GPL(pci_dev_save_and_disable); -static void pci_dev_restore(struct pci_dev *dev) +/** + * pci_dev_restore - Restore device state after reset + * @dev: PCI device to restore + * + * Restore the saved PCI configuration state and invoke the driver's + * reset_done callback (if any). The device lock must be held by the caller. + */ +void pci_dev_restore(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? 
dev->driver->err_handler : NULL; @@ -5234,6 +5250,7 @@ static void pci_dev_restore(struct pci_dev *dev) else if (dev->driver) pci_warn(dev, "reset done"); } +EXPORT_SYMBOL_GPL(pci_dev_restore); /* dev->reset_methods[] is a 0-terminated list of indices into this array */ const struct pci_reset_fn_method pci_reset_fn_methods[] = { diff --git a/include/linux/pci.h b/include/linux/pci.h index a03cdd8c96122..60edd5520f751 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1962,6 +1962,9 @@ int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) +void pci_dev_save_and_disable(struct pci_dev *dev); +void pci_dev_restore(struct pci_dev *dev); + /* * PCI domain support. Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share From c405cfa3b79c9741874d0501c586926916b893be Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:18 +0000 Subject: [PATCH 136/143] NVIDIA: VR: SAUCE: cxl: Add memory offlining and cache flush helpers Add infrastructure for quiescing the CXL data path before reset: - Memory offlining: check if CXL-backed memory is online and offline it via offline_and_remove_memory() before reset, per CXL spec requirement to quiesce all CXL.mem transactions before issuing CXL Reset. - CPU cache flush: invalidate cache lines before reset as a safety measure after memory offline. 
Signed-off-by: Srirangan Madhavan
(backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/)
Signed-off-by: Jiandi An
---
 drivers/cxl/core/pci.c | 131 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index ba2d393c540af..04651a156a79d 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -4,6 +4,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -932,3 +934,132 @@ int cxl_port_get_possible_dports(struct cxl_port *port)
 
 	return ctx.count;
 }
+
+/*
+ * CXL Reset support - core-provided reset logic for CXL devices.
+ *
+ * These functions implement the CXL reset sequence.
+ */
+
+/*
+ * walk_iomem_res_desc() callback used by cxl_offline_memory(). The walk only
+ * calls back for resources that already match the requested flags, so the
+ * first call is the answer: return non-zero to stop and report a hit.
+ */
+static int __maybe_unused cxl_match_online_ram(struct resource *res, void *arg)
+{
+	return 1;
+}
+
+/*
+ * If CXL memory backed by this decoder is online as System RAM, offline
+ * and remove it per CXL spec requirements before issuing CXL Reset.
+ * Returns 0 if memory was not online or was successfully offlined.
+ */
+static int __maybe_unused cxl_offline_memory(struct device *dev, void *data)
+{
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *cxlr;
+	struct cxl_region_params *p;
+	int rc;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	cxlr = cxled->cxld.region;
+	if (!cxlr)
+		return 0;
+
+	p = &cxlr->params;
+	if (!p->res)
+		return 0;
+
+	/*
+	 * A positive return means the callback ran, i.e. busy System RAM
+	 * overlaps the region. The callback must be a real function: the
+	 * walk calls through it on the first match.
+	 */
+	if (walk_iomem_res_desc(IORES_DESC_NONE,
+				IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+				p->res->start, p->res->end, NULL,
+				cxl_match_online_ram) <= 0)
+		return 0;
+
+	dev_info(dev, "Offlining CXL memory [%pr] for reset\n", p->res);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	rc = offline_and_remove_memory(p->res->start, resource_size(p->res));
+	if (rc) {
+		dev_err(dev,
+			"Failed to offline CXL memory [%pr]: %d\n",
+			p->res, rc);
+		return rc;
+	}
+#else
+	dev_err(dev, "Memory hotremove not supported, cannot offline CXL memory\n");
+	rc = -EOPNOTSUPP;
+	return rc;
+#endif
+
+	return 0;
+}
+
+/*
+ * Offline any System RAM backed by @cxlmd's endpoint decoders ahead of a
+ * reset. Returns -ENODEV when @cxlmd has no device state, 0 when there is
+ * no endpoint port to walk, otherwise whatever device_for_each_child()
+ * reports for cxl_offline_memory().
+ */
+static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *endpoint;
+
+	if (!cxlmd || !cxlmd->cxlds)
+		return -ENODEV;
+
+	/* Like cxl_reset_flush_cpu_caches(), skip a NULL or ERR_PTR endpoint. */
+	endpoint = cxlmd->endpoint;
+	if (IS_ERR_OR_NULL(endpoint))
+		return 0;
+
+	return device_for_each_child(&endpoint->dev, NULL,
+				     cxl_offline_memory);
+}
+
+static int __maybe_unused cxl_decoder_flush_cache(struct device *dev, void *data)
+{
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *cxlr;
+	struct resource *res;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	cxlr = cxled->cxld.region;
+	if (!cxlr || !cxlr->params.res)
+		return 0;
+
+	res = cxlr->params.res;
+	cpu_cache_invalidate_memregion(res->start, resource_size(res));
+	return 0;
+}
+
+static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *endpoint;
+
+	if (!cxlmd)
+		return 0;
+
+	endpoint = cxlmd->endpoint;
+	if (!endpoint || IS_ERR(endpoint))
+		return 0;
+
+	if (!cpu_cache_has_invalidate_memregion())
+		return 0;
+
+	device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache);
+	return 0;
+}

From 9dcf9975a46115e4af081b0d6e205bf0ba245205 Mon Sep 17 00:00:00 2001
From: Srirangan Madhavan
Date: Fri, 6 Mar 2026 09:23:19 +0000
Subject: [PATCH 137/143] NVIDIA: VR: SAUCE: cxl: Add multi-function sibling coordination for CXL reset

Add sibling PCI function save/disable/restore coordination for CXL
reset. Before reset, all CXL.cachemem sibling functions are locked,
saved, and disabled; after reset they are restored. The Non-CXL Function
Map DVSEC and per-function DVSEC capability register are consulted to
skip non-CXL and CXL.io-only functions. A global mutex serializes
concurrent resets to prevent deadlocks between sibling functions.
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 137 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 04651a156a79d..22b4f0b0ac4fa 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -16,6 +16,9 @@ #include "core.h" #include "trace.h" +/* Initial sibling array capacity: covers max non-ARI functions per slot */ +#define CXL_RESET_SIBLINGS_INIT 8 + /** * DOC: cxl core pci * @@ -1042,3 +1045,137 @@ static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); return 0; } + +/* + * Serialize all CXL reset operations globally. + */ +static DEFINE_MUTEX(cxl_reset_mutex); + +struct cxl_reset_context { + struct pci_dev *target; + struct pci_dev **pci_functions; + int pci_func_count; + int pci_func_cap; +}; + +/* + * Check if a sibling function is non-CXL using the Non-CXL Function Map + * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including + * on any read failure). 
+ */ +static bool cxl_is_non_cxl_function(struct pci_dev *pdev, + u16 func_map_dvsec, int fn) +{ + int reg, bit; + u32 map; + + if (pci_ari_enabled(pdev->bus)) { + reg = fn / 32; + bit = fn % 32; + } else { + reg = fn; + bit = PCI_SLOT(pdev->devfn); + } + + if (pci_read_config_dword(pdev, + func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4), + &map)) + return false; + + return map & BIT(bit); +} + +struct cxl_reset_walk_ctx { + struct cxl_reset_context *ctx; + u16 func_map_dvsec; + bool ari; +}; + +static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) +{ + struct cxl_reset_walk_ctx *wctx = data; + struct cxl_reset_context *ctx = wctx->ctx; + struct pci_dev *pdev = ctx->target; + u16 dvsec, cap; + int fn; + + if (func == pdev) + return 0; + + if (!wctx->ari && + PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn)) + return 0; + + fn = wctx->ari ? func->devfn : PCI_FUNC(func->devfn); + if (wctx->func_map_dvsec && + cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn)) + return 0; + + /* Only coordinate with siblings that have CXL.cachemem */ + dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return 0; + if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return 0; + if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE | + PCI_DVSEC_CXL_MEM_CAPABLE))) + return 0; + + /* Grow sibling array; double capacity for ARI devices when running out of space */ + if (ctx->pci_func_count >= ctx->pci_func_cap) { + struct pci_dev **new; + int new_cap = ctx->pci_func_cap ? 
ctx->pci_func_cap * 2 + : CXL_RESET_SIBLINGS_INIT; + + new = krealloc(ctx->pci_functions, + new_cap * sizeof(*new), GFP_KERNEL); + if (!new) + return 1; + ctx->pci_functions = new; + ctx->pci_func_cap = new_cap; + } + + pci_dev_get(func); + ctx->pci_functions[ctx->pci_func_count++] = func; + return 0; +} + +static void __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +{ + struct pci_dev *pdev = ctx->target; + struct cxl_reset_walk_ctx wctx; + int i; + + ctx->pci_func_count = 0; + ctx->pci_functions = NULL; + ctx->pci_func_cap = 0; + + wctx.ctx = ctx; + wctx.ari = pci_ari_enabled(pdev->bus); + wctx.func_map_dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP); + + /* Collect CXL.cachemem siblings under pci_bus_sem */ + pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx); + + /* Lock and save/disable siblings outside pci_bus_sem */ + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_lock(ctx->pci_functions[i]); + pci_dev_save_and_disable(ctx->pci_functions[i]); + } +} + +static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_restore(ctx->pci_functions[i]); + pci_dev_unlock(ctx->pci_functions[i]); + pci_dev_put(ctx->pci_functions[i]); + } + kfree(ctx->pci_functions); + ctx->pci_functions = NULL; + ctx->pci_func_count = 0; +} From f4413ec2454b2e3aaffcd35130da47f75726f39d Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:20 +0000 Subject: [PATCH 138/143] NVIDIA: VR: SAUCE: cxl: Add CXL DVSEC reset sequence and flow orchestration cxl_dev_reset() implements the hardware reset sequence: optionally enable memory clear, initiate reset via CTRL2, wait for completion, and re-enable caching. cxl_do_reset() orchestrates the full reset flow: 1. CXL pre-reset: mem offlining and cache flush (when memdev present) 2. 
PCI save/disable: pci_dev_save_and_disable() automatically saves
   CXL DVSEC and HDM decoder state via PCI core hooks
3. Sibling coordination: save/disable CXL.cachemem sibling functions
4. Execute CXL DVSEC reset
5. Sibling restore: always runs to re-enable sibling functions
6. PCI restore: pci_dev_restore() automatically restores CXL state

The CXL-specific DVSEC and HDM save/restore is handled by the PCI core's
CXL save/restore infrastructure (drivers/pci/cxl.c).

Signed-off-by: Srirangan Madhavan
(backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/)
Signed-off-by: Jiandi An
---
 drivers/cxl/core/pci.c | 200 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 198 insertions(+), 2 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 22b4f0b0ac4fa..0955cceced110 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1141,7 +1141,7 @@ static int cxl_reset_collect_sibling(struct pci_dev *func, void *data)
 	return 0;
 }
 
-static void __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx)
+static void cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx)
 {
 	struct pci_dev *pdev = ctx->target;
 	struct cxl_reset_walk_ctx wctx;
@@ -1166,7 +1166,7 @@ static void __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_cont
 	}
 }
 
-static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx)
+static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx)
 {
 	int i;
 
@@ -1179,3 +1179,199 @@ static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context
 	ctx->pci_functions = NULL;
 	ctx->pci_func_count = 0;
 }
+
+/*
+ * CXL device reset execution
+ */
+static int cxl_dev_reset(struct pci_dev *pdev, int dvsec)
+{
+	static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 };
+	u16 cap, ctrl2, status2;
+	u32 timeout_ms;
+	int rc, idx;
+
+	if (!pci_wait_for_pending_transaction(pdev))
+		pci_err(pdev, "timed out waiting for pending transactions\n");
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap);
+	if (rc)
+		return rc;
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return rc;
+
+	/*
+	 * Disable caching and initiate cache writeback+invalidation if the
+	 * device supports it. Poll for completion.
+	 * Per CXL r3.2 section 9.6, software may use the cache size from
+	 * DVSEC CXL Capability2 to compute a suitable timeout; we use a
+	 * default of 10ms.
+	 */
+	if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) {
+		u32 wbi_poll_us = 100;
+		s32 wbi_remaining_us = 10000;
+
+		ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return rc;
+
+		ctrl2 |= PCI_DVSEC_CXL_INIT_CACHE_WBI;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return rc;
+
+		do {
+			usleep_range(wbi_poll_us, wbi_poll_us + 1);
+			wbi_remaining_us -= wbi_poll_us;
+			rc = pci_read_config_word(pdev,
+						  dvsec + PCI_DVSEC_CXL_STATUS2,
+						  &status2);
+			if (rc)
+				return rc;
+		} while (!(status2 & PCI_DVSEC_CXL_CACHE_INV) &&
+			 wbi_remaining_us > 0);
+
+		if (!(status2 & PCI_DVSEC_CXL_CACHE_INV)) {
+			pci_err(pdev, "CXL cache WB+I timed out\n");
+			return -ETIMEDOUT;
+		}
+	} else if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) {
+		ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return rc;
+	}
+
+	if (cap & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) {
+		rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					  &ctrl2);
+		if (rc)
+			return rc;
+
+		ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return rc;
+	}
+
+	idx = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, cap);
+	if (idx >= ARRAY_SIZE(reset_timeout_ms))
+		idx = ARRAY_SIZE(reset_timeout_ms) - 1;
+	timeout_ms = reset_timeout_ms[idx];
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return rc;
+
+	ctrl2 |= PCI_DVSEC_CXL_INIT_CXL_RST;
+	rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2);
+	if (rc)
+		return rc;
+
+	msleep(timeout_ms);
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_STATUS2,
+				  &status2);
+	if (rc)
+		return rc;
+
+	if (status2 & PCI_DVSEC_CXL_RST_ERR) {
+		pci_err(pdev, "CXL reset error\n");
+		return -EIO;
+	}
+
+	if (!(status2 & PCI_DVSEC_CXL_RST_DONE)) {
+		pci_err(pdev, "CXL reset timeout\n");
+		return -ETIMEDOUT;
+	}
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return rc;
+
+	ctrl2 &= ~PCI_DVSEC_CXL_DISABLE_CACHING;
+	rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+/* bus_find_device() match: the CXL memdev whose parent is @parent. */
+static int match_memdev_by_parent(struct device *dev, const void *parent)
+{
+	return is_cxl_memdev(dev) && dev->parent == parent;
+}
+
+/*
+ * Run the full CXL reset flow on @pdev: quiesce CXL memory (only when a
+ * memdev is registered under @pdev), save and disable @pdev and its
+ * CXL.cachemem siblings, issue the DVSEC reset, then restore everything.
+ *
+ * Returns -ENODEV if @pdev has no CXL Device DVSEC, the error from memory
+ * offlining if that fails (no reset is attempted), otherwise the result of
+ * cxl_dev_reset().
+ */
+static int cxl_do_reset(struct pci_dev *pdev)
+{
+	struct cxl_reset_context ctx = { .target = pdev };
+	struct cxl_memdev *cxlmd = NULL;
+	struct device *memdev = NULL;
+	int dvsec, rc;
+
+	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return -ENODEV;
+
+	memdev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
+				 match_memdev_by_parent);
+	if (memdev)
+		cxlmd = to_cxl_memdev(memdev);
+
+	mutex_lock(&cxl_reset_mutex);
+	pci_dev_lock(pdev);
+
+	if (cxlmd) {
+		/*
+		 * Take the memdev lock explicitly while its endpoint is
+		 * walked: a guard() scoped to an if-block is released at
+		 * that block's closing brace, so it cannot span this section.
+		 * NOTE(review): released before pci_dev_restore() on the
+		 * assumption that driver reset callbacks may take this lock
+		 * themselves - confirm before widening the critical section.
+		 */
+		device_lock(&cxlmd->dev);
+		rc = cxl_reset_prepare_memdev(cxlmd);
+		if (!rc)
+			cxl_reset_flush_cpu_caches(cxlmd);
+		device_unlock(&cxlmd->dev);
+		if (rc)
+			goto out_unlock;
+	}
+
+	pci_dev_save_and_disable(pdev);
+	cxl_pci_functions_reset_prepare(&ctx);
+
+	rc = cxl_dev_reset(pdev, dvsec);
+
+	cxl_pci_functions_reset_done(&ctx);
+
+	pci_dev_restore(pdev);
+
+out_unlock:
+	pci_dev_unlock(pdev);
+	mutex_unlock(&cxl_reset_mutex);
+
+	if (memdev)
+		put_device(memdev);
+
+	return rc;
+}

From 9b3c4a19da44c2e1aeccb63f19165ad31d01e00d Mon Sep 17 00:00:00 2001
From: Srirangan Madhavan
Date:
Fri, 6 Mar 2026 09:23:21 +0000 Subject: [PATCH 139/143] NVIDIA: VR: SAUCE: cxl: Add cxl_reset sysfs interface for PCI devices Add a "cxl_reset" sysfs attribute to PCI devices that support CXL Reset (CXL r3.2 section 8.1.3.1). The attribute is visible only on devices with both CXL.cache and CXL.mem capabilities and the CXL Reset Capable bit set in the DVSEC. Writing "1" to the attribute triggers the full CXL reset flow via cxl_do_reset(). The interface is decoupled from memdev creation: when a CXL memdev exists, memory offlining and cache flush are performed; otherwise the reset proceeds without the memory management steps. The sysfs attribute is managed entirely by the CXL module using sysfs_create_group() / sysfs_remove_group() rather than the PCI core's static attribute groups. This avoids cross-module symbol dependencies between the PCI core (always built-in) and CXL_BUS (potentially modular). At module init, a subsys_interface is registered on pci_bus_type; its add_dev/remove_dev callbacks cover PCI devices that already exist as well as later hot-plug/unplug. kernfs_drain() makes sure that any in-flight store() completes before sysfs_remove_group() returns, preventing use-after-free during module unload.
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/core.h | 2 + drivers/cxl/core/pci.c | 113 ++++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 3 ++ 3 files changed, 118 insertions(+) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index e3c85ceda2485..f3d6e4e6ad81b 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -130,6 +130,8 @@ extern struct cxl_rwsem cxl_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); enum cxl_poison_trace_type { CXL_POISON_TRACE_LIST, diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 0955cceced110..497d99b8908d0 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1356,3 +1356,116 @@ static int cxl_do_reset(struct pci_dev *pdev) return rc; } + +/* + * CXL reset sysfs attribute management. + * + * The cxl_reset attribute is added to PCI devices that advertise CXL Reset + * capability. Managed entirely by the CXL module via subsys_interface on + * pci_bus_type, avoiding cross-module symbol dependencies between the PCI + * core (built-in) and CXL (potentially modular). + * + * subsys_interface handles existing devices at register time and hot-plug + * add/remove automatically. On unregister, remove_dev runs for all tracked + * devices under bus core serialization.
+ */ + +static bool pci_cxl_reset_capable(struct pci_dev *pdev) /* true only when the CXL device DVSEC is present and readable and its capability word has the cache-capable, mem-capable and reset-capable bits all set */ +{ + int dvsec; + u16 cap; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return false; + + if (pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return false; + + if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE) || + !(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) + return false; + + return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); +} + +static ssize_t cxl_reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) /* sysfs store handler: accepts only the value 1 (compared with sysfs_streq), otherwise -EINVAL; returns count on success, else the nonzero result of cxl_do_reset() */ +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc; + + if (!sysfs_streq(buf, "1")) + return -EINVAL; + + rc = cxl_do_reset(pdev); + return rc ? rc : count; +} +static DEVICE_ATTR_WO(cxl_reset); + +static umode_t cxl_reset_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) /* hides the attribute (mode 0) unless pci_cxl_reset_capable() holds for the owning PCI device */ +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return a->mode; +} + +static struct attribute *cxl_reset_attrs[] = { + &dev_attr_cxl_reset.attr, + NULL, +}; + +static const struct attribute_group cxl_reset_attr_group = { + .attrs = cxl_reset_attrs, + .is_visible = cxl_reset_attr_is_visible, +}; + +static int cxl_reset_add_dev(struct device *dev, + struct subsys_interface *sif) /* add_dev hook: creates the attribute group on reset-capable devices and returns 0 without doing anything on all others; the same capability test is repeated by the is_visible callback */ +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return sysfs_create_group(&dev->kobj, &cxl_reset_attr_group); +} + +static void cxl_reset_remove_dev(struct device *dev, + struct subsys_interface *sif) /* remove_dev hook. NOTE(review): the capability is re-read from config space here, so if that read fails at removal time the group removal is skipped - confirm this is acceptable */ +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return; + + sysfs_remove_group(&dev->kobj, &cxl_reset_attr_group); +} + +static struct subsys_interface cxl_reset_interface = { + .name = "cxl_reset", + .subsys = &pci_bus_type, + .add_dev = cxl_reset_add_dev, + .remove_dev = cxl_reset_remove_dev, +}; + +void cxl_reset_sysfs_init(void) /* registers the interface on pci_bus_type; returns nothing, so a registration failure is only logged */ +{ + int rc; + + rc =
subsys_interface_register(&cxl_reset_interface); + if (rc) + pr_warn("CXL: failed to register cxl_reset interface (%d)\n", + rc); +} + +void cxl_reset_sysfs_exit(void) /* counterpart of cxl_reset_sysfs_init(); NOTE(review): called unconditionally from cxl_core_exit() even when registration failed at init - confirm unregistering is safe in that case */ +{ + subsys_interface_unregister(&cxl_reset_interface); +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 385588b8b30b5..929caeec5c954 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2505,6 +2505,8 @@ static __init int cxl_core_init(void) if (rc) goto err_ras; + cxl_reset_sysfs_init(); + return 0; err_ras: @@ -2520,6 +2522,7 @@ static __init int cxl_core_init(void) static void cxl_core_exit(void) { + cxl_reset_sysfs_exit(); cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); From c3dd0abf2681e4975afc3c2b6fcee76b8f7230a9 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:22 +0000 Subject: [PATCH 140/143] NVIDIA: VR: SAUCE: Documentation: ABI: Add CXL PCI cxl_reset sysfs attribute Document the cxl_reset sysfs attribute added to PCI devices that support CXL Reset. Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An --- Documentation/ABI/testing/sysfs-bus-pci | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 69f952fffec72..c11da40900f17 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -174,6 +174,28 @@ Description: similiar to writing 1 to their individual "reset" file, so use with caution. +What: /sys/bus/pci/devices/.../cxl_reset +Date: February 2026 +Contact: linux-cxl@vger.kernel.org +Description: + This attribute is only visible when the device advertises + CXL Reset Capable in the CXL DVSEC Capability register + (CXL r3.2, section 8.1.3).
+ + Writing 1 to this file triggers a CXL device reset which + affects CXL.cache and CXL.mem state on all CXL functions + (i.e. those not listed in the Non-CXL Function Map DVSEC, + section 8.1.4), not just CXL.io/PCIe state. This is + separate from the standard PCI reset interface because CXL + Reset has different scope. + + The reset will fail with -EBUSY if any CXL regions using this + device have drivers bound. Active regions are torn down as + part of the reset sequence. + + This attribute is registered by the CXL core when a CXL device + is discovered, independent of which driver binds the PCI device. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings From 693f075bb9ea4916daafbf233e3f1fb83071bfc5 Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Wed, 11 Mar 2026 17:09:13 -0500 Subject: [PATCH 141/143] NVIDIA: VR: SAUCE: [Config] CXL config annotations for Type-2 device and RAS support Add Ubuntu kernel config annotations for CXL-related configs introduced or changed by the following cherry-picked patch series: - drivers/cxl changes between v6.17.9 and upstream 7.0 (which includes a portion of Terry Bowman's v14 CXL RAS series merged via for-7.0/cxl-aer-prep) - Alejandro Lucero's v23 CXL Type-2 device support series - Smita Koralahalli's v6 patch 3/9 (cxl/region: Skip decoder reset on detach for autodiscovered regions) CONFIG_CXL_BUS: Enable CXL bus support built-in; required for CXL Type-2 device and RAS support CONFIG_CXL_PCI: Enable CXL PCI management built-in; auto-selects CXL_MEM; required for CXL Type-2 device support CONFIG_CXL_MEM: Auto-selected by CXL_PCI; required for CXL memory expansion and Type-2 device support CONFIG_CXL_PORT: Required for CXL port enumeration; defaults to CXL_BUS value CONFIG_FWCTL: Selected by CXL_BUS when CXL_FEATURES is enabled; required for CXL feature mailbox access CONFIG_CXL_RAS: New def_bool replacing PCIEAER_CXL (Terry Bowman v14); auto-enabled with ACPI_APEI_GHES+PCIEAER+ CXL_BUS for CXL RAS 
error handling CONFIG_SFC_CXL: Solarflare SFC9100-family CXL Type-2 device support; not needed for NVIDIA platforms (n) CONFIG_ACPI_APEI_EINJ: Required prerequisite for CONFIG_ACPI_APEI_EINJ_CXL CONFIG_ACPI_APEI_EINJ_CXL: CXL protocol error injection support via APEI EINJ CONFIG_PCIEAER_CXL: Remove it from debian.master policy. This config was removed from Kconfig by upstream commit d18f1b7beadf (PCI/AER: Replace PCIEAER_CXL symbol with CXL_RAS) which is included in this port. CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION: Override debian.master amd64-only policy to include arm64. Commit 4d873c5dc3ed added 'select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION' to arch/arm64/Kconfig, making this y on arm64 as well. CONFIG_GENERIC_CPU_CACHE_MAINTENANCE: New bool config defined by c460697d3472 in lib/Kconfig. Selected by arm64 via 4d873c5dc3ed; not selected by x86. Set arm64: y, amd64: -. CONFIG_CACHEMAINT_FOR_HOTPLUG: New optional menuconfig defined by 2ec3b54a6ff0 in drivers/cache/Kconfig. Depends on GENERIC_CPU_CACHE_MAINTENANCE so becomes visible on arm64. Defaults to n; HiSilicon HHA driver not needed for NVIDIA platforms. Set arm64: n, amd64: -. 
Signed-off-by: Jiandi An --- debian.master/config/annotations | 1 - debian.nvidia-6.17/config/annotations | 36 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/debian.master/config/annotations b/debian.master/config/annotations index b05e77739d76b..c27e5965589ab 100644 --- a/debian.master/config/annotations +++ b/debian.master/config/annotations @@ -10048,7 +10048,6 @@ CONFIG_PCENGINES_APU2 policy<{'amd64': 'm'}> CONFIG_PCI policy<{'amd64': 'y', 'arm64': 'y', 'armhf': 'y', 'ppc64el': 'y', 'riscv64': 'y', 's390x': 'y'}> CONFIG_PCI200SYN policy<{'amd64': 'm', 'arm64': 'm', 'armhf': 'm', 'ppc64el': 'm', 'riscv64': 'm'}> CONFIG_PCIEAER policy<{'amd64': 'y', 'arm64': 'y', 'armhf': 'y', 'ppc64el': '-', 'riscv64': 'y', 's390x': 'y'}> -CONFIG_PCIEAER_CXL policy<{'amd64': 'y', 'arm64': 'y', 'armhf': 'y', 'riscv64': 'y'}> CONFIG_PCIEAER_INJECT policy<{'amd64': 'n', 'arm64': 'n', 'armhf': 'n', 'ppc64el': '-', 'riscv64': 'n', 's390x': 'n'}> CONFIG_PCIEASPM policy<{'amd64': 'y', 'arm64': 'y', 'armhf': 'y', 'ppc64el': 'y', 'riscv64': 'y', 's390x': 'y'}> CONFIG_PCIEASPM_PERFORMANCE policy<{'amd64': 'n', 'arm64': 'n', 'armhf': 'n', 'ppc64el': 'n', 'riscv64': 'n', 's390x': 'n'}> diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index 4fb025e692b3a..800ca185cf530 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -204,6 +204,42 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'Disable all Ubuntu ODM dri CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'Required for Grace enablement'> +CONFIG_CXL_BUS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_BUS note<'Enable CXL bus support built-in; required for CXL Type-2 device and RAS support'> + +CONFIG_CXL_PCI policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PCI note<'Enable CXL PCI management built-in; auto-selects CXL_MEM; required for CXL Type-2 device support'> + +CONFIG_CXL_MEM policy<{'amd64': 'y', 
'arm64': 'y'}> +CONFIG_CXL_MEM note<'Auto-selected by CXL_PCI; required for CXL memory expansion and Type-2 device support'> + +CONFIG_CXL_PORT policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PORT note<'Required for CXL port enumeration; defaults to CXL_BUS value'> + +CONFIG_FWCTL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_FWCTL note<'Selected by CXL_BUS when CXL_FEATURES is enabled; required for CXL feature mailbox access'> + +CONFIG_CXL_RAS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_RAS note<'New def_bool replacing PCIEAER_CXL; auto-enabled with ACPI_APEI_GHES+PCIEAER+CXL_BUS; CXL RAS error handling support'> + +CONFIG_SFC_CXL policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_SFC_CXL note<'Solarflare SFC9100-family CXL Type-2 device support; not needed for NVIDIA platforms'> + +CONFIG_ACPI_APEI_EINJ policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ note<'Required for CONFIG_ACPI_APEI_EINJ_CXL'> + +CONFIG_ACPI_APEI_EINJ_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ_CXL note<'CXL protocol error injection support via APEI EINJ'> + +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION note<'Override debian.master amd64-only; arm64 selects this via arch/arm64/Kconfig since 4d873c5dc3ed'> + +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE note<'Selected by arm64 via arch/arm64/Kconfig since 4d873c5dc3ed; not selected by x86'> + +CONFIG_CACHEMAINT_FOR_HOTPLUG policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_CACHEMAINT_FOR_HOTPLUG note<'Optional HiSilicon HHA cache maintenance driver; depends on GENERIC_CPU_CACHE_MAINTENANCE; not needed for NVIDIA platforms'> + # ---- Annotations without notes ---- From ce505ce1ac1ac7a261ae47ef6e49435f590543fe Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Wed, 11 Mar 2026 18:41:16 -0500 Subject: [PATCH 142/143] NVIDIA: VR: SAUCE: [Config] Enable CXL DAX and KMEM 
built-in for CXL memory access Override debian.master policy (m->y) for DEV_DAX, DEV_DAX_CXL, and DEV_DAX_KMEM to ensure CXL memory regions are accessible as both raw DAX devices and hotplugged System-RAM nodes. debian.master sets these to 'm' (modules). For NVIDIA platforms with CXL Type-2 devices, built-in (y) is required to ensure CXL memory regions provisioned early in boot are immediately accessible without relying on module loading order. CONFIG_DEV_DAX: Override m->y; prerequisite for DEV_DAX_CXL and DEV_DAX_KMEM to be built-in; depends on TRANSPARENT_HUGEPAGE (already y in debian.master) CONFIG_DEV_DAX_CXL: Override m->y; creates /dev/daxX.Y devices for CXL RAM regions not in the default system memory map (Soft Reserved or dynamically provisioned regions); depends on CXL_BUS+CXL_REGION+DEV_DAX (all y) CONFIG_DEV_DAX_KMEM: Override m->y; onlines CXL DAX devices as System-RAM NUMA nodes via memory hotplug, making CXL memory available for normal kernel and userspace allocation Signed-off-by: Jiandi An --- debian.nvidia-6.17/config/annotations | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index 800ca185cf530..c5aeb93d14ca9 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -225,6 +225,15 @@ CONFIG_CXL_RAS note<'New def_bool replacing PCI CONFIG_SFC_CXL policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_SFC_CXL note<'Solarflare SFC9100-family CXL Type-2 device support; not needed for NVIDIA platforms'> +CONFIG_DEV_DAX policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX note<'Override debian.master m->y; required built-in for DEV_DAX_CXL=y'> + +CONFIG_DEV_DAX_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_CXL note<'Override debian.master m->y; CXL RAM region DAX access; depends on CXL_BUS+CXL_REGION+DEV_DAX'> + +CONFIG_DEV_DAX_KMEM policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_KMEM note<'Override debian.master m->y; 
map CXL DAX devices as System-RAM'> + CONFIG_ACPI_APEI_EINJ policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_ACPI_APEI_EINJ note<'Required for CONFIG_ACPI_APEI_EINJ_CXL'> From acf188b93967c184dfc83dde5161fb0518ec6f54 Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Sun, 22 Mar 2026 20:28:29 -0500 Subject: [PATCH 143/143] NVIDIA: VR: SAUCE: [Config] Add PCI_CXL annotation for CXL state save/restore Add Ubuntu kernel config annotation for CONFIG_PCI_CXL introduced by the CXL DVSEC and HDM state save/restore series (Srirangan Madhavan). CONFIG_PCI_CXL: Hidden bool in drivers/pci/Kconfig; auto-enabled when CXL_BUS=y. Gates compilation of drivers/pci/cxl.o which saves and restores CXL DVSEC control/range registers and HDM decoder state across PCI resets and link transitions. Signed-off-by: Jiandi An --- debian.nvidia-6.17/config/annotations | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index c5aeb93d14ca9..9340e319beca8 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -249,6 +249,8 @@ CONFIG_GENERIC_CPU_CACHE_MAINTENANCE note<'Selected by arm64 via arch CONFIG_CACHEMAINT_FOR_HOTPLUG policy<{'amd64': '-', 'arm64': 'n'}> CONFIG_CACHEMAINT_FOR_HOTPLUG note<'Optional HiSilicon HHA cache maintenance driver; depends on GENERIC_CPU_CACHE_MAINTENANCE; not needed for NVIDIA platforms'> +CONFIG_PCI_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_PCI_CXL note<'Hidden bool; auto-enabled by CXL_BUS; PCI core CXL DVSEC and HDM state save/restore support'> # ---- Annotations without notes ----