From f27952269ddd0c6596d91358114c298282faadab Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Wed, 25 Mar 2026 18:47:58 -0700
Subject: [PATCH 1/2] refine _extract_layer_prefixes to better handle mtp
 modules

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 examples/llm_ptq/example_utils.py | 5 +++++
 examples/llm_ptq/hf_ptq.py        | 9 +++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 7496229a78..4ed2c1e106 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -364,6 +364,11 @@ def _extract_layer_prefixes(keys):
         mtp_layer_prefixes = set()
         for key in keys:
             parts = key.split(".")
+            # Capture the top-level MTP module prefix (e.g., "mtp" from "mtp.fc.weight")
+            # so that non-layer MTP weights like mtp.fc, mtp.norm are also excluded
+            if parts:
+                mtp_layer_prefixes.add(parts[0])
+            # Also capture specific layer prefixes (e.g., "mtp.layers.0")
             for i, part in enumerate(parts):
                 if part == "layers" and i + 1 < len(parts) and parts[i + 1].isdigit():
                     prefix = ".".join(parts[: i + 2])
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index b81dc60c01..6c8a7597fe 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -979,8 +979,13 @@ def quantize_main(
         if mtp_layer_prefixes:
             quant_cfg = copy.deepcopy(quant_cfg)
             for prefix in mtp_layer_prefixes:
-                # Add exclusion pattern for this MTP layer (e.g., "*layers.92*")
-                pattern = f"*{prefix.split('.')[-2]}.{prefix.split('.')[-1]}*"
+                parts = prefix.split(".")
+                if len(parts) >= 2:
+                    # Multi-component prefix (e.g., "mtp.layers.0" -> "*layers.0*")
+                    pattern = f"*{parts[-2]}.{parts[-1]}*"
+                else:
+                    # Single-component prefix (e.g., "mtp" -> "*mtp*")
+                    pattern = f"*{prefix}*"
                 quant_cfg["quant_cfg"][pattern] = {"enable": False}
                 print(f"Excluding MTP layer from quantization: {pattern}")
 

From 3d9d337c658cbdcd433cd6d3e1510fdc358f40c0 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Mon, 6 Apr 2026 11:39:43 -0700
Subject: [PATCH 2/2] use full prefix for MTP exclusion patterns

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 examples/llm_ptq/hf_ptq.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 6c8a7597fe..a1f7d0808a 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -979,13 +979,7 @@ def quantize_main(
         if mtp_layer_prefixes:
             quant_cfg = copy.deepcopy(quant_cfg)
             for prefix in mtp_layer_prefixes:
-                parts = prefix.split(".")
-                if len(parts) >= 2:
-                    # Multi-component prefix (e.g., "mtp.layers.0" -> "*layers.0*")
-                    pattern = f"*{parts[-2]}.{parts[-1]}*"
-                else:
-                    # Single-component prefix (e.g., "mtp" -> "*mtp*")
-                    pattern = f"*{prefix}*"
+                pattern = f"*{prefix}*"
                 quant_cfg["quant_cfg"][pattern] = {"enable": False}
                 print(f"Excluding MTP layer from quantization: {pattern}")