diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index e2d421af4d647..6e1ff1ef7aedc 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -29,6 +29,8 @@ jobs: "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU", "JobId=AndroidBinarySizeCheckJob_MinimalBaseline-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" ] + env: + CCACHE_DIR: ~/.cache/ccache # explicitly set to prevent any fallback to `~/.ccache` steps: - name: Checkout repository uses: actions/checkout@v6 @@ -41,7 +43,7 @@ jobs: ndk-version: 28.0.13004108 - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -71,6 +73,7 @@ jobs: shell: python working-directory: ${{ github.workspace }} + # FUTURE WORK: ccache, vcpkg cache - name: 1a. 
Build onnxruntime run: | set -e -x @@ -119,6 +122,8 @@ jobs: "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU", "JobId=android_nnapi_ep-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" ] + env: + CCACHE_DIR: ~/.cache/ccache # explicitly set to prevent any fallback to `~/.ccache` steps: - uses: actions/checkout@v6 @@ -129,9 +134,10 @@ jobs: java-version: '17' architecture: x64 - - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' @@ -144,6 +150,23 @@ jobs: with: ndk-version: 28.0.13004108 + - name: Setup CCache + uses: actions/cache@v4 + with: + # Fully qualify by workflow. `actions/cache` does not isolate by workflow, unlike ADO cache actions. 
+ key: ccache | android.yml | android_nnapi_ep + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | android.yml | android_nnapi_ep + path: ~/.cache/vcpkg + + - name: CCache reset stats + run: ccache --zero-stats + shell: bash + - name: NNAPI EP, Build, Test on Android Emulator run: >- python3 tools/ci_build/build.py @@ -155,7 +178,10 @@ jobs: --android_abi=x86_64 --android_api=29 --skip_submodule_sync - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --parallel + --use_cache + --use_vcpkg + --use_vcpkg_ms_internal_asset_cache --use_nnapi --build_shared_lib --cmake_generator=Ninja @@ -163,13 +189,16 @@ jobs: --update --build --test shell: bash - - name: Build Minimal ORT with NNAPI and run tests run: tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh "$(pwd)" shell: bash + - name: CCache stats + run: ccache --show-stats -vv + shell: bash + - name: Install psutil for emulator shutdown by run_android_emulator.py if: always() run: python3 -m pip install psutil @@ -198,7 +227,8 @@ jobs: "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU", "JobId=android_cpu_ep-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" ] - + env: + CCACHE_DIR: ~/.cache/ccache # explicitly set to prevent any fallback to `~/.ccache` steps: - uses: actions/checkout@v6 @@ -209,11 +239,39 @@ jobs: java-version: '17' architecture: x64 + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee + vcpkg-version: '2025.08.27' + vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' + cmake-version: '3.31.6' + cmake-hash: 
'42395e20b10a8e9ef3e33014f9a4eed08d46ab952e02d2c1bbc8f6133eca0d7719fb75680f9bbff6552f20fcd1b73d86860f7f39388d631f98fb6f622b37cf04' + add-cmake-to-path: 'true' + disable-terrapin: 'true' + - name: Setup Android NDK uses: ./.github/actions/setup-android-ndk with: ndk-version: 28.0.13004108 + - name: Setup CCache + uses: actions/cache@v4 + with: + # Fully qualify by workflow. `actions/cache` does not isolate by workflow, unlike ADO cache actions. + key: ccache | android.yml | android_cpu_ep + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | android.yml | android_cpu_ep + path: ~/.cache/vcpkg + + - name: CCache reset stats + run: ccache --zero-stats + shell: bash + - name: CPU EP, Build and Test run: >- python3 tools/ci_build/build.py @@ -225,12 +283,19 @@ jobs: --android_abi=x86_64 --android_api=30 --skip_submodule_sync - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --parallel + --use_cache + --use_vcpkg + --use_vcpkg_ms_internal_asset_cache --cmake_generator=Ninja --build_java --update --build --test shell: bash + - name: CCache stats + run: ccache --show-stats -vv + shell: bash + - name: Install psutil for emulator shutdown by run_android_emulator.py if: always() run: python3 -m pip install psutil diff --git a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml index 4288442720493..c53c61242e6bc 100644 --- a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml +++ b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml @@ -4,6 +4,9 @@ description: "This is a reusable workflow for Linux WASM CI pipelines to build a on: workflow_call: inputs: + job_name: # workflow-scope unique key + required: true + type: string build_config: required: true type: string @@ -43,6 +46,7 @@ jobs: buildArch: x64 common_build_args: >- --parallel + --use_cache ${{ inputs.use_vcpkg == true && '--use_vcpkg 
--use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync @@ -77,8 +81,23 @@ jobs: - name: Install python dependencies run: python -m pip install flatbuffers - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 + with: + # Fully qualify by workflow. `actions/cache` does not isolate by workflow, unlike ADO cache actions. + key: ccache | web.yml | ${{ inputs.job_name }} + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | web.yml | ${{ inputs.job_name }} + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml index 948b28b276edb..92840f46bc1cf 100644 --- a/.github/workflows/linux_cuda_ci.yml +++ b/.github/workflows/linux_cuda_ci.yml @@ -52,7 +52,7 @@ jobs: - name: Checkout code uses: actions/checkout@v6 - - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + - uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -95,7 +95,7 @@ jobs: # So build.py --build_dir build/Release inside the container correctly finds the artifacts. 
- name: Test ONNX Runtime id: test_step - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Release diff --git a/.github/workflows/linux_minimal_build.yml b/.github/workflows/linux_minimal_build.yml index 4058e7af99070..4705812d78548 100644 --- a/.github/workflows/linux_minimal_build.yml +++ b/.github/workflows/linux_minimal_build.yml @@ -40,8 +40,24 @@ jobs: with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 with: + # Fully qualify by workflow. `actions/cache` does not isolate by workflow, unlike ADO cache actions. + key: ccache | linux_minimal_build.yml | build_full_ort + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + # ostensibly should be able to use the same cache for most of these, but in practice the hash does not match. 
+ key: vcpkg-cache | linux_minimal_build.yml | build_full_ort + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' @@ -50,7 +66,7 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 - name: Upload Test Data Artifact uses: actions/upload-artifact@v6 @@ -80,8 +96,20 @@ jobs: with: node-version: 20 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_exceptions_disabled + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_exceptions_disabled + path: ~/.cache/vcpkg + - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -92,28 +120,32 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Run Build 2 (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: 
${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug # From original --config Debug mode: 'update' # CMake configure step extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --skip_tests --minimal_build --disable_exceptions --enable_training_ops - name: Run Build 2 (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug # From original --config Debug mode: 'build' # Actual build step extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --skip_tests --minimal_build --disable_exceptions @@ -141,8 +173,22 @@ jobs: with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_custom_ops + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_custom_ops + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' @@ -151,7 +197,7 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: 
microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: reduced-ops-config-file: required_ops.ort_models.config enable-custom-ops: 'true' @@ -179,16 +225,31 @@ jobs: with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_type_reduction + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_type_reduction + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' cmake-hash: '42395e20b10a8e9ef3e33014f9a4eed08d46ab952e02d2c1bbc8f6133eca0d7719fb75680f9bbff6552f20fcd1b73d86860f7f39388d631f98fb6f622b37cf04' add-cmake-to-path: 'true' disable-terrapin: 'true' + - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: reduced-ops-config-file: required_ops_and_types.ort_models.config enable-type-reduction: 'true' @@ -215,8 +276,22 @@ jobs: with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 with: + key: ccache | linux_minimal_build.yml | build_minimal_globally_allowed_types + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + 
uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_globally_allowed_types + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.08.27' vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' @@ -225,7 +300,7 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: globally_allowed_types: 'bool,float,int8_t,uint8_t' enable-type-reduction: 'true' @@ -253,8 +328,20 @@ jobs: with: node-version: 20 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_extended_minimal + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal + path: ~/.cache/vcpkg + - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -266,7 +353,7 @@ jobs: - name: Run Build 5 (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # 
v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug @@ -274,11 +361,13 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_shared_lib + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build extended - name: Run Build 5 (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug @@ -286,10 +375,13 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_shared_lib + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build extended + - name: Run Build 5 (Test) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug @@ -297,7 +389,9 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_shared_lib + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build extended # Job 6a: Regular build with python and all optional features disabled. 
@@ -319,7 +413,7 @@ jobs: submodules: false - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -335,8 +429,20 @@ jobs: mkdir -p ${{ runner.temp }}/.test_data touch ${{ runner.temp }}/.test_data/include_no_operators.config + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_regular_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_regular_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6a (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel @@ -344,14 +450,16 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_wheel + --parallel --use_binskim_compliant_compile_flags + --use_cache --disable_ml_ops --disable_types string sparsetensor float4 float8 optional --include_ops_by_config /onnxruntime_src/build/.test_data/include_no_operators.config --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6a (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel @@ -359,15 +467,16 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_wheel + --parallel 
--use_binskim_compliant_compile_flags + --use_cache --disable_ml_ops --disable_types string sparsetensor float4 float8 optional --include_ops_by_config /onnxruntime_src/build/.test_data/include_no_operators.config --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - - name: Run Build 6a (Test) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel @@ -375,7 +484,9 @@ jobs: extra_build_flags: >- --cmake_generator Ninja --build_wheel + --parallel --use_binskim_compliant_compile_flags + --use_cache --disable_ml_ops --disable_types string sparsetensor float4 float8 optional --include_ops_by_config /onnxruntime_src/build/.test_data/include_no_operators.config @@ -406,7 +517,7 @@ jobs: touch ${{ runner.temp }}/.test_data/include_no_operators.config - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -416,15 +527,29 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6b (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 
with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel # From original --config MinSizeRel mode: 'update' extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build --disable_exceptions --disable_ml_ops @@ -435,14 +560,16 @@ jobs: --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6b (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel # From original --config MinSizeRel mode: 'build' extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build --disable_exceptions --disable_ml_ops @@ -477,7 +604,7 @@ jobs: touch ${{ runner.temp }}/.test_data/include_no_operators.config - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -493,15 +620,29 @@ jobs: mkdir -p ${{ runner.temp }}/.test_data touch ${{ runner.temp }}/.test_data/include_no_operators.config + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_extended_minimal_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6c (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: 
microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel # From original --config MinSizeRel mode: 'update' extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build extended --disable_exceptions --disable_ml_ops @@ -512,14 +653,16 @@ jobs: --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6c (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel # From original --config MinSizeRel mode: 'build' extra_build_flags: >- --cmake_generator Ninja + --parallel --use_binskim_compliant_compile_flags + --use_cache --minimal_build extended --disable_exceptions --disable_ml_ops @@ -558,7 +701,7 @@ jobs: path: ${{ runner.temp }}/.test_data/ - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -574,6 +717,18 @@ jobs: ndk-version: 28.0.13004108 # Use default android-sdk-root if not specified + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_extended_minimal_android + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal_android + path: ~/.cache/vcpkg + - name: Run Build 7 (Using docker run) shell: bash run: 
| @@ -593,7 +748,11 @@ jobs: export ANDROID_HOME=/usr/local/lib/android/sdk fi + # mount `~/.cache` inside docker. assume `onnxruntimedev` is the container user (should match the docker-file specified earlier) + mkdir -p ~/.cache/vcpkg + docker run --rm \ + --volume ~/.cache:/home/onnxruntimedev/.cache \ --volume ${{ env.BUILD_SOURCES_DIRECTORY }}:/onnxruntime_src \ --volume ${{ runner.temp }}:/build \ --volume $ANDROID_HOME:/android_home \ @@ -607,7 +766,9 @@ jobs: --cmake_generator Ninja \ --config MinSizeRel \ --skip_submodule_sync \ - --parallel --use_binskim_compliant_compile_flags \ + --parallel \ + --use_binskim_compliant_compile_flags \ + --use_cache \ --android \ --android_sdk_path /android_home \ --android_ndk_path /ndk_home \ diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml index e7e17eff75d7e..dd53a9f88ff52 100644 --- a/.github/workflows/linux_tensorrt_ci.yml +++ b/.github/workflows/linux_tensorrt_ci.yml @@ -54,7 +54,7 @@ jobs: # --- Build the Docker image needed for testing --- - name: Build Docker Image for Testing - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -97,7 +97,7 @@ jobs: # So build.py --build_dir build/Release inside the container correctly finds the artifacts. 
- name: Test ONNX Runtime id: test_step - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Release diff --git a/.github/workflows/reusable_linux_build.yml b/.github/workflows/reusable_linux_build.yml index 9d2700683bedb..c5de2fcbfc762 100644 --- a/.github/workflows/reusable_linux_build.yml +++ b/.github/workflows/reusable_linux_build.yml @@ -89,7 +89,7 @@ jobs: python-version: ${{ inputs.python_version }} - name: Build Docker Image (${{ inputs.architecture }} / ${{ inputs.build_config }}) - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/${{ inputs.dockerfile_path }} @@ -100,41 +100,57 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + # FUTURE WORK: Re-enable once total cache size limit has been increased. + # We're prioritising getting vcpkg cache on all workflows and pipelines due to + # reliability issues when fetching pkg contents from upstream. + - name: Setup CCache + uses: actions/cache@v4 + with: + key: 'ccache | "${{ inputs.job_identifier }}" | "${{ inputs.architecture }}" | "${{ inputs.build_config }}"' + path: ~/.cache/ccache + + # same idea as ccache, but for vcpkg artifacts. ideally we'd use vcpkg's nuget remote cache facility instead. 
+ - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: '"vcpkg-cache" | "${{ inputs.job_identifier }}" | "${{ inputs.architecture }}" | "${{ inputs.build_config }}"' + path: ~/.cache/vcpkg + # ------------- Update Step (CMake Generation) ------------- - name: Generate Build Files (CMake) (${{ inputs.architecture }} / ${{ inputs.build_config }}) id: update_step - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: ${{ inputs.build_config }} mode: 'update' execution_providers: ${{ inputs.execution_providers }} # Pass down EP list - extra_build_flags: ${{ inputs.extra_build_flags }} + extra_build_flags: ${{ inputs.extra_build_flags }} --use_cache python_path_prefix: ${{ inputs.python_path_prefix }} # ------------- Build Step (Compilation) ------------- - name: Build ONNX Runtime (${{ inputs.architecture }} / ${{ inputs.build_config }}) id: build_step - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: ${{ inputs.build_config }} mode: 'build' execution_providers: ${{ inputs.execution_providers }} # Pass down EP list - extra_build_flags: ${{ inputs.extra_build_flags }} + extra_build_flags: ${{ inputs.extra_build_flags }} --use_cache python_path_prefix: ${{ inputs.python_path_prefix }} # ------------- Test Step ------------- - name: Test ONNX Runtime (${{ inputs.architecture }} / ${{ inputs.build_config }}) id: test_step if: inputs.run_tests == true - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: 
microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: ${{ inputs.build_config }} mode: 'test' execution_providers: ${{ inputs.execution_providers }} # Pass down EP list - extra_build_flags: ${{ inputs.extra_build_flags }} + extra_build_flags: ${{ inputs.extra_build_flags }} --use_cache python_path_prefix: ${{ inputs.python_path_prefix }} # ------------- Prepare Artifact Step ------------- diff --git a/.github/workflows/web.yml b/.github/workflows/web.yml index 6ae25ccc0bf3e..e9974fc66de4d 100644 --- a/.github/workflows/web.yml +++ b/.github/workflows/web.yml @@ -38,6 +38,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Debug build_config: Debug extra_build_args: "--enable_wasm_profiling" build_jsep: true @@ -47,6 +48,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Release build_config: Release extra_build_args: "--target onnxruntime_webassembly --skip_tests --disable_rtti" build_jsep: true @@ -56,6 +58,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Release_static_library build_config: Release extra_build_args: "--skip_tests --disable_rtti --build_wasm_static_lib" use_vcpkg: false @@ -68,6 +71,7 @@ jobs: - wasm_Debug uses: ./.github/workflows/windows-web-ci-workflow.yml with: + job_name: web_Debug commit_override: ${{ needs.precheck.outputs.commit_sha }} build_config: Debug @@ -77,5 +81,6 @@ jobs: - wasm_Release uses: ./.github/workflows/windows-web-ci-workflow.yml with: + job_name: web_Release commit_override: ${{ needs.precheck.outputs.commit_sha }} build_config: Release diff --git a/.github/workflows/windows-web-ci-workflow.yml b/.github/workflows/windows-web-ci-workflow.yml index 
266177623e9c5..9b40f8ee1dc17 100644 --- a/.github/workflows/windows-web-ci-workflow.yml +++ b/.github/workflows/windows-web-ci-workflow.yml @@ -4,6 +4,9 @@ description: "Windows Web CI pipeline for building and testing ONNX Runtime Web" on: workflow_call: inputs: + job_name: # workflow-scope unique key + required: true + type: string commit_override: type: string default: "" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 385342479913a..0b70e01d15dbe 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -564,6 +564,14 @@ if(onnxruntime_USE_KLEIDIAI) set(${is_supported_var} FALSE PARENT_SCOPE) return() endif() + + if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND MSVC_VERSION VERSION_LESS 1940) + message(WARNING "KleidiAI requires MSVC compiler version 19.40 or newer, KleidiAI will be disabled in this build.") + + set(${is_supported_var} FALSE PARENT_SCOPE) + return() + endif() + set(${is_supported_var} TRUE PARENT_SCOPE) endfunction() diff --git a/docs/Optimizer_Layering_Annotations.md b/docs/Optimizer_Layering_Annotations.md new file mode 100644 index 0000000000000..a268bd8fbe84f --- /dev/null +++ b/docs/Optimizer_Layering_Annotations.md @@ -0,0 +1,130 @@ +# Optimizer Layering Annotations + +## Overview + +Layering annotations are per-node metadata strings that guide graph partitioning by indicating which execution provider (EP) layer a node belongs to. They are loaded from the ONNX model's `NodeProto` metadata (key `"layer_ann"`) and consumed during the partitioning phase to influence EP assignment. + +## Execution Pipeline + +Graph optimizers run in ordered levels: + +``` +Level 0 (Basic) ─► Level 1 (Extended) ─► Partitioning ─► Level 2+ (Layout, etc.) +``` + +1. **Level 0 and Level 1** optimizers run **before** partitioning. At this point, layering annotations are present on nodes and must be preserved through any graph transformations. +2. **Partitioning** reads the annotations to assign nodes to execution providers. +3. 
After partitioning, `Graph::RemoveAllLayeringAnnotations()` clears all annotations. +4. **Level 2, 3, and 4** optimizers run **after** annotations have been cleared. They do not need to handle annotations. + +**Key rule: Only Level 1 (and Level 0) optimizers need to propagate layering annotations.** + +## Why Propagation Matters + +When an optimizer replaces, fuses, or decomposes nodes, the original annotated node is removed and new nodes are created. If the new nodes do not carry the original annotation, partitioning loses the assignment hint for that subgraph, potentially causing incorrect EP placement. + +## How to Propagate Annotations + +### Preferred: Use the `AddNode` Overload with `annotation_source` + +`Graph::AddNode` provides overloads that accept a `const Node& annotation_source` parameter. The new node automatically inherits the layering annotation from the source node. + +```cpp +// Instead of: +Node& new_node = graph.AddNode(name, op_type, description, inputs, outputs); +// Missing annotation propagation! + +// Use: +Node& new_node = graph.AddNode(name, op_type, description, inputs, outputs, + original_node); // annotation_source +``` + +All standard `AddNode` signatures have a corresponding `annotation_source` variant: + +```cpp +// With const NodeAttributes* +Node& AddNode(name, op_type, description, + gsl::span inputs, + gsl::span outputs, + const Node& annotation_source, + const NodeAttributes* attributes = nullptr, + const std::string& domain = kOnnxDomain); + +// With NodeAttributes&& +Node& AddNode(name, op_type, description, + gsl::span inputs, + gsl::span outputs, + const Node& annotation_source, + NodeAttributes&& attributes, + const std::string& domain = kOnnxDomain); + +// initializer_list variants also available +``` + +### Legacy: `DuplicateNodeAnnotation` + +The utility function `optimizer_utils::DuplicateNodeAnnotation(src, dst)` copies annotations between existing nodes. 
This is still used when the annotation source is conditional (e.g., when the source node pointer may be null). Prefer the `AddNode` overload for unconditional propagation. + +### Automatic Propagation + +`Graph::AddNode(const Node& other)` — the copy overload used for duplicating nodes — automatically copies annotations. No additional action is needed when duplicating a node via this overload. + +## Post-Partitioning: Propagating EP Assignments + +Although Level 2+ optimizers do not deal with layering annotations directly (they have been cleared), they must still propagate **execution provider (EP) assignments**. EP assignments are the downstream result of the annotation-driven partitioning step. After partitioning, each node carries an EP assignment (e.g., `CUDAExecutionProvider`, `CPUExecutionProvider`) that determines where the node's kernel runs. + +When a Level 2+ optimizer creates new nodes that replace or derive from existing ones, it must copy the EP assignment from the source node: + +```cpp +Node& new_node = graph.AddNode(name, op_type, description, inputs, outputs); +new_node.SetExecutionProviderType(original_node.GetExecutionProviderType()); +``` + +Failing to propagate the EP assignment causes the new node to fall back to the default provider (typically CPU), silently breaking the intended placement and potentially degrading performance or correctness. This requirement predates the layering annotation feature and applies to all optimizers that run after partitioning. + +> **Note:** The `AddNode` overload with `annotation_source` propagates both the layering annotation *and* nothing else — EP assignment is still set separately. Layering annotations and EP assignments serve different stages of the pipeline and are managed independently. + +## When You Do NOT Need to Propagate Annotations + +- **Level 2+ optimizers** — annotations have already been consumed and cleared (but EP assignments must still be propagated, see above). 
+- **Training optimizers** — training runs after partitioning. +- **Optimizers that only remove nodes** (e.g., identity elimination) — no new nodes are created. +- **Optimizers that modify nodes in-place** — the annotation remains on the existing node. + +## Examples + +### Fusion (replacing multiple nodes with one) + +```cpp +// GeluFusion: fusing Div + Erf + Add + Mul + Mul into a single Gelu +Node& gelu_node = graph.AddNode( + graph.GenerateNodeName("Gelu"), + "Gelu", "fused Gelu subgraphs", + {gelu_input}, {gelu_output}, + div_node); // propagate annotation from the root matched node +``` + +### Decomposition (replacing one node with many) + +```cpp +// STFT decomposition: each new node inherits from the original STFT node +auto [reshape_node, reshape_out] = AddNode(graph, "Reshape", ep, inputs, &stft); +auto [conv_node, conv_out] = AddNode(graph, "Conv", ep, conv_inputs, &stft); +auto [concat_node, concat_out] = AddNode(graph, "Concat", ep, concat_inputs, &stft); +``` + +### Conditional source (use DuplicateNodeAnnotation) + +```cpp +Node& q_node = graph.AddNode(...); +if (src_node) { + optimizer_utils::DuplicateNodeAnnotation(*src_node, q_node); +} +``` + +## Checklist for New Level 1 Optimizers + +1. Identify the "source" node whose annotation should propagate (typically the root of the matched pattern). +2. For every `graph.AddNode(...)` call that creates a replacement node, use the `annotation_source` overload. +3. If the source is conditional (may be null), use `optimizer_utils::DuplicateNodeAnnotation` after the `AddNode` call. +4. Test with an annotated model to verify annotations survive the transformation. 
diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h index b072e27816463..7bb5a993d140b 100644 --- a/include/onnxruntime/core/framework/resource_accountant.h +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -45,18 +45,31 @@ class IResourceAccountant { virtual ResourceCount GetConsumedAmount() const = 0; virtual void AddConsumedAmount(const ResourceCount& amount) = 0; virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0; - virtual ResourceCount ComputeResourceCount(const Node& node) const = 0; + virtual ResourceCount ComputeResourceCount(const Node& node) = 0; std::optional GetThreshold() const { return threshold_; } + void SetThreshold(const ResourceCount& threshold) { + threshold_ = threshold; + } + void SetStopAssignment() noexcept { stop_assignment_ = true; } bool IsStopIssued() const noexcept { return stop_assignment_; } + // Called before each GetCapability pass to discard pending weight tracking + // from a previous (discarded) pass. Default no-op for stats-based accountants. + virtual void ResetPendingWeights() {} + + // Called when a node's cost is committed (AccountForNode/AccountForAllNodes). + // Moves the node's pending weights into the committed set so they persist + // across GetCapability passes. Default no-op for stats-based accountants. 
+ virtual void CommitWeightsForNode(size_t /*node_index*/) {} + static std::string MakeUniqueNodeName(const Node& node); private: @@ -114,11 +127,6 @@ class NodeStatsRecorder { void DumpStats(const std::filesystem::path& model_path) const; - [[nodiscard]] static Status CreateAccountants( - const ConfigOptions& config_options, - const std::filesystem::path& model_path, - std::optional& acc_map); - private: void DumpStats(std::ostream& os) const; @@ -126,4 +134,9 @@ class NodeStatsRecorder { std::unique_ptr impl_; }; +Status CreateAccountants( + const ConfigOptions& config_options, + const std::filesystem::path& model_path, + std::optional& acc_map); + } // namespace onnxruntime diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 58473a79ddaa6..c5351bc5dfef7 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -174,7 +174,14 @@ class Node { */ void SetSinceVersion(int since_version) noexcept { since_version_ = since_version; } + void SetLayeringAnnotation(std::string annotation) { layering_annotation_ = std::move(annotation); } + + const std::string& GetLayeringAnnotation() const noexcept { return layering_annotation_; } + + const Graph* GetContainingGraph() const noexcept { return graph_; } + #if !defined(ORT_MINIMAL_BUILD) + /** Gets the Node's OpSchema. @remarks The graph containing this node must be resolved, otherwise nullptr will be returned. */ const ONNX_NAMESPACE::OpSchema* Op() const noexcept { return op_; } @@ -256,6 +263,13 @@ class Node { #endif // !defined(ORT_MINIMAL_BUILD) #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + + // Make sure that the annotation does not occupy memory after partitioning is done. + void ClearLayeringAnnotation() { + std::string t; + layering_annotation_.swap(t); + } + /** Gets a modifiable count of arguments for each of the Node's explicit inputs. 
@todo This should be removed in favor of a method that updates the input args and the count. Currently these operations are separate which is not a good setup. */ @@ -685,6 +699,8 @@ class Node { // Graph instances for subgraphs that are owned by this Node std::vector> subgraphs_; + std::string layering_annotation_; + // Can be saved? The node cannot be saved anymore if removable attributes have been cleared. bool can_be_saved_; }; @@ -1044,6 +1060,41 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi gsl::span output_args, NodeAttributes&& attributes, const std::string& domain = kOnnxDomain); + + /** Add a Node to this Graph, propagating the layering annotation from an existing node. + This is the preferred way to create new nodes in Level 1 (pre-partitioning) graph optimizers. + The new node automatically inherits the layering annotation from @p annotation_source, which + ensures correct layer-based partitioning when annotations are present. + @param name The Node name. Must be unique in this Graph. + @param op_type The operator type. e.g. ONNX operator name. + @param description Arbitrary description of the Node. + @param input_args The explicit inputs to this Node. + @param output_args The outputs from this Node. + @param annotation_source The node from which to inherit the layering annotation. + @param attributes Optional NodeAttributes to add. + @param domain The domain for the op_type. + @returns Reference to the new Node. + @remarks Use this overload in Level 1 optimizers that create nodes replacing or derived from + existing annotated nodes. See docs/Optimizer_Layering_Annotations.md for details. 
+ */ + Node& AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + gsl::span input_args, + gsl::span output_args, + const Node& annotation_source, + const NodeAttributes* attributes = nullptr, + const std::string& domain = kOnnxDomain); + + Node& AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + gsl::span input_args, + gsl::span output_args, + const Node& annotation_source, + NodeAttributes&& attributes, + const std::string& domain = kOnnxDomain); + Node& AddNode(const std::string& name, const std::string& op_type, const std::string& description, @@ -1057,6 +1108,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi attributes, domain); } + Node& AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + std::initializer_list input_args, + std::initializer_list output_args, + const Node& annotation_source, + const NodeAttributes* attributes = nullptr, + const std::string& domain = kOnnxDomain) { + return AddNode(name, op_type, description, + AsSpan(input_args), + AsSpan(output_args), + annotation_source, + attributes, domain); + } + Node& AddNode(const std::string& name, const std::string& op_type, const std::string& description, @@ -1070,16 +1136,46 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi attributes, domain); } + Node& AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + gsl::span input_args, + std::initializer_list output_args, + const Node& annotation_source, + const NodeAttributes* attributes = nullptr, + const std::string& domain = kOnnxDomain) { + return AddNode(name, op_type, description, + input_args, + AsSpan(output_args), + annotation_source, + attributes, domain); + } + + Node& AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + std::initializer_list 
input_args, + gsl::span output_args, + const NodeAttributes* attributes = nullptr, + const std::string& domain = kOnnxDomain) { + return AddNode(name, op_type, description, + AsSpan(input_args), + output_args, + attributes, domain); + } + Node& AddNode(const std::string& name, const std::string& op_type, const std::string& description, std::initializer_list input_args, gsl::span output_args, + const Node& annotation_source, const NodeAttributes* attributes = nullptr, const std::string& domain = kOnnxDomain) { return AddNode(name, op_type, description, AsSpan(input_args), output_args, + annotation_source, attributes, domain); } @@ -1322,10 +1418,12 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi The Graph needs to be Resolve()d after this call. @param func_to_inline + @param parent_annotation. Annotation inherited from the parent node that is being inlined. @returns Status indicating success or providing an error message. */ - Status InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline); + Status InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline, + const std::string& parent_annotation); /** Mark a NodeArg name as coming from the outer scope when programmatically constructing a Graph that will be used as a GraphProto attribute in another Node. @@ -1569,6 +1667,11 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // compiled model during partitioning, leaving them unused in the ORT Graph. To allow the memory to be freed // we need to manually run the cleanup that would usually happen as part of Graph::Resolve. Status RemovedUnusedInitializersOrtFormat(); + + // This examines all the nodes and removes any annotations that are only used for layering. + // This potentially saves memory. 
+ Status RemoveAllLayeringAnnotations(); + #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // This friendship relationship should only be used to call Graph::Graph and diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h index 8ef4fdb66e1e6..54e878761ba87 100644 --- a/include/onnxruntime/core/graph/indexed_sub_graph.h +++ b/include/onnxruntime/core/graph/indexed_sub_graph.h @@ -86,18 +86,32 @@ struct IndexedSubGraph { // Should call IsAccountingEnabled() first // Takes the previously computed ResourceCount for the node - // (usually during GetCapabiilty()) + // (usually during GetCapability()) // if present and adds it to the consumed amount void AccountForNode(size_t cost_index) const { assert(cost_index < nodes_costs.size()); resource_accountant->AddConsumedAmount(nodes_costs[cost_index]); + resource_accountant->CommitWeightsForNode(nodes[cost_index]); } - // This computes and accounts for the resource cost for the node that just - // been fused from other nodes, and the EP did not had a chance to compute the costs. - void ComputeAndAccountForNode(const Node& node) const { + // Accounts for all constituent nodes by summing their pre-stored costs. + // Use this when fusing nodes into a single node so the total cost + // reflects what was computed during GetCapability() (with correct + // cross-node weight deduplication already applied). + void AccountForAllNodes() const { assert(resource_accountant != nullptr); - resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node)); + for (size_t i = 0; i < nodes_costs.size(); ++i) { + resource_accountant->AddConsumedAmount(nodes_costs[i]); + resource_accountant->CommitWeightsForNode(nodes[i]); + } + } + + // Accounts for a node given its index and a pre-computed resource cost. + // Use this when the cost was computed externally (e.g. for a fused node). 
+ void AccountForNode(NodeIndex node_index, const ResourceCount& resource_count) const { + assert(resource_accountant != nullptr); + resource_accountant->AddConsumedAmount(resource_count); + resource_accountant->CommitWeightsForNode(node_index); } void SetAccountant(IResourceAccountant* res_accountant) { diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index a9d9ac8323b16..9941224258506 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -325,13 +325,33 @@ static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "sessi /// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats" /// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. `limit` is optional and when absent /// the provider may attempt to figure out the memory available automatically. +/// The setting with no pre-recorded stats is expected to look like: "limit > 0,". +/// In this case, the EP will calculate memory using the initializers referenced by the node. +/// This enables an ad-hoc and flexible scenarios with no pre-recorded stats, but may be less accurate. /// The setting with no limit is expected to look like: ",file name for collected stats" -/// The EP will place nodes on device "file name" : +/// Finally a setting with both limit and pre-recorded stats absent can contain a single comma: ",". +/// The EP will attempt to place nodes on device (currently only CUDA is supported) : /// this file is expected to be found at the same folder with the model. 
The file contains /// pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enforce (see above) static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings = "session.resource_cuda_partitioning_settings"; +/// +/// This is a setting that contains string annotations or annotation prefixes to be matched +/// against individual nodes metadata entry 'layer_ann' to guide layer assignment during partitioning. +/// The value is a semicolon separated list of strings or string prefixes per device. +/// Format: device1(annotation1, annotation2, ...); device2(annotation1, =annotation3, ...);... +/// Where: +/// - device1, device2, ... are the recognized device names to be matched against EPs configured in +/// the given session. +/// - annotation1, annotation2, ... are annotation prefixes to be matched against node annotations. Any +/// node annotation that starts with one of these prefixes will be matched. +/// - =annotation3 indicates an exact match for annotation3. Only node annotations that are exactly +/// equal to 'annotation3' will be matched. +/// TODO: add a list of recognized devices here. +/// +static const char* const kOrtSessionOptionsLayerAssignmentSettings = "session.layer_assignment_settings"; + // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. 
(default) diff --git a/js/common/package-lock.json b/js/common/package-lock.json index fa3d42faffb52..dc125288eaee5 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -265,12 +265,13 @@ } }, "node_modules/minimatch": { - "version": "9.0.3", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", - "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, + "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" }, "engines": { "node": ">=16 || 14 >=14.17" @@ -639,12 +640,12 @@ } }, "minimatch": { - "version": "9.0.3", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", - "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "requires": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" } }, "path-type": { diff --git a/js/node/package-lock.json b/js/node/package-lock.json index 8b8582d06e779..d94de095555b3 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -16,7 +16,7 @@ ], "dependencies": { "adm-zip": "^0.5.16", - "global-agent": "^3.0.0", + "global-agent": "^4.1.3", "onnxruntime-common": "file:../common" }, "devDependencies": { @@ -158,12 +158,6 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, - "node_modules/boolean": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz", - "integrity": 
"sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==", - "deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info." - }, "node_modules/chownr": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", @@ -258,6 +252,7 @@ "version": "1.1.4", "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", + "license": "MIT", "dependencies": { "es-define-property": "^1.0.0", "es-errors": "^1.3.0", @@ -274,6 +269,7 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", + "license": "MIT", "dependencies": { "define-data-property": "^1.0.1", "has-property-descriptors": "^1.0.0", @@ -286,11 +282,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/detect-node": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz", - "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==" - }, "node_modules/emoji-regex": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", @@ -310,6 +301,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", "engines": { "node": ">= 0.4" } @@ -318,15 +310,11 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + 
"license": "MIT", "engines": { "node": ">= 0.4" } }, - "node_modules/es6-error": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz", - "integrity": "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==" - }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -340,6 +328,7 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "license": "MIT", "engines": { "node": ">=10" }, @@ -377,16 +366,15 @@ } }, "node_modules/global-agent": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz", - "integrity": "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==", + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-4.1.3.tgz", + "integrity": "sha512-KUJEViiuFT3I97t+GYMikLPJS2Lfo/S2F+DQuBWzuzaMPnvt5yyZePzArx36fBzpGTxZjIpDbXLeySLgh+k76g==", + "license": "BSD-3-Clause", "dependencies": { - "boolean": "^3.0.1", - "es6-error": "^4.1.1", - "matcher": "^3.0.0", - "roarr": "^2.15.3", - "semver": "^7.3.2", - "serialize-error": "^7.0.1" + "globalthis": "^1.0.2", + "matcher": "^4.0.0", + "semver": "^7.3.5", + "serialize-error": "^8.1.0" }, "engines": { "node": ">=10.0" @@ -396,6 +384,7 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz", "integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==", + "license": "MIT", "dependencies": { "define-properties": "^1.2.1", "gopd": "^1.0.1" @@ -411,6 +400,7 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", "integrity": 
"sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", "engines": { "node": ">= 0.4" }, @@ -428,6 +418,7 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", + "license": "MIT", "dependencies": { "es-define-property": "^1.0.0" }, @@ -471,11 +462,6 @@ "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==", "dev": true }, - "node_modules/json-stringify-safe": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==" - }, "node_modules/jsonc": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/jsonc/-/jsonc-2.0.0.tgz", @@ -512,14 +498,18 @@ "dev": true }, "node_modules/matcher": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", - "integrity": "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/matcher/-/matcher-4.0.0.tgz", + "integrity": "sha512-S6x5wmcDmsDRRU/c2dkccDwQPXoFczc5+HpQ2lON8pnvHlnvHAHj5WlLVvw6n6vNyHuVugYrFohYxbS+pvFpKQ==", + "license": "MIT", "dependencies": { "escape-string-regexp": "^4.0.0" }, "engines": { "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/minimist": { @@ -586,6 +576,7 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", + "license": "MIT", "engines": { "node": ">= 0.4" } @@ -664,22 +655,6 
@@ "node": ">=0.10.0" } }, - "node_modules/roarr": { - "version": "2.15.4", - "resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz", - "integrity": "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==", - "dependencies": { - "boolean": "^3.0.1", - "detect-node": "^2.0.4", - "globalthis": "^1.0.1", - "json-stringify-safe": "^5.0.1", - "semver-compare": "^1.0.0", - "sprintf-js": "^1.1.2" - }, - "engines": { - "node": ">=8.0" - } - }, "node_modules/semver": { "version": "7.7.3", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", @@ -691,17 +666,13 @@ "node": ">=10" } }, - "node_modules/semver-compare": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==" - }, "node_modules/serialize-error": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-7.0.1.tgz", - "integrity": "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-8.1.0.tgz", + "integrity": "sha512-3NnuWfM6vBYoy5gZFvHiYsVbafvI9vZv/+jlIigFn4oP4zjNPK3LhcY0xSCgeb1a5L8jO71Mit9LlNoi2UfDDQ==", + "license": "MIT", "dependencies": { - "type-fest": "^0.13.1" + "type-fest": "^0.20.2" }, "engines": { "node": ">=10" @@ -710,11 +681,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/sprintf-js": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", - "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==" - }, "node_modules/string-width": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", @@ -779,9 +745,10 @@ } }, 
"node_modules/type-fest": { - "version": "0.13.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", - "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "license": "(MIT OR CC0-1.0)", "engines": { "node": ">=10" }, @@ -988,11 +955,6 @@ "color-convert": "^2.0.1" } }, - "boolean": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz", - "integrity": "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==" - }, "chownr": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", @@ -1077,11 +1039,6 @@ "object-keys": "^1.1.1" } }, - "detect-node": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz", - "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==" - }, "emoji-regex": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", @@ -1107,11 +1064,6 @@ "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==" }, - "es6-error": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz", - "integrity": "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==" - }, "escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -1147,16 +1099,14 @@ "dev": true }, "global-agent": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz", - "integrity": "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==", + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-4.1.3.tgz", + "integrity": "sha512-KUJEViiuFT3I97t+GYMikLPJS2Lfo/S2F+DQuBWzuzaMPnvt5yyZePzArx36fBzpGTxZjIpDbXLeySLgh+k76g==", "requires": { - "boolean": "^3.0.1", - "es6-error": "^4.1.1", - "matcher": "^3.0.0", - "roarr": "^2.15.3", - "semver": "^7.3.2", - "serialize-error": "^7.0.1" + "globalthis": "^1.0.2", + "matcher": "^4.0.0", + "semver": "^7.3.5", + "serialize-error": "^8.1.0" } }, "globalthis": { @@ -1217,11 +1167,6 @@ "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==", "dev": true }, - "json-stringify-safe": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==" - }, "jsonc": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/jsonc/-/jsonc-2.0.0.tgz", @@ -1253,9 +1198,9 @@ "dev": true }, "matcher": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", - "integrity": "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/matcher/-/matcher-4.0.0.tgz", + "integrity": "sha512-S6x5wmcDmsDRRU/c2dkccDwQPXoFczc5+HpQ2lON8pnvHlnvHAHj5WlLVvw6n6vNyHuVugYrFohYxbS+pvFpKQ==", "requires": { "escape-string-regexp": "^4.0.0" } @@ -1376,42 +1321,19 @@ "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", "dev": true }, - "roarr": { - "version": "2.15.4", - "resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz", - "integrity": 
"sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==", - "requires": { - "boolean": "^3.0.1", - "detect-node": "^2.0.4", - "globalthis": "^1.0.1", - "json-stringify-safe": "^5.0.1", - "semver-compare": "^1.0.0", - "sprintf-js": "^1.1.2" - } - }, "semver": { "version": "7.7.3", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==" }, - "semver-compare": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==" - }, "serialize-error": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-7.0.1.tgz", - "integrity": "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-8.1.0.tgz", + "integrity": "sha512-3NnuWfM6vBYoy5gZFvHiYsVbafvI9vZv/+jlIigFn4oP4zjNPK3LhcY0xSCgeb1a5L8jO71Mit9LlNoi2UfDDQ==", "requires": { - "type-fest": "^0.13.1" + "type-fest": "^0.20.2" } }, - "sprintf-js": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", - "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==" - }, "string-width": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", @@ -1458,9 +1380,9 @@ } }, "type-fest": { - "version": "0.13.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", - "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==" + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", 
+ "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==" }, "universalify": { "version": "2.0.1", diff --git a/js/node/package.json b/js/node/package.json index 4d35ec8c424d5..18c2b2ce9c905 100644 --- a/js/node/package.json +++ b/js/node/package.json @@ -14,7 +14,7 @@ "version": "1.25.0", "dependencies": { "adm-zip": "^0.5.16", - "global-agent": "^3.0.0", + "global-agent": "^4.1.3", "onnxruntime-common": "file:../common" }, "scripts": { diff --git a/js/package-lock.json b/js/package-lock.json index 1ba8fc900bbd8..29d45184920d1 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -4,13 +4,14 @@ "requires": true, "packages": { "": { + "name": "js", "license": "MIT", "devDependencies": { "@eslint/compat": "^1.4.0", "@eslint/eslintrc": "^3.3.1", "@eslint/js": "^9.38.0", "@types/fs-extra": "^11.0.4", - "@types/global-agent": "^2.1.3", + "@types/global-agent": "^3.0.0", "@types/mocha": "^10.0.2", "@types/node": "^20.10.0", "@types/npmlog": "^4.1.4", @@ -27,7 +28,7 @@ "eslint-plugin-prefer-arrow": "^1.2.3", "eslint-plugin-unicorn": "^62.0.0", "fs-extra": "^11.2.0", - "global-agent": "^3.0", + "global-agent": "^4.1.3", "globals": "^16.4.0", "jszip": "^3.10.1", "mocha": "^11.0.1", @@ -979,9 +980,9 @@ } }, "node_modules/@types/global-agent": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/@types/global-agent/-/global-agent-2.1.3.tgz", - "integrity": "sha512-rGtZZcgZcKWuKNTkGBGsqyOQ7Nn2MjXh4+xeZbf+5b5KMUx8H1rTqLRackxos7pUlreszbYjQcop5JvqCnZlLw==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@types/global-agent/-/global-agent-3.0.0.tgz", + "integrity": "sha512-OmvaPJtTaY/wd1hxelLJmf8oKQpmKZdrlfQ+MWL59eKSEHJDDEifIo69248bdJ0yLIN+iMNQ6sKMtnwU6AxajA==", "dev": true, "license": "MIT" }, @@ -1231,13 +1232,13 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { - "version": "9.0.5", - "resolved": 
"https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" }, "engines": { "node": ">=16 || 14 >=14.17" @@ -1337,10 +1338,11 @@ } }, "node_modules/ajv": { - "version": "6.12.6", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", - "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "version": "6.14.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.14.0.tgz", + "integrity": "sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==", "dev": true, + "license": "MIT", "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", @@ -1352,16 +1354,6 @@ "url": "https://github.com/sponsors/epoberezkin" } }, - "node_modules/ansi-colors": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-4.1.3.tgz", - "integrity": "sha512-/6w/C21Pm1A7aZitlI5Ni/2J6FFQN8i1Cvz3kHABAAbw93v/NlvKdVOqz7CCWz/3iv/JplRSEEZ83XION15ovw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, "node_modules/ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -1386,19 +1378,6 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, - "node_modules/anymatch": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", - "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", - "dev": true, - "dependencies": { 
- "normalize-path": "^3.0.0", - "picomatch": "^2.0.4" - }, - "engines": { - "node": ">= 8" - } - }, "node_modules/aproba": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", @@ -1632,23 +1611,6 @@ "baseline-browser-mapping": "dist/cli.js" } }, - "node_modules/binary-extensions": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", - "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/boolean": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz", - "integrity": "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==", - "deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.", - "dev": true, - "license": "MIT" - }, "node_modules/brace-expansion": { "version": "1.1.12", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", @@ -1871,42 +1833,19 @@ "license": "MIT" }, "node_modules/chokidar": { - "version": "3.5.3", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz", - "integrity": "sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", + "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", "dev": true, - "funding": [ - { - "type": "individual", - "url": "https://paulmillr.com/funding/" - } - ], + "license": "MIT", "dependencies": { - "anymatch": "~3.1.2", - "braces": "~3.0.2", - "glob-parent": "~5.1.2", - "is-binary-path": "~2.1.0", - "is-glob": "~4.0.1", - "normalize-path": "~3.0.0", - "readdirp": "~3.6.0" + "readdirp": "^4.0.1" }, "engines": { - "node": 
">= 8.10.0" + "node": ">= 14.16.0" }, - "optionalDependencies": { - "fsevents": "~2.3.2" - } - }, - "node_modules/chokidar/node_modules/glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", - "dev": true, - "dependencies": { - "is-glob": "^4.0.1" - }, - "engines": { - "node": ">= 6" + "funding": { + "url": "https://paulmillr.com/funding/" } }, "node_modules/ci-info": { @@ -1947,14 +1886,18 @@ } }, "node_modules/cliui": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", - "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", "dev": true, + "license": "ISC", "dependencies": { "string-width": "^4.2.0", - "strip-ansi": "^6.0.0", + "strip-ansi": "^6.0.1", "wrap-ansi": "^7.0.0" + }, + "engines": { + "node": ">=12" } }, "node_modules/color-convert": { @@ -2178,17 +2121,10 @@ "integrity": "sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==", "dev": true }, - "node_modules/detect-node": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz", - "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==", - "dev": true, - "license": "MIT" - }, "node_modules/diff": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/diff/-/diff-5.2.0.tgz", - "integrity": "sha512-uIFDxqpRZGZ6ThOk84hEfqWoHx2devRFvpTZcTHur85vImfaxUbTW9Ryh4CpCuDnToOP1CEtXKIgytHBPVff5A==", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/diff/-/diff-7.0.0.tgz", + 
"integrity": "sha512-PJWHUb1RFevKCwaFA9RlG5tCd+FO5iRh9A8HEtkmBH2Li03iJriB6m6JIN4rGz3K3JLawI7/veA1xzRKP6ISBw==", "dev": true, "license": "BSD-3-Clause", "engines": { @@ -2389,13 +2325,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/es6-error": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz", - "integrity": "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==", - "dev": true, - "license": "MIT" - }, "node_modules/esbuild": { "version": "0.25.0", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.0.tgz", @@ -3079,20 +3008,6 @@ "node": ">=14.14" } }, - "node_modules/fsevents": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", - "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", - "dev": true, - "hasInstallScript": true, - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -3168,6 +3083,7 @@ "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", "dev": true, + "license": "ISC", "engines": { "node": "6.* || 8.* || >= 10.*" } @@ -3273,13 +3189,13 @@ } }, "node_modules/glob/node_modules/minimatch": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": 
"sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" }, "engines": { "node": ">=16 || 14 >=14.17" @@ -3289,18 +3205,16 @@ } }, "node_modules/global-agent": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz", - "integrity": "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==", + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-4.1.3.tgz", + "integrity": "sha512-KUJEViiuFT3I97t+GYMikLPJS2Lfo/S2F+DQuBWzuzaMPnvt5yyZePzArx36fBzpGTxZjIpDbXLeySLgh+k76g==", "dev": true, "license": "BSD-3-Clause", "dependencies": { - "boolean": "^3.0.1", - "es6-error": "^4.1.1", - "matcher": "^3.0.0", - "roarr": "^2.15.3", - "semver": "^7.3.2", - "serialize-error": "^7.0.1" + "globalthis": "^1.0.2", + "matcher": "^4.0.0", + "semver": "^7.3.5", + "serialize-error": "^8.1.0" }, "engines": { "node": ">=10.0" @@ -3644,18 +3558,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-binary-path": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", - "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", - "dev": true, - "dependencies": { - "binary-extensions": "^2.0.0" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/is-boolean-object": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", @@ -3871,6 +3773,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-path-inside": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-3.0.3.tgz", + "integrity": 
"sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/is-plain-obj": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-2.1.0.tgz", @@ -4119,13 +4031,6 @@ "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", "dev": true }, - "node_modules/json-stringify-safe": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", - "dev": true, - "license": "ISC" - }, "node_modules/json5": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", @@ -4233,9 +4138,9 @@ } }, "node_modules/matcher": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", - "integrity": "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/matcher/-/matcher-4.0.0.tgz", + "integrity": "sha512-S6x5wmcDmsDRRU/c2dkccDwQPXoFczc5+HpQ2lON8pnvHlnvHAHj5WlLVvw6n6vNyHuVugYrFohYxbS+pvFpKQ==", "dev": true, "license": "MIT", "dependencies": { @@ -4243,6 +4148,9 @@ }, "engines": { "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/math-intrinsics": { @@ -4312,31 +4220,32 @@ } }, "node_modules/mocha": { - "version": "11.0.1", - "resolved": "https://registry.npmjs.org/mocha/-/mocha-11.0.1.tgz", - "integrity": "sha512-+3GkODfsDG71KSCQhc4IekSW+ItCK/kiez1Z28ksWvYhKXV/syxMlerR/sC7whDp7IyreZ4YxceMLdTs5hQE8A==", + "version": "11.7.5", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-11.7.5.tgz", + "integrity": 
"sha512-mTT6RgopEYABzXWFx+GcJ+ZQ32kp4fMf0xvpZIIfSq9Z8lC/++MtcCnQ9t5FP2veYEP95FIYSvW+U9fV4xrlig==", "dev": true, "license": "MIT", "dependencies": { - "ansi-colors": "^4.1.3", "browser-stdout": "^1.3.1", - "chokidar": "^3.5.3", + "chokidar": "^4.0.1", "debug": "^4.3.5", - "diff": "^5.2.0", + "diff": "^7.0.0", "escape-string-regexp": "^4.0.0", "find-up": "^5.0.0", "glob": "^10.4.5", "he": "^1.2.0", + "is-path-inside": "^3.0.3", "js-yaml": "^4.1.0", "log-symbols": "^4.1.0", - "minimatch": "^5.1.6", + "minimatch": "^9.0.5", "ms": "^2.1.3", + "picocolors": "^1.1.1", "serialize-javascript": "^6.0.2", "strip-json-comments": "^3.1.1", "supports-color": "^8.1.1", - "workerpool": "^6.5.1", - "yargs": "^16.2.0", - "yargs-parser": "^20.2.9", + "workerpool": "^9.2.0", + "yargs": "^17.7.2", + "yargs-parser": "^21.1.1", "yargs-unparser": "^2.0.0" }, "bin": { @@ -4358,16 +4267,19 @@ } }, "node_modules/mocha/node_modules/minimatch": { - "version": "5.1.6", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", - "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" }, "engines": { - "node": ">=10" + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" } }, "node_modules/mocha/node_modules/supports-color": { @@ -4405,15 +4317,6 @@ "dev": true, "license": "MIT" }, - "node_modules/normalize-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", - "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", - "dev": true, - "engines": { - 
"node": ">=0.10.0" - } - }, "node_modules/npmlog": { "version": "7.0.1", "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-7.0.1.tgz", @@ -4821,15 +4724,17 @@ } }, "node_modules/readdirp": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", - "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", + "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", "dev": true, - "dependencies": { - "picomatch": "^2.2.1" - }, + "license": "MIT", "engines": { - "node": ">=8.10.0" + "node": ">= 14.18.0" + }, + "funding": { + "type": "individual", + "url": "https://paulmillr.com/funding/" } }, "node_modules/reflect.getprototypeof": { @@ -4903,6 +4808,7 @@ "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", "dev": true, + "license": "MIT", "engines": { "node": ">=0.10.0" } @@ -4961,24 +4867,6 @@ "node": ">=0.10.0" } }, - "node_modules/roarr": { - "version": "2.15.4", - "resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz", - "integrity": "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "boolean": "^3.0.1", - "detect-node": "^2.0.4", - "globalthis": "^1.0.1", - "json-stringify-safe": "^5.0.1", - "semver-compare": "^1.0.0", - "sprintf-js": "^1.1.2" - }, - "engines": { - "node": ">=8.0" - } - }, "node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -5091,21 +4979,14 @@ "node": ">=10" } }, - "node_modules/semver-compare": { - "version": "1.0.0", - "resolved": 
"https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==", - "dev": true, - "license": "MIT" - }, "node_modules/serialize-error": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-7.0.1.tgz", - "integrity": "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-8.1.0.tgz", + "integrity": "sha512-3NnuWfM6vBYoy5gZFvHiYsVbafvI9vZv/+jlIigFn4oP4zjNPK3LhcY0xSCgeb1a5L8jO71Mit9LlNoi2UfDDQ==", "dev": true, "license": "MIT", "dependencies": { - "type-fest": "^0.13.1" + "type-fest": "^0.20.2" }, "engines": { "node": ">=10" @@ -5114,19 +4995,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/serialize-error/node_modules/type-fest": { - "version": "0.13.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", - "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", - "dev": true, - "license": "(MIT OR CC0-1.0)", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/serialize-javascript": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-6.0.2.tgz", @@ -5345,13 +5213,6 @@ "dev": true, "license": "CC0-1.0" }, - "node_modules/sprintf-js": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", - "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==", - "dev": true, - "license": "BSD-3-Clause" - }, "node_modules/stop-iteration-iterator": { "version": "1.1.0", "resolved": 
"https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", @@ -5635,6 +5496,19 @@ "node": ">= 0.8.0" } }, + "node_modules/type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "dev": true, + "license": "(MIT OR CC0-1.0)", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/typed-array-buffer": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", @@ -5928,9 +5802,9 @@ } }, "node_modules/workerpool": { - "version": "6.5.1", - "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-6.5.1.tgz", - "integrity": "sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==", + "version": "9.3.4", + "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-9.3.4.tgz", + "integrity": "sha512-TmPRQYYSAnnDiEB0P/Ytip7bFGvqnSU6I2BcuSw7Hx+JSg/DsUi5ebYfc8GYaSdpuvOcEs6dXxPurOYpe9QFwg==", "dev": true, "license": "Apache-2.0" }, @@ -5939,6 +5813,7 @@ "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", "dev": true, + "license": "MIT", "dependencies": { "ansi-styles": "^4.0.0", "string-width": "^4.1.0", @@ -5975,36 +5850,38 @@ "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", "dev": true, + "license": "ISC", "engines": { "node": ">=10" } }, "node_modules/yargs": { - "version": "16.2.0", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", - "integrity": 
"sha512-D1mvvtDG0L5ft/jGWkLpG1+m0eQxOfaBvTNELraWj22wSVUMWxZUvYgJYcKh6jGGIkJFhH4IZPQhR4TKpc8mBw==", + "version": "17.7.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", "dev": true, + "license": "MIT", "dependencies": { - "cliui": "^7.0.2", + "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", - "string-width": "^4.2.0", + "string-width": "^4.2.3", "y18n": "^5.0.5", - "yargs-parser": "^20.2.2" + "yargs-parser": "^21.1.1" }, "engines": { - "node": ">=10" + "node": ">=12" } }, "node_modules/yargs-parser": { - "version": "20.2.9", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.9.tgz", - "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==", + "version": "21.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", "dev": true, "license": "ISC", "engines": { - "node": ">=10" + "node": ">=12" } }, "node_modules/yargs-unparser": { @@ -6552,9 +6429,9 @@ } }, "@types/global-agent": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/@types/global-agent/-/global-agent-2.1.3.tgz", - "integrity": "sha512-rGtZZcgZcKWuKNTkGBGsqyOQ7Nn2MjXh4+xeZbf+5b5KMUx8H1rTqLRackxos7pUlreszbYjQcop5JvqCnZlLw==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@types/global-agent/-/global-agent-3.0.0.tgz", + "integrity": "sha512-OmvaPJtTaY/wd1hxelLJmf8oKQpmKZdrlfQ+MWL59eKSEHJDDEifIo69248bdJ0yLIN+iMNQ6sKMtnwU6AxajA==", "dev": true }, "@types/json-schema": { @@ -6712,12 +6589,12 @@ } }, "minimatch": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": 
"sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "requires": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" } } } @@ -6775,9 +6652,9 @@ "requires": {} }, "ajv": { - "version": "6.12.6", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", - "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "version": "6.14.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.14.0.tgz", + "integrity": "sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==", "dev": true, "requires": { "fast-deep-equal": "^3.1.1", @@ -6786,12 +6663,6 @@ "uri-js": "^4.2.2" } }, - "ansi-colors": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-4.1.3.tgz", - "integrity": "sha512-/6w/C21Pm1A7aZitlI5Ni/2J6FFQN8i1Cvz3kHABAAbw93v/NlvKdVOqz7CCWz/3iv/JplRSEEZ83XION15ovw==", - "dev": true - }, "ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -6807,16 +6678,6 @@ "color-convert": "^2.0.1" } }, - "anymatch": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", - "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", - "dev": true, - "requires": { - "normalize-path": "^3.0.0", - "picomatch": "^2.0.4" - } - }, "aproba": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", @@ -6972,18 +6833,6 @@ "integrity": "sha512-JMWsdF+O8Orq3EMukbUN1QfbLK9mX2CkUmQBcW2T0s8OmdAUL5LLM/6wFwSrqXzlXB13yhyK9gTKS1rIizOduQ==", "dev": true }, - "binary-extensions": { - "version": "2.2.0", - 
"resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", - "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", - "dev": true - }, - "boolean": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz", - "integrity": "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==", - "dev": true - }, "brace-expansion": { "version": "1.1.12", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", @@ -7111,30 +6960,12 @@ "dev": true }, "chokidar": { - "version": "3.5.3", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz", - "integrity": "sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", + "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", "dev": true, "requires": { - "anymatch": "~3.1.2", - "braces": "~3.0.2", - "fsevents": "~2.3.2", - "glob-parent": "~5.1.2", - "is-binary-path": "~2.1.0", - "is-glob": "~4.0.1", - "normalize-path": "~3.0.0", - "readdirp": "~3.6.0" - }, - "dependencies": { - "glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", - "dev": true, - "requires": { - "is-glob": "^4.0.1" - } - } + "readdirp": "^4.0.1" } }, "ci-info": { @@ -7161,13 +6992,13 @@ } }, "cliui": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", - "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + 
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", "dev": true, "requires": { "string-width": "^4.2.0", - "strip-ansi": "^6.0.0", + "strip-ansi": "^6.0.1", "wrap-ansi": "^7.0.0" } }, @@ -7324,16 +7155,10 @@ "integrity": "sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==", "dev": true }, - "detect-node": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz", - "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==", - "dev": true - }, "diff": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/diff/-/diff-5.2.0.tgz", - "integrity": "sha512-uIFDxqpRZGZ6ThOk84hEfqWoHx2devRFvpTZcTHur85vImfaxUbTW9Ryh4CpCuDnToOP1CEtXKIgytHBPVff5A==", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/diff/-/diff-7.0.0.tgz", + "integrity": "sha512-PJWHUb1RFevKCwaFA9RlG5tCd+FO5iRh9A8HEtkmBH2Li03iJriB6m6JIN4rGz3K3JLawI7/veA1xzRKP6ISBw==", "dev": true }, "dir-compare": { @@ -7490,12 +7315,6 @@ "is-symbol": "^1.0.4" } }, - "es6-error": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz", - "integrity": "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==", - "dev": true - }, "esbuild": { "version": "0.25.0", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.0.tgz", @@ -7987,13 +7806,6 @@ "universalify": "^2.0.0" } }, - "fsevents": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", - "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", - "dev": true, - "optional": true - }, "function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -8111,12 +7923,12 @@ } }, "minimatch": { - "version": 
"9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "requires": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" } } } @@ -8131,17 +7943,15 @@ } }, "global-agent": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz", - "integrity": "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==", + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-4.1.3.tgz", + "integrity": "sha512-KUJEViiuFT3I97t+GYMikLPJS2Lfo/S2F+DQuBWzuzaMPnvt5yyZePzArx36fBzpGTxZjIpDbXLeySLgh+k76g==", "dev": true, "requires": { - "boolean": "^3.0.1", - "es6-error": "^4.1.1", - "matcher": "^3.0.0", - "roarr": "^2.15.3", - "semver": "^7.3.2", - "serialize-error": "^7.0.1" + "globalthis": "^1.0.2", + "matcher": "^4.0.0", + "semver": "^7.3.5", + "serialize-error": "^8.1.0" } }, "globals": { @@ -8346,15 +8156,6 @@ "has-bigints": "^1.0.2" } }, - "is-binary-path": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", - "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", - "dev": true, - "requires": { - "binary-extensions": "^2.0.0" - } - }, "is-boolean-object": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", @@ -8481,6 +8282,12 @@ "has-tostringtag": "^1.0.2" } }, + "is-path-inside": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-3.0.3.tgz", + "integrity": 
"sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ==", + "dev": true + }, "is-plain-obj": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-2.1.0.tgz", @@ -8636,12 +8443,6 @@ "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", "dev": true }, - "json-stringify-safe": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", - "dev": true - }, "json5": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", @@ -8727,9 +8528,9 @@ } }, "matcher": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", - "integrity": "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/matcher/-/matcher-4.0.0.tgz", + "integrity": "sha512-S6x5wmcDmsDRRU/c2dkccDwQPXoFczc5+HpQ2lON8pnvHlnvHAHj5WlLVvw6n6vNyHuVugYrFohYxbS+pvFpKQ==", "dev": true, "requires": { "escape-string-regexp": "^4.0.0" @@ -8779,30 +8580,31 @@ "dev": true }, "mocha": { - "version": "11.0.1", - "resolved": "https://registry.npmjs.org/mocha/-/mocha-11.0.1.tgz", - "integrity": "sha512-+3GkODfsDG71KSCQhc4IekSW+ItCK/kiez1Z28ksWvYhKXV/syxMlerR/sC7whDp7IyreZ4YxceMLdTs5hQE8A==", + "version": "11.7.5", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-11.7.5.tgz", + "integrity": "sha512-mTT6RgopEYABzXWFx+GcJ+ZQ32kp4fMf0xvpZIIfSq9Z8lC/++MtcCnQ9t5FP2veYEP95FIYSvW+U9fV4xrlig==", "dev": true, "requires": { - "ansi-colors": "^4.1.3", "browser-stdout": "^1.3.1", - "chokidar": "^3.5.3", + "chokidar": "^4.0.1", "debug": "^4.3.5", - "diff": "^5.2.0", + "diff": "^7.0.0", "escape-string-regexp": "^4.0.0", "find-up": "^5.0.0", "glob": 
"^10.4.5", "he": "^1.2.0", + "is-path-inside": "^3.0.3", "js-yaml": "^4.1.0", "log-symbols": "^4.1.0", - "minimatch": "^5.1.6", + "minimatch": "^9.0.5", "ms": "^2.1.3", + "picocolors": "^1.1.1", "serialize-javascript": "^6.0.2", "strip-json-comments": "^3.1.1", "supports-color": "^8.1.1", - "workerpool": "^6.5.1", - "yargs": "^16.2.0", - "yargs-parser": "^20.2.9", + "workerpool": "^9.2.0", + "yargs": "^17.7.2", + "yargs-parser": "^21.1.1", "yargs-unparser": "^2.0.0" }, "dependencies": { @@ -8816,12 +8618,12 @@ } }, "minimatch": { - "version": "5.1.6", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", - "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "version": "9.0.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", + "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", "dev": true, "requires": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" } }, "supports-color": { @@ -8853,12 +8655,6 @@ "integrity": "sha512-S2M9YimhSjBSvYnlr5/+umAnPHE++ODwt5e2Ij6FoX45HA/s4vHdkDx1eax2pAPeAOqu4s9b7ppahsyEFdVqQA==", "dev": true }, - "normalize-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", - "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", - "dev": true - }, "npmlog": { "version": "7.0.1", "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-7.0.1.tgz", @@ -9138,13 +8934,10 @@ } }, "readdirp": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", - "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", - "dev": true, - "requires": { - "picomatch": "^2.2.1" - } + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", + 
"integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", + "dev": true }, "reflect.getprototypeof": { "version": "1.0.10", @@ -9226,20 +9019,6 @@ "integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==", "dev": true }, - "roarr": { - "version": "2.15.4", - "resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz", - "integrity": "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==", - "dev": true, - "requires": { - "boolean": "^3.0.1", - "detect-node": "^2.0.4", - "globalthis": "^1.0.1", - "json-stringify-safe": "^5.0.1", - "semver-compare": "^1.0.0", - "sprintf-js": "^1.1.2" - } - }, "run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -9311,27 +9090,13 @@ "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", "dev": true }, - "semver-compare": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==", - "dev": true - }, "serialize-error": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-7.0.1.tgz", - "integrity": "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/serialize-error/-/serialize-error-8.1.0.tgz", + "integrity": "sha512-3NnuWfM6vBYoy5gZFvHiYsVbafvI9vZv/+jlIigFn4oP4zjNPK3LhcY0xSCgeb1a5L8jO71Mit9LlNoi2UfDDQ==", "dev": true, "requires": { - "type-fest": "^0.13.1" - }, - "dependencies": { - "type-fest": { - "version": "0.13.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", - "integrity": 
"sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", - "dev": true - } + "type-fest": "^0.20.2" } }, "serialize-javascript": { @@ -9499,12 +9264,6 @@ "integrity": "sha512-4PRT4nh1EImPbt2jASOKHX7PB7I+e4IWNLvkKFDxNhJlfjbYlleYQh285Z/3mPTHSAK/AvdMmw5BNNuYH8ShgQ==", "dev": true }, - "sprintf-js": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", - "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==", - "dev": true - }, "stop-iteration-iterator": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", @@ -9694,6 +9453,12 @@ "prelude-ls": "^1.2.1" } }, + "type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "dev": true + }, "typed-array-buffer": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", @@ -9890,9 +9655,9 @@ } }, "workerpool": { - "version": "6.5.1", - "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-6.5.1.tgz", - "integrity": "sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==", + "version": "9.3.4", + "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-9.3.4.tgz", + "integrity": "sha512-TmPRQYYSAnnDiEB0P/Ytip7bFGvqnSU6I2BcuSw7Hx+JSg/DsUi5ebYfc8GYaSdpuvOcEs6dXxPurOYpe9QFwg==", "dev": true }, "wrap-ansi": { @@ -9924,24 +9689,24 @@ "dev": true }, "yargs": { - "version": "16.2.0", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", - "integrity": "sha512-D1mvvtDG0L5ft/jGWkLpG1+m0eQxOfaBvTNELraWj22wSVUMWxZUvYgJYcKh6jGGIkJFhH4IZPQhR4TKpc8mBw==", + "version": "17.7.2", + "resolved": 
"https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", "dev": true, "requires": { - "cliui": "^7.0.2", + "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", - "string-width": "^4.2.0", + "string-width": "^4.2.3", "y18n": "^5.0.5", - "yargs-parser": "^20.2.2" + "yargs-parser": "^21.1.1" } }, "yargs-parser": { - "version": "20.2.9", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.9.tgz", - "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==", + "version": "21.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", "dev": true }, "yargs-unparser": { diff --git a/js/package.json b/js/package.json index cb8b09f4247a6..65cfa4a59e4e3 100644 --- a/js/package.json +++ b/js/package.json @@ -4,7 +4,7 @@ "@eslint/eslintrc": "^3.3.1", "@eslint/js": "^9.38.0", "@types/fs-extra": "^11.0.4", - "@types/global-agent": "^2.1.3", + "@types/global-agent": "^3.0.0", "@types/mocha": "^10.0.2", "@types/node": "^20.10.0", "@types/npmlog": "^4.1.4", @@ -21,7 +21,7 @@ "eslint-plugin-prefer-arrow": "^1.2.3", "eslint-plugin-unicorn": "^62.0.0", "fs-extra": "^11.2.0", - "global-agent": "^3.0", + "global-agent": "^4.1.3", "globals": "^16.4.0", "jszip": "^3.10.1", "mocha": "^11.0.1", diff --git a/js/react_native/package-lock.json b/js/react_native/package-lock.json index 6073725939e87..fdbc414b284a7 100644 --- a/js/react_native/package-lock.json +++ b/js/react_native/package-lock.json @@ -92,7 +92,6 @@ "integrity": "sha512-BBt3opiCOxUr9euZ5/ro/Xv8/V7yJ5bjYMqG/C1YAo8MIKAnumZalCN+msbci3Pigy4lIQfPUpfMM27HMGaYEA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@ampproject/remapping": 
"^2.2.0", "@babel/code-frame": "^7.24.7", @@ -1943,7 +1942,6 @@ "integrity": "sha512-vX3qPGE8sEKEAZCWk05k3cpTAE3/nOYca++JA+Rd0z2NCNzabmYvEiSShKzm10zdquOIAVXsy2Ei/DTW34KlKQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/compat-data": "^7.26.8", "@babel/helper-compilation-targets": "^7.26.5", @@ -3341,9 +3339,9 @@ } }, "node_modules/babel-plugin-module-resolver/node_modules/minimatch": { - "version": "8.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-8.0.4.tgz", - "integrity": "sha512-W0Wvr9HyFXZRGIDgCicunpQ299OKXs9RgZfaukz4qAW/pJhcpUfupc9c+OObPOFueNy8VSrZgEmDtk6Kh4WzDA==", + "version": "8.0.7", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-8.0.7.tgz", + "integrity": "sha512-V+1uQNdzybxa14e/p00HZnQNNcTjnRJjDxg2V8wtkjFctq4M7hXFws4oekyTP0Jebeq7QYtpFyOeBAjc88zvYg==", "dev": true, "license": "ISC", "dependencies": { @@ -3511,7 +3509,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001688", "electron-to-chromium": "^1.5.73", @@ -4349,9 +4346,9 @@ } }, "node_modules/fast-xml-parser": { - "version": "4.5.3", - "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.3.tgz", - "integrity": "sha512-RKihhV+SHsIUGXObeVy9AXiBbFwkVk7Syp8XgwN5U3JV416+Gwp/GO9i0JYKmikykgz/UHRrrV4ROuZEo/T0ig==", + "version": "4.5.5", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.5.tgz", + "integrity": "sha512-cK9c5I/DwIOI7/Q7AlGN3DuTdwN61gwSfL8rvuVPK+0mcCNHHGxRrpiFtaZZRfRMJL3Gl8B2AFlBG6qXf03w9A==", "dev": true, "funding": [ { @@ -4361,7 +4358,7 @@ ], "license": "MIT", "dependencies": { - "strnum": "^1.1.1" + "strnum": "^1.0.5" }, "bin": { "fxparser": "src/cli/cli.js" @@ -7006,7 +7003,6 @@ "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -7036,7 +7032,6 @@ "integrity": 
"sha512-yvQIX+ZXOHMFnhmwZ1fBpRI/53k+iLN8DxVf24Fx4ABU63RGAYfyCZC0/3W+5OUVx4KSIZUv4Tv+/NGIieBOwg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@jest/create-cache-key-function": "^29.6.3", "@react-native-community/cli": "12.3.7", @@ -7241,9 +7236,9 @@ } }, "node_modules/react-native-builder-bob/node_modules/minimatch": { - "version": "5.1.6", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", - "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "version": "5.1.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.9.tgz", + "integrity": "sha512-7o1wEA2RyMP7Iu7GNba9vc0RWWGACJOCZBJX2GJWip0ikV+wcOsgVuY9uE8CPiyQhkGFSlhuSkZPavN7u1c2Fw==", "dev": true, "license": "ISC", "dependencies": { diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 0e6d47f952c43..a02b86ec1ddc9 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -531,16 +531,16 @@ } }, "node_modules/browserstack-local": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/browserstack-local/-/browserstack-local-1.5.1.tgz", - "integrity": "sha512-T/wxyWDzvBHbDvl7fZKpFU7mYze6nrUkBhNy+d+8bXBqgQX10HTYvajIGO0wb49oGSLCPM0CMZTV/s7e6LF0sA==", + "version": "1.5.12", + "resolved": "https://registry.npmjs.org/browserstack-local/-/browserstack-local-1.5.12.tgz", + "integrity": "sha512-xrdpG4rw6Ktxa/gM8x0esnohFlw0V33bQiUX08rrHWKbnJAG57KTHGvJ4mvgc9eRL63pEKal+WuNDg3vEUz4hA==", "dev": true, + "license": "MIT", "dependencies": { "agent-base": "^6.0.2", "https-proxy-agent": "^5.0.1", "is-running": "^2.1.0", - "ps-tree": "=1.2.0", - "temp-fs": "^0.9.9" + "tree-kill": "^1.2.2" } }, "node_modules/browserstack-local/node_modules/https-proxy-agent": { @@ -1061,12 +1061,6 @@ "node": ">= 0.4" } }, - "node_modules/duplexer": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", - "integrity": 
"sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==", - "dev": true - }, "node_modules/edge-launcher": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/edge-launcher/-/edge-launcher-1.2.2.tgz", @@ -1243,21 +1237,6 @@ "node": ">=0.8.0" } }, - "node_modules/event-stream": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/event-stream/-/event-stream-3.3.4.tgz", - "integrity": "sha512-QHpkERcGsR0T7Qm3HNJSyXKEEj8AHNxkY3PK8TS2KJvQ7NiSHe3DDpwVKKtoYprL/AreyzFBeIkBIWChAqn60g==", - "dev": true, - "dependencies": { - "duplexer": "~0.1.1", - "from": "~0", - "map-stream": "~0.1.0", - "pause-stream": "0.0.11", - "split": "0.3", - "stream-combiner": "~0.0.4", - "through": "~2.3.1" - } - }, "node_modules/eventemitter3": { "version": "4.0.7", "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", @@ -1415,10 +1394,11 @@ "license": "Apache-2.0" }, "node_modules/flatted": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.2.7.tgz", - "integrity": "sha512-5nqDSxl8nn5BSNxyR3n4I6eDmbolI6WT+QqR547RwxQapgjQBmtktdP+HTBb/a/zLsbzERTONyUB5pefh5TtjQ==", - "dev": true + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", + "dev": true, + "license": "ISC" }, "node_modules/follow-redirects": { "version": "1.15.6", @@ -1440,12 +1420,6 @@ } } }, - "node_modules/from": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/from/-/from-0.1.7.tgz", - "integrity": "sha512-twe20eF1OxVxp/ML/kq2p1uc6KvFK/+vs8WjEbeKmV2He22MKm7YF2ANIt+EOqhJ5L3K/SuuPhk0hWQDjOM23g==", - "dev": true - }, "node_modules/fs-extra": { "version": "8.1.0", "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz", @@ -2369,12 +2343,6 @@ "node": ">=10" } }, - "node_modules/map-stream": { - "version": "0.1.0", - "resolved": 
"https://registry.npmjs.org/map-stream/-/map-stream-0.1.0.tgz", - "integrity": "sha512-CkYQrPYZfWnu/DAmVCpTSX/xHpKZ80eKh2lAkyA6AJTef6bW+6JpbQZN5rofum7da+SyN1bi5ctTm+lTfcCW3g==", - "dev": true - }, "node_modules/matcher": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", @@ -2484,12 +2452,13 @@ } }, "node_modules/minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "7.4.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.9.tgz", + "integrity": "sha512-Brg/fp/iAVDOQoHxkuN5bEYhyQlZhxddI78yWsCbeEwTHXQjlNLtiJDUsp1GIptVqMI7/gkJMz4vVAc01mpoBw==", "dev": true, + "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" }, "engines": { "node": ">=10" @@ -2722,15 +2691,6 @@ "node": "*" } }, - "node_modules/pause-stream": { - "version": "0.0.11", - "resolved": "https://registry.npmjs.org/pause-stream/-/pause-stream-0.0.11.tgz", - "integrity": "sha512-e3FBlXLmN/D1S+zHzanP4E/4Z60oFAa3O051qt1pxa7DEJWKAyil6upYVXCWadEnuoqa4Pkc9oUx9zsxYeRv8A==", - "dev": true, - "dependencies": { - "through": "~2.3" - } - }, "node_modules/pend": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", @@ -2786,21 +2746,6 @@ "node": ">=12.0.0" } }, - "node_modules/ps-tree": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/ps-tree/-/ps-tree-1.2.0.tgz", - "integrity": "sha512-0VnamPPYHl4uaU/nSFeZZpR21QAWRz+sRv4iW9+v/GS/J5U5iZB5BNN6J0RMoOvdx2gWM2+ZFMIm58q24e4UYA==", - "dev": true, - "dependencies": { - "event-stream": "=3.3.4" - }, - "bin": { - "ps-tree": "bin/ps-tree.js" - }, - "engines": { - "node": ">= 0.10" - } - }, "node_modules/pump": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", @@ -3227,18 +3172,44 @@ } }, 
"node_modules/socket.io-parser": { - "version": "4.2.4", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", - "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", + "version": "4.2.6", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.6.tgz", + "integrity": "sha512-asJqbVBDsBCJx0pTqw3WfesSY0iRX+2xzWEWzrpcH7L6fLzrhyF8WPI8UaeM4YCuDfpwA/cgsdugMsmtz8EJeg==", "dev": true, + "license": "MIT", "dependencies": { "@socket.io/component-emitter": "~3.1.0", - "debug": "~4.3.1" + "debug": "~4.4.1" }, "engines": { "node": ">=10.0.0" } }, + "node_modules/socket.io-parser/node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/socket.io-parser/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, "node_modules/source-map": { "version": "0.7.4", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.4.tgz", @@ -3248,18 +3219,6 @@ "node": ">= 8" } }, - "node_modules/split": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/split/-/split-0.3.3.tgz", - "integrity": "sha512-wD2AeVmxXRBoX44wAycgjVpMhvbwdI2aZjCkvfNcH1YqHQvJVa1duWc73OyVGJUc05fhFaTZeQ/PYsrmyH0JVA==", - "dev": true, - "dependencies": { - "through": "2" - }, - "engines": { - "node": "*" - } - }, "node_modules/sprintf-js": { "version": "1.1.2", "resolved": 
"https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.2.tgz", @@ -3276,15 +3235,6 @@ "node": ">= 0.6" } }, - "node_modules/stream-combiner": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/stream-combiner/-/stream-combiner-0.0.4.tgz", - "integrity": "sha512-rT00SPnTVyRsaSz5zgSPma/aHSOic5U1prhYdRy5HS2kTZviFpmDgzilbtsJsxiroqACmayynDN/9VzIbX5DOw==", - "dev": true, - "dependencies": { - "duplexer": "~0.1.1" - } - }, "node_modules/streamroller": { "version": "3.1.5", "resolved": "https://registry.npmjs.org/streamroller/-/streamroller-3.1.5.tgz", @@ -3391,36 +3341,6 @@ "node": ">=4" } }, - "node_modules/temp-fs": { - "version": "0.9.9", - "resolved": "https://registry.npmjs.org/temp-fs/-/temp-fs-0.9.9.tgz", - "integrity": "sha512-WfecDCR1xC9b0nsrzSaxPf3ZuWeWLUWblW4vlDQAa1biQaKHiImHnJfeQocQe/hXKMcolRzgkcVX/7kK4zoWbw==", - "dev": true, - "dependencies": { - "rimraf": "~2.5.2" - }, - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/temp-fs/node_modules/rimraf": { - "version": "2.5.4", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.5.4.tgz", - "integrity": "sha512-Lw7SHMjssciQb/rRz7JyPIy9+bbUshEucPoLRvWqy09vC5zQixl8Uet+Zl+SROBB/JMWHJRdCk1qdxNWHNMvlQ==", - "dev": true, - "dependencies": { - "glob": "^7.0.5" - }, - "bin": { - "rimraf": "bin.js" - } - }, - "node_modules/through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==", - "dev": true - }, "node_modules/tmp": { "version": "0.2.5", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz", @@ -3451,6 +3371,16 @@ "node": ">=0.6" } }, + "node_modules/tree-kill": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", + "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", + "dev": true, + "license": "MIT", + "bin": { + 
"tree-kill": "cli.js" + } + }, "node_modules/type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", @@ -4119,16 +4049,15 @@ } }, "browserstack-local": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/browserstack-local/-/browserstack-local-1.5.1.tgz", - "integrity": "sha512-T/wxyWDzvBHbDvl7fZKpFU7mYze6nrUkBhNy+d+8bXBqgQX10HTYvajIGO0wb49oGSLCPM0CMZTV/s7e6LF0sA==", + "version": "1.5.12", + "resolved": "https://registry.npmjs.org/browserstack-local/-/browserstack-local-1.5.12.tgz", + "integrity": "sha512-xrdpG4rw6Ktxa/gM8x0esnohFlw0V33bQiUX08rrHWKbnJAG57KTHGvJ4mvgc9eRL63pEKal+WuNDg3vEUz4hA==", "dev": true, "requires": { "agent-base": "^6.0.2", "https-proxy-agent": "^5.0.1", "is-running": "^2.1.0", - "ps-tree": "=1.2.0", - "temp-fs": "^0.9.9" + "tree-kill": "^1.2.2" }, "dependencies": { "https-proxy-agent": { @@ -4536,12 +4465,6 @@ "gopd": "^1.2.0" } }, - "duplexer": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", - "integrity": "sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==", - "dev": true - }, "edge-launcher": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/edge-launcher/-/edge-launcher-1.2.2.tgz", @@ -4683,21 +4606,6 @@ "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==", "dev": true }, - "event-stream": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/event-stream/-/event-stream-3.3.4.tgz", - "integrity": "sha512-QHpkERcGsR0T7Qm3HNJSyXKEEj8AHNxkY3PK8TS2KJvQ7NiSHe3DDpwVKKtoYprL/AreyzFBeIkBIWChAqn60g==", - "dev": true, - "requires": { - "duplexer": "~0.1.1", - "from": "~0", - "map-stream": "~0.1.0", - "pause-stream": "0.0.11", - "split": "0.3", - "stream-combiner": "~0.0.4", - "through": "~2.3.1" - } - }, "eventemitter3": { "version": "4.0.7", "resolved": 
"https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", @@ -4832,9 +4740,9 @@ "integrity": "sha512-Ni+KCqYquU30UEgGkrrwpbYtUcUmNuLFcQ5Xdy9DK7WUaji+AAov+Bf12FEYmu0eI15y31oD38utnBexe0cAYA==" }, "flatted": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.2.7.tgz", - "integrity": "sha512-5nqDSxl8nn5BSNxyR3n4I6eDmbolI6WT+QqR547RwxQapgjQBmtktdP+HTBb/a/zLsbzERTONyUB5pefh5TtjQ==", + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", "dev": true }, "follow-redirects": { @@ -4843,12 +4751,6 @@ "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, - "from": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/from/-/from-0.1.7.tgz", - "integrity": "sha512-twe20eF1OxVxp/ML/kq2p1uc6KvFK/+vs8WjEbeKmV2He22MKm7YF2ANIt+EOqhJ5L3K/SuuPhk0hWQDjOM23g==", - "dev": true - }, "fs-extra": { "version": "8.1.0", "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz", @@ -5572,12 +5474,6 @@ "yallist": "^4.0.0" } }, - "map-stream": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/map-stream/-/map-stream-0.1.0.tgz", - "integrity": "sha512-CkYQrPYZfWnu/DAmVCpTSX/xHpKZ80eKh2lAkyA6AJTef6bW+6JpbQZN5rofum7da+SyN1bi5ctTm+lTfcCW3g==", - "dev": true - }, "matcher": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz", @@ -5653,12 +5549,12 @@ "dev": true }, "minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "7.4.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.9.tgz", + "integrity": 
"sha512-Brg/fp/iAVDOQoHxkuN5bEYhyQlZhxddI78yWsCbeEwTHXQjlNLtiJDUsp1GIptVqMI7/gkJMz4vVAc01mpoBw==", "dev": true, "requires": { - "brace-expansion": "^2.0.1" + "brace-expansion": "^2.0.2" } }, "minimist": { @@ -5827,15 +5723,6 @@ "integrity": "sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==", "dev": true }, - "pause-stream": { - "version": "0.0.11", - "resolved": "https://registry.npmjs.org/pause-stream/-/pause-stream-0.0.11.tgz", - "integrity": "sha512-e3FBlXLmN/D1S+zHzanP4E/4Z60oFAa3O051qt1pxa7DEJWKAyil6upYVXCWadEnuoqa4Pkc9oUx9zsxYeRv8A==", - "dev": true, - "requires": { - "through": "~2.3" - } - }, "pend": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", @@ -5878,15 +5765,6 @@ "long": "^5.0.0" } }, - "ps-tree": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/ps-tree/-/ps-tree-1.2.0.tgz", - "integrity": "sha512-0VnamPPYHl4uaU/nSFeZZpR21QAWRz+sRv4iW9+v/GS/J5U5iZB5BNN6J0RMoOvdx2gWM2+ZFMIm58q24e4UYA==", - "dev": true, - "requires": { - "event-stream": "=3.3.4" - } - }, "pump": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", @@ -6183,13 +6061,30 @@ } }, "socket.io-parser": { - "version": "4.2.4", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", - "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", + "version": "4.2.6", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.6.tgz", + "integrity": "sha512-asJqbVBDsBCJx0pTqw3WfesSY0iRX+2xzWEWzrpcH7L6fLzrhyF8WPI8UaeM4YCuDfpwA/cgsdugMsmtz8EJeg==", "dev": true, "requires": { "@socket.io/component-emitter": "~3.1.0", - "debug": "~4.3.1" + "debug": "~4.4.1" + }, + "dependencies": { + "debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": 
"sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "requires": { + "ms": "^2.1.3" + } + }, + "ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true + } } }, "source-map": { @@ -6198,15 +6093,6 @@ "integrity": "sha512-l3BikUxvPOcn5E74dZiq5BGsTb5yEwhaTSzccU6t4sDOH8NWJCstKO5QT2CvtFoK6F0saL7p9xHAqHOlCPJygA==", "dev": true }, - "split": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/split/-/split-0.3.3.tgz", - "integrity": "sha512-wD2AeVmxXRBoX44wAycgjVpMhvbwdI2aZjCkvfNcH1YqHQvJVa1duWc73OyVGJUc05fhFaTZeQ/PYsrmyH0JVA==", - "dev": true, - "requires": { - "through": "2" - } - }, "sprintf-js": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.2.tgz", @@ -6220,15 +6106,6 @@ "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==", "dev": true }, - "stream-combiner": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/stream-combiner/-/stream-combiner-0.0.4.tgz", - "integrity": "sha512-rT00SPnTVyRsaSz5zgSPma/aHSOic5U1prhYdRy5HS2kTZviFpmDgzilbtsJsxiroqACmayynDN/9VzIbX5DOw==", - "dev": true, - "requires": { - "duplexer": "~0.1.1" - } - }, "streamroller": { "version": "3.1.5", "resolved": "https://registry.npmjs.org/streamroller/-/streamroller-3.1.5.tgz", @@ -6307,32 +6184,6 @@ "has-flag": "^3.0.0" } }, - "temp-fs": { - "version": "0.9.9", - "resolved": "https://registry.npmjs.org/temp-fs/-/temp-fs-0.9.9.tgz", - "integrity": "sha512-WfecDCR1xC9b0nsrzSaxPf3ZuWeWLUWblW4vlDQAa1biQaKHiImHnJfeQocQe/hXKMcolRzgkcVX/7kK4zoWbw==", - "dev": true, - "requires": { - "rimraf": "~2.5.2" - }, - "dependencies": { - "rimraf": { - "version": "2.5.4", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.5.4.tgz", - "integrity": 
"sha512-Lw7SHMjssciQb/rRz7JyPIy9+bbUshEucPoLRvWqy09vC5zQixl8Uet+Zl+SROBB/JMWHJRdCk1qdxNWHNMvlQ==", - "dev": true, - "requires": { - "glob": "^7.0.5" - } - } - } - }, - "through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==", - "dev": true - }, "tmp": { "version": "0.2.5", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz", @@ -6354,6 +6205,12 @@ "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", "dev": true }, + "tree-kill": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", + "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", + "dev": true + }, "type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index cb7cfbb4fb97a..d2996b122c5f7 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -69,7 +69,7 @@ GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_leve // By converting Fp16 to Fp32, there is not precision increase, and the performance // becomes worse. 
if (accuracy_level_attr == static_cast(Level4) && - MlasIsQNBitGemmAvailable(nbits, block_size, HQNBIT_CompInt8)) { + MlasIsQNBitGemmAvailable(nbits, block_size, SQNBIT_CompInt8)) { return HQNBIT_CompInt8; } @@ -258,20 +258,50 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All prepacked_weights->buffer_sizes_.push_back(packed_b_size_); } } else { - packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, compute_type_, &mlas_backend_kernel_selector_config_); + // For HQNBIT_CompInt8, route through SQNBIT_CompInt8 for sizing and packing. + // This gets KleidiAI-sized buffer when available for 4-bit and packs B+scales correctly. + const auto effective_compute_type = (compute_type_ == HQNBIT_CompInt8) + ? SQNBIT_CompInt8 + : compute_type_; + + packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, effective_compute_type, &mlas_backend_kernel_selector_config_); if (packed_b_size_ == 0) { return Status::OK(); } + auto qptr = tensor.DataRaw(); - // For HQNBIT compute types, scales are fp16 and cannot be passed directly - // to packing functions that expect float*. Pass nullptr here; scales will - // be properly converted and packed in a subsequent PrePack call. - auto scale_ptr = (scales && compute_type_ != HQNBIT_CompInt8 && compute_type_ != HQNBIT_CompFp16) - ? scales->DataRaw() - : nullptr; + const void* scale_ptr = nullptr; + + // For HQNBIT_CompInt8: convert constant fp16 scales to fp32 for packing. + // KleidiAI bakes scales into packed B for 4-bit; 8-bit needs fp32 scales for SQ8BitGemmPackQuantBDataAndBlkSum. 
+ if (compute_type_ == HQNBIT_CompInt8 && scales) { + auto sptr_fp16 = scales->Data(); + auto scales_size = static_cast(scales->Shape().Size()); + scales_fp32_ = IAllocator::MakeUniquePtr(alloc, scales_size, true); + MlasConvertHalfToFloatBuffer(sptr_fp16, scales_fp32_.get(), scales_size); + scale_ptr = scales_fp32_.get(); + } else if (scales && compute_type_ != HQNBIT_CompInt8 && compute_type_ != HQNBIT_CompFp16) { + // For non-HQNBIT compute types, scales are already float. + scale_ptr = scales->DataRaw(); + } + packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scale_ptr, + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr, has_zp_input_, nullptr, threadpool_ptr, &mlas_backend_kernel_selector_config_); + +#if defined(MLAS_TARGET_ARM64) + // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales and zero_points are accessible. 
+ if (compute_type_ == HQNBIT_CompInt8 && nbits_ == 4 && has_zp_input_ && scales_fp32_ && + MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, SQNBIT_CompInt8, has_zp_input_, &mlas_backend_kernel_selector_config_)) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor != nullptr) { + auto zptr = zp_tensor->Data(); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), + scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); + } + } +#endif // MLAS_TARGET_ARM64 } is_packed = true; } else if (compute_type_ == SQNBIT_CompInt8) { @@ -337,25 +367,75 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } } #endif // MLAS_TARGET_ARM64 - } else if (compute_type_ == HQNBIT_CompInt8 && nbits_ == 8) { - // For 8-bit HQNBIT_CompInt8, scales are fp16 but the SQ8 packing functions expect float. + } else if (compute_type_ == HQNBIT_CompInt8) { + // For HQNBIT_CompInt8 (both 4-bit and 8-bit), scales are fp16 but packing functions expect float. // Convert fp16 scales to float and pack using the SQNBIT_CompInt8 path. + // At compute time, we delegate to MlasQNBitGemmBatch with SQNBIT_CompInt8. if (input_idx == InputIndex::scales && packed_b_ != nullptr) { - auto sptr_fp16 = tensor.Data(); - std::vector scales_fp32(static_cast(tensor.Shape().Size())); - MlasConvertHalfToFloatBuffer(sptr_fp16, scales_fp32.data(), scales_fp32.size()); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), - scales_fp32.data(), has_zp_input_, nullptr, nullptr, - &mlas_backend_kernel_selector_config_); - is_packed = false; +#if defined(MLAS_TARGET_ARM64) + // For 4-bit on ARM64: check if KleidiAI packs scales into B (scales already packed during B packing). 
+ if (nbits_ == 4 && + MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, SQNBIT_CompInt8, + has_zp_input_, &mlas_backend_kernel_selector_config_)) { + // For asymmetric quantization, require zero_points to be constant for KleidiAI. + if (has_zp_input_) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor == nullptr) { + // zero_points is dynamic: fall back to non-KleidiAI path. + // Convert scales to fp32 for use at compute time. + auto sptr_fp16 = tensor.Data(); + auto tensor_size = static_cast(tensor.Shape().Size()); + if (!scales_fp32_) { + scales_fp32_ = IAllocator::MakeUniquePtr(alloc, tensor_size, true); + MlasConvertHalfToFloatBuffer(sptr_fp16, scales_fp32_.get(), tensor_size); + } + return Status::OK(); + } + } + + // BZpCorr was already computed during B packing in Step 1 (if applicable). + scales_are_packed_ = true; + is_packed = true; + } else +#endif // MLAS_TARGET_ARM64 + { + // Non-KleidiAI path (or 8-bit): convert fp16 scales to fp32. + auto sptr_fp16 = tensor.Data(); + auto tensor_size = static_cast(tensor.Shape().Size()); + if (!scales_fp32_) { + scales_fp32_ = IAllocator::MakeUniquePtr(alloc, tensor_size, true); + MlasConvertHalfToFloatBuffer(sptr_fp16, scales_fp32_.get(), tensor_size); + } + // Pack scales separately only for 8-bit. For 4-bit on ARM64, scales are already packed + // during B packing or used as a raw pointer at compute time (matching standard + // SQNBIT_CompInt8 behavior where should_pack_scale_and_zp_inputs = (nbits_ == 8) on ARM64). + if (nbits_ == 8) { + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), + scales_fp32_.get(), has_zp_input_, nullptr, nullptr, + &mlas_backend_kernel_selector_config_); + } + is_packed = false; + } } - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { + // Pack zero_points separately only for 8-bit (matching standard SQNBIT_CompInt8 behavior). 
+ // For 4-bit, zero_points are passed directly in data params or handled via KleidiAI BZpCorr. + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); is_packed = false; } + + // Pre-convert fp16 bias to fp32 for use at compute time. + if (input_idx == InputIndex::bias) { + auto bptr_fp16 = tensor.Data(); + auto tensor_size = static_cast(tensor.Shape().Size()); + bias_fp32_ = IAllocator::MakeUniquePtr(alloc, tensor_size, true); + MlasConvertHalfToFloatBuffer(bptr_fp16, bias_fp32_.get(), tensor_size); + is_packed = false; + } } else if (prefer_lut_gemm_) { // Pack scales/zero_points for LUT GEMM if B was already packed but scales weren't available then if (input_idx == InputIndex::scales && packed_b_ != nullptr) { @@ -519,9 +599,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, concurrency::ThreadPool* thread_pool, const MatMulComputeHelper& helper) const { const auto* a_data = a->Data(); - const auto* scales_data = scales == nullptr ? nullptr : scales->Data(); const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); - const auto* bias_data = bias == nullptr ? nullptr : bias->Data(); auto* y_data = y->MutableData(); const size_t batch_count = helper.OutputOffsets().size(); @@ -530,6 +608,91 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, const size_t K = static_cast(helper.K()); const size_t lda = helper.Lda(false); + // For HQNBIT_CompInt8 with fp16 inputs: delegate to fp32 MLAS path (SQNBIT_CompInt8). + // The HQ CompInt8 kernels are just wrappers that convert fp16->fp32 per-tile and call the same + // SQ fp32 kernels. 
By doing bulk conversion at the operator level we eliminate per-tile overhead + // and automatically get KleidiAI support for 4-bit (since SQ4BitGemm_CompInt8 checks KleidiAI). + // This matches the approach used by x64 and Apple ARM64 (non-fp16-intrinsics fallback). + if constexpr (std::is_same_v) { + if (compute_type_ == HQNBIT_CompInt8) { + const auto* a_data_fp16 = a->Data(); + const auto* bias_data_fp16 = bias == nullptr ? nullptr : bias->Data(); + + // Bulk convert A from fp16 to fp32. + auto a_size = static_cast(a->Shape().Size()); + auto tmp_a_data_ptr = IAllocator::MakeUniquePtr(allocator, a_size, true); + MlasConvertHalfToFloatBuffer(a_data_fp16, tmp_a_data_ptr.get(), a_size); + + // Use pre-converted fp32 scales, or nullptr if scales are baked into packed B (KleidiAI). + // For non-KleidiAI 4-bit: scales_fp32_ was set during PrePack. + // For 8-bit: scales are packed inside PackedQuantBDataStruct and extracted at dispatch. + float* scales_ptr = nullptr; + IAllocatorUniquePtr tmp_scales; + if (!scales_are_packed_) { + if (scales_fp32_) { + scales_ptr = scales_fp32_.get(); + } else { + // Dynamic scales (non-constant input): convert fp16 to fp32 at compute time. + ORT_ENFORCE(scales != nullptr, "scales must be provided when not packed and not pre-converted"); + auto scales_size = static_cast(scales->Shape().Size()); + tmp_scales = IAllocator::MakeUniquePtr(allocator, scales_size, true); + MlasConvertHalfToFloatBuffer(scales->Data(), tmp_scales.get(), scales_size); + scales_ptr = tmp_scales.get(); + } + } + + // Use pre-converted fp32 bias, or convert on the fly. 
+ float* bias_ptr = nullptr; + IAllocatorUniquePtr tmp_bias; + if (bias_data_fp16) { + if (bias_fp32_) { + bias_ptr = bias_fp32_.get(); + } else { + auto bias_size = static_cast(bias->Shape().Size()); + tmp_bias = IAllocator::MakeUniquePtr(allocator, bias_size, true); + MlasConvertHalfToFloatBuffer(bias_data_fp16, tmp_bias.get(), bias_size); + bias_ptr = tmp_bias.get(); + } + } + + // Allocate fp32 output buffer. + auto c_size = static_cast(y->Shape().Size()); + auto tmp_c = IAllocator::MakeUniquePtr(allocator, c_size, true); + + // Compute workspace sized for SQNBIT_CompInt8 (includes KleidiAI workspace when available). + IAllocatorUniquePtr workspace{}; + const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( + M, N, K, batch_count, nbits_, block_size_, zero_points, SQNBIT_CompInt8, &mlas_backend_kernel_selector_config_); + if (workspace_size > 0) { + workspace = IAllocator::MakeUniquePtr(allocator, workspace_size, true); + } + + InlinedVector> data(batch_count); + for (size_t i = 0; i < batch_count; ++i) { + data[i].A = tmp_a_data_ptr.get() + helper.LeftOffsets()[i]; + data[i].lda = lda; + data[i].QuantBDataWorkspace = packed_b_.get(); + data[i].PackedQuantBData = static_cast(packed_b_.get()); + data[i].QuantBScale = scales_ptr; + data[i].QuantBZeroPoint = zero_points_data; + data[i].Bias = bias_ptr; + data[i].C = tmp_c.get() + helper.OutputOffsets()[i]; + data[i].ldc = N; + } + + MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, SQNBIT_CompInt8, data.data(), workspace.get(), + thread_pool, &mlas_backend_kernel_selector_config_); + + // Bulk convert output from fp32 to fp16. + MlasConvertFloatToHalfBuffer(tmp_c.get(), y_data, c_size); + return Status::OK(); + } + } + + // Standard path for non-HQNBIT_CompInt8 compute types (fp32 inputs, CompFp32, CompFp16, etc.) + const auto* scales_data = scales == nullptr ? nullptr : scales->Data(); + const auto* bias_data = bias == nullptr ? 
nullptr : bias->Data(); + IAllocatorUniquePtr workspace{}; const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( M, N, K, batch_count, nbits_, block_size_, zero_points, compute_type_, &mlas_backend_kernel_selector_config_); @@ -542,7 +705,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, for (size_t i = 0; i < batch_count; ++i) { data[i].A = a_data + helper.LeftOffsets()[i]; data[i].lda = lda; - if (compute_type_ == SQNBIT_CompInt8 || (compute_type_ == HQNBIT_CompInt8 && nbits_ == 8)) { + if (compute_type_ == SQNBIT_CompInt8) { data[i].QuantBDataWorkspace = packed_b_.get(); } data[i].PackedQuantBData = static_cast(packed_b_.get()); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 9cb2111670ba6..cc65142318d02 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -16,6 +16,7 @@ #include "core/framework/kernel_lookup.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/kernel_registry.h" +#include "core/framework/layering_annotations.h" #include "core/framework/resource_accountant.h" #include "core/graph/function.h" #include "core/graph/function_utils.h" @@ -69,6 +70,7 @@ struct PartitionParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) std::reference_wrapper on_partition_assignment_fn; + LayeringIndex* layering_index; }; } // namespace @@ -150,6 +152,7 @@ struct GetCapabilityForEPParams { IResourceAccountant* resource_accountant; std::reference_wrapper graph_optimizer_registry; std::reference_wrapper check_load_cancellation_fn; + LayeringIndex* layering_index; // Added member }; auto get_capabilities = [](const IExecutionProvider& ep, @@ -193,10 +196,94 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l auto& capabilities = params.capabilities.get(); const auto& graph_optimizer_registry = 
params.graph_optimizer_registry.get(); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + InlinedVector assigned_filtered_in_nodes; + InlinedVector filtered_in_nodes; +#endif + // Helper to create a GraphViewer that filters nodes based on layering_index if present. + auto create_graph_viewer = [&](std::unique_ptr& out_sub_graph, + std::unique_ptr& out_viewer) -> Status { +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + if (params.layering_index) { + assigned_filtered_in_nodes.clear(); + filtered_in_nodes.clear(); + filtered_in_nodes.reserve(graph.NumberOfNodes()); + + auto rules_opt = params.layering_index->GetLayeringRulesForThisEp(ep_type); + if (rules_opt) { + assigned_filtered_in_nodes.reserve(rules_opt->get().size()); + } + + for (auto& node : graph.Nodes()) { + auto rule_idx_opt = params.layering_index->GetNodeAssignment(graph, node.Index()); + bool include = true; + if (rule_idx_opt) { + // If node has an assignment, include it only if it is assigned to this EP + if (!rules_opt || rules_opt->get().count(*rule_idx_opt) == 0) { + include = false; + } else { + assigned_filtered_in_nodes.push_back(node.Index()); + } + } + // If node has no assignment, it is included (available to any EP) + + if (include) { + filtered_in_nodes.push_back(&node); + } + } + ORT_RETURN_IF_ERROR(graph_utils::CreateFilteredIndexedGraph(filtered_in_nodes, graph, out_sub_graph)); + out_viewer = std::make_unique(graph, *out_sub_graph); + return Status::OK(); + } +#else + ORT_UNUSED_PARAMETER(out_sub_graph); +#endif + out_viewer = std::make_unique(graph); + return Status::OK(); + }; + // Helper to un-assign nodes that were assigned to this EP but not claimed by updated capabilities. 
+ auto reset_assignment_unclaimed_nodes = [&]() { +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + if (params.layering_index) { + auto rules_opt = params.layering_index->GetLayeringRulesForThisEp(ep_type); + if (rules_opt) { + const auto& ep_rules = rules_opt->get(); + InlinedHashSet claimed; + for (const auto& cap : capabilities) { + if (cap && cap->sub_graph) { + for (auto idx : cap->sub_graph->nodes) claimed.insert(idx); + } + } + + // Check if all assigned filtered-in nodes are claimed + // and if not make them available for subsequent EPs + for (auto& node_index : assigned_filtered_in_nodes) { + if (claimed.count(node_index) == 0) { + auto rule_idx_opt = params.layering_index->GetNodeAssignment(graph, node_index); + if (rule_idx_opt && ep_rules.count(*rule_idx_opt) > 0) { + params.layering_index->MakeNodeUnassigned(graph, node_index); + } + } + } + assigned_filtered_in_nodes.clear(); + } + } +#endif + }; + { - const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant, + std::unique_ptr sub_graph_holder; + std::unique_ptr graph_viewer; + ORT_RETURN_IF_ERROR(create_graph_viewer(sub_graph_holder, graph_viewer)); + + if (params.resource_accountant) { + params.resource_accountant->ResetPendingWeights(); + } + capabilities = get_capabilities(current_ep, *graph_viewer, kernel_lookup, params.resource_accountant, graph_optimizer_registry); + + reset_assignment_unclaimed_nodes(); + if (params.check_load_cancellation_fn()) { return ORT_MAKE_STATUS(ONNXRUNTIME, MODEL_LOAD_CANCELED, "Graph partitioning was canceled by user request"); @@ -241,9 +328,33 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l capabilities.clear(); - const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant, +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) 
+ if (params.layering_index && end_node > first_new_node) { + // We need to update the LayeringIndex with newly created nodes + // as the layout transformation may have created new nodes + // with inherited annotations + InlinedVector new_node_indices; + for (NodeIndex idx = first_new_node; idx < end_node; ++idx) { + if (graph.GetNode(idx) != nullptr) { + new_node_indices.push_back(idx); + } + } + params.layering_index->Update(graph, new_node_indices); + } +#endif + + std::unique_ptr sub_graph_holder; + std::unique_ptr graph_viewer; + ORT_RETURN_IF_ERROR(create_graph_viewer(sub_graph_holder, graph_viewer)); + + if (params.resource_accountant) { + params.resource_accountant->ResetPendingWeights(); + } + capabilities = get_capabilities(current_ep, *graph_viewer, kernel_lookup, params.resource_accountant, graph_optimizer_registry); + + reset_assignment_unclaimed_nodes(); + if (params.check_load_cancellation_fn()) { return ORT_MAKE_STATUS(ONNXRUNTIME, MODEL_LOAD_CANCELED, "GetCapabilities was canceled by user request"); @@ -388,13 +499,13 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, fused_node->SetExecutionProviderType(provider_type); if (acc_enabled) { - // We account for the fused node. We operate under assumption - // that the fused node would use no more memory when the nodes we are fusing. - // and potentially less than that, and therefore, no threshold check is needed here. - // All threshold checks are done within the EP. - capability.ComputeAndAccountForNode(*fused_node); + // Account for all constituent nodes using the per-node costs computed + // during GetCapability() (which already includes within-pass weight dedup). + // Computing the cost for the newly created fused node would undercount + // because the fused node often doesn't expose all original initializers, + // and would commit weights for the wrong node index. 
+ capability.AccountForAllNodes(); } - result = fused_node; } else { // assign the nodes in the indexed subgraph to the current EP so that level 2+ optimizers will not change them. @@ -430,7 +541,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, const OnPartitionAssignmentFunction& on_partition_assignment_fn, const logging::Logger& logger, IResourceAccountant* resource_accountant, const GraphOptimizerRegistry& graph_optimizer_registry, - bool disable_model_compile) { + bool disable_model_compile, + LayeringIndex* layering_index) { // Added arg // handle testing edge case where optimizers or constant lifting results in graph with no nodes. // doing it here saves all providers checking for this in GetCapability if (graph.NumberOfNodes() == 0) { @@ -448,7 +560,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, check_load_cancellation_fn, on_partition_assignment_fn, logger, resource_accountant, - graph_optimizer_registry, disable_model_compile)); + graph_optimizer_registry, disable_model_compile, + layering_index)); // Pass through } } @@ -474,7 +587,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, std::cref(debug_graph_fn), resource_accountant, std::ref(graph_optimizer_registry), - std::cref(check_load_cancellation_fn)}; + std::cref(check_load_cancellation_fn), + layering_index}; // Pass param ORT_RETURN_IF_ERROR(GetCapabilityForEP(get_capability_params, logger)); if (capabilities.empty()) { @@ -654,17 +768,17 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, } // expand any nodes that have an ONNX function definition but no matching ORT kernel -static Status InlineNodes(Graph& graph, bool& modified_graph) { +static Status InlineNodes(Graph& graph, bool& modified_graph, LayeringIndex* layering_index) { // recurse into nested graphs first so we process from bottom up for (auto& node : graph.Nodes()) { for (auto& entry : 
node.GetAttributeNameToMutableSubgraphMap()) { Graph* subgraph = entry.second; - ORT_RETURN_IF_ERROR(InlineNodes(*subgraph, modified_graph)); + ORT_RETURN_IF_ERROR(InlineNodes(*subgraph, modified_graph, layering_index)); } } - // See if the node with no provider can be inlined. If one such nodes can be - // successfully inlined, we re-run the partitioner on the modified graph. + // See if the node with no provider can be inlined. If one such nodes can be successfully inlined, + // we re-run the partitioner on the modified graph. // NOTE: Inlining the function will change the nodes in the Graph instance, so we can't do that while iterating // using graph.Nodes(). InlinedVector nodes_to_inline; @@ -674,9 +788,50 @@ static Status InlineNodes(Graph& graph, bool& modified_graph) { } } + // Collect new node indices for nodes inlined from annotated parents so we can + // update the LayeringIndex in one batch. + InlinedVector new_node_indices; + for (auto* node : nodes_to_inline) { + // Check for an effective layering assignment: either from an explicit annotation + // on the node, or from an inherited assignment via the LayeringIndex (e.g., a function + // call node inside an annotated If/Loop subgraph that inherited its parent's rule). + const bool has_explicit_annotation = !node->GetLayeringAnnotation().empty(); + bool has_effective_assignment = has_explicit_annotation; + + if (layering_index != nullptr && !has_explicit_annotation) { + // The node may have an inherited-only assignment with no stored annotation string. + // Materialize the annotation on the node so Graph::InlineFunction propagates it + // to the newly created inlined nodes. 
+ auto rule_idx = layering_index->GetNodeAssignment(graph, node->Index()); + if (rule_idx) { + has_effective_assignment = true; + const auto& rules = layering_index->GetRules(); + if (*rule_idx < rules.rules.size()) { + node->SetLayeringAnnotation(rules.rules[*rule_idx].annotation); + } + } + } + + const int max_before = has_effective_assignment ? graph.MaxNodeIndex() : 0; + ORT_RETURN_IF_ERROR(graph.InlineFunction(*node)); modified_graph = true; + + if (has_effective_assignment) { + const int max_after = graph.MaxNodeIndex(); + for (int i = max_before; i < max_after; ++i) { + if (graph.GetNode(static_cast(i)) != nullptr) { + new_node_indices.push_back(static_cast(i)); + } + } + } + } + + // Update the LayeringIndex so the next partitioning round filters correctly + // for the newly inlined nodes that inherited their parent's annotation. + if (layering_index != nullptr && !new_node_indices.empty()) { + layering_index->Update(graph, new_node_indices); } return Status::OK(); @@ -1018,7 +1173,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, KernelRegistryManager& kernel_registry_manager, const std::optional& acc_map, const GraphOptimizerRegistry& graph_optimizer_registry, - const logging::Logger& logger, bool disable_model_compile) { + const logging::Logger& logger, bool disable_model_compile) { // Added arg bool modified_graph = false; auto& graph = partition_params.graph.get(); @@ -1046,12 +1201,13 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, check_load_cancellation_fn, on_partition_assignment_fn, logger, resource_accountant, graph_optimizer_registry, - disable_model_compile)); + disable_model_compile, + partition_params.layering_index)); // Pass param } // expand any nodes that have an ONNX function definition but no matching ORT kernel. 
modified_graph = false; - ORT_RETURN_IF_ERROR(InlineNodes(graph, modified_graph)); + ORT_RETURN_IF_ERROR(InlineNodes(graph, modified_graph, partition_params.layering_index)); // Resolve and rerun graph partitioning and inlining if there was a change if (modified_graph) { @@ -1101,7 +1257,8 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) nullptr, std::ref(graph_optimizer_registry), - partition_params.check_load_cancellation_fn + partition_params.check_load_cancellation_fn, + partition_params.layering_index }; // clang-format on @@ -1135,7 +1292,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param Node& fused_node = graph.BeginFuseSubGraph(indexed_sub_graph, node_name); fused_node.SetExecutionProviderType(type); if (indexed_sub_graph.IsAccountingEnabled()) { - indexed_sub_graph.ComputeAndAccountForNode(fused_node); + indexed_sub_graph.AccountForAllNodes(); } // create filtered graph viewer for this set of nodes @@ -1143,6 +1300,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // TODO: Could avoid the topological sort in the GraphViewer ctor by constructing from an existing // GraphViewer instance instead of the Graph (copying the topological order instead of recalculating). auto viewer = std::make_unique(graph, indexed_sub_graph); + compilation_entries.push_back(CompilationEntry{std::move(viewer), fused_node, *capability}); #else // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Compiling capabilities is not supported in this build."); @@ -1153,7 +1311,6 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // We will compile the fused nodes one by one, and fuse the subgraph if successful. 
for (const auto& compilation_entry : compilation_entries) { - const bool acc_enabled = compilation_entry.capability.get().sub_graph->IsAccountingEnabled(); Node& node = compilation_entry.fused_node; std::vector single_node_compute_func; ORT_RETURN_IF_ERROR(current_ep.Compile({IExecutionProvider::FusedNodeAndGraph{node, *compilation_entry.viewer}}, @@ -1184,9 +1341,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); - if (acc_enabled) { - compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node); - } + // accounting was already done via AccountForAllNodes() when the fused node was created above. } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1259,9 +1414,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, const layout_transformation::TransformLayoutFunction& transform_layout_function, const ConfigOptions& config_options, const logging::Logger& logger, + LayeringIndex* layering_index, Mode mode, const epctx::ModelGenOptions& ep_context_gen_options, - const layout_transformation::DebugGraphFn& debug_graph_fn) const { + const layout_transformation::DebugGraphFn& debug_graph_fn) const { // Added arg // It is a greedy partitioning algorithm per provider preferences user provided when calling ONNX RUNTIME right now. // 1. Execution providers' capabilities are checked one by one. // 2. All sub-graphs that an execution provider returns will be assigned to it if it's not assigned yet. 
@@ -1292,7 +1448,8 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, std::ref(fused_node_unique_id), std::cref(transform_layout_function), std::cref(debug_graph_fn), - std::cref(on_partition_assignment_fn_)}; + std::cref(on_partition_assignment_fn_), + layering_index}; #else // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1303,7 +1460,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, std::ref(graph), std::cref(check_load_cancellation_fn), std::cref(on_partition_assignment_fn_), - }; + layering_index}; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1323,12 +1480,12 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, // We use this only if Resource Aware Partitioning is enabled for any of the EPs // The map is empty if not created if not enabled std::optional ep_acc_map; - ORT_RETURN_IF_ERROR(NodeStatsRecorder::CreateAccountants(config_options, graph.ModelPath(), ep_acc_map)); + ORT_RETURN_IF_ERROR(CreateAccountants(config_options, graph.ModelPath(), ep_acc_map)); bool disable_model_compile = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableModelCompile, "0") == "1"; ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, ep_acc_map, *graph_optimizer_registry_, logger, - disable_model_compile)); + disable_model_compile)); // Pass param if (ep_context_gen_options.enable) { ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_gen_options, logger)); diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h index eb70b9f89933d..4de9d94781b18 100644 --- a/onnxruntime/core/framework/graph_partitioner.h +++ b/onnxruntime/core/framework/graph_partitioner.h @@ -13,6 +13,7 @@ namespace onnxruntime { class ExecutionProviders; class KernelRegistryManager; +class LayeringIndex; class Model; struct ConfigOptions; @@ -60,6 
+61,7 @@ class GraphPartitioner { const layout_transformation::TransformLayoutFunction& transform_layout_function, const ConfigOptions& config_options, const logging::Logger& logger, + LayeringIndex* layering_index, Mode mode = Mode::kNormal, const epctx::ModelGenOptions& ep_context_gen_options = {}, const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const; diff --git a/onnxruntime/core/framework/layering_annotations.cc b/onnxruntime/core/framework/layering_annotations.cc new file mode 100644 index 0000000000000..91df102abef17 --- /dev/null +++ b/onnxruntime/core/framework/layering_annotations.cc @@ -0,0 +1,584 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + +#include "core/graph/constants.h" +#include "core/common/narrow.h" +#include "core/common/parse_string.h" +#include "core/common/string_utils.h" +#include "core/framework/layering_annotations.h" +#include "core/framework/ortmemoryinfo.h" +#include "core/session/abi_devices.h" +#include "core/framework/execution_providers.h" +#include "core/graph/graph.h" + +#include + +namespace onnxruntime { + +common::Status LayeringRules::FromConfigString(const std::string& config_value, LayeringRules& rules) { + rules.rules.clear(); + if (config_value.empty()) { + return common::Status::OK(); + } + + // Track seen annotations to reject duplicates. + // Separate sets for exact and prefix match annotations. 
+ InlinedHashSet seen_exact_annotations; + InlinedHashSet seen_prefix_annotations; + + auto entries = utils::SplitString(config_value, ";"); + for (const auto& e : entries) { + auto entry = utils::TrimString(e); + if (entry.empty()) { + continue; + } + + const size_t open_paren = entry.find('('); + const size_t close_paren = entry.find(')'); + + if (open_paren == std::string::npos) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid layering config: Missing '(' in entry: ", entry); + } + if (close_paren == std::string::npos) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid layering config: Missing ')' in entry: ", entry); + } + if (close_paren < open_paren) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid layering config: ')' comes before '(' in entry: ", entry); + } + + std::string device = entry.substr(0, open_paren); + device = utils::TrimString(device); + + if (device.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid layering config: Empty device name in entry: ", entry); + } + + std::string annotations_list = entry.substr(open_paren + 1, close_paren - open_paren - 1); + auto annotations = utils::SplitString(annotations_list, ","); + for (auto& a : annotations) { + auto ann = utils::TrimString(a); + if (ann.empty()) { + continue; + } + + bool prefix_match = true; + if (ann[0] == '=') { + prefix_match = false; + ann = ann.substr(1); + ann = utils::TrimString(ann); + } + + if (ann.empty()) { + continue; + } + + // Check for duplicate annotation (same annotation string and match type) + auto& seen_set = prefix_match ? seen_prefix_annotations : seen_exact_annotations; + auto [it, inserted] = seen_set.insert(ann); + if (!inserted) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid layering config: Duplicate ", (prefix_match ? 
"prefix" : "exact"), + " match annotation '", ann, "' found in entry: ", entry); + } + + rules.rules.push_back({device, std::move(ann), prefix_match}); + } + } + + return common::Status::OK(); +} + +LayeringRuleMatcher::LayeringRuleMatcher(const LayeringRules& rules) { + for (size_t i = 0; i < rules.rules.size(); ++i) { + const auto& rule = rules.rules[i]; + ORT_ENFORCE(!rule.annotation.empty(), "Layering rule annotation cannot be empty"); + if (rule.prefix_match) { + AddPrefixRule(rule.annotation, i); + } else { + AddExactRule(rule.annotation, i); + } + } +} + +std::optional LayeringRuleMatcher::Match(const std::string& node_annotation) const { + std::optional best_match = std::nullopt; + + // 1. Check Prefix Matches via Trie. Prefix have priority over exact matches. + const TrieNode* current = &root_; + + // No empty annotations + // so we omit checking the root. + + for (char c : node_annotation) { + if (best_match && *best_match == 0) { + // Optimization: If we already found index 0, we can't do better. + return best_match; + } + + auto child_it = current->children.find(c); + if (child_it == current->children.end()) { + break; + } + current = child_it->second.get(); + if (current->rule_index) { + UpdateBestMatch(best_match, *current->rule_index); + } + } + + if (best_match) { + return best_match; + } + + // 2. Check Exact Matches (fallback) + auto it = exact_match_rules_.find(node_annotation); + if (it != exact_match_rules_.end()) { + best_match = it->second; + } + + return best_match; +} + +namespace { +bool CaseInsensitiveCompare(std::string_view a, std::string_view b) { + return std::equal(a.begin(), a.end(), b.begin(), b.end(), + [](char c1, char c2) { + return std::tolower(static_cast(c1)) == + std::tolower(static_cast(c2)); + }); +} + +bool TryParseIndex(const std::string& str, uint32_t& index) { + if (str.empty()) return false; + return TryParseStringWithClassicLocale(str, index); +} + +// Sentinel value representing an unknown/unavailable device type. 
+// Used when an OrtEpDevice has neither hardware info nor memory info, +// so we cannot determine the actual device type. +constexpr OrtDevice::DeviceType kDeviceTypeUnknown = static_cast(-1); + +// Normalized view of an EP's device properties used by the matching logic. +// All fields are non-owning references or value types. +struct EpDeviceView { + std::string_view ep_name; + OrtDevice::DeviceType device_type; // OrtDevice::CPU, GPU, NPU, FPGA, or kDeviceTypeUnknown + uint32_t vendor_id; + OrtDevice::DeviceId device_id; + std::string_view vendor_string; // from OrtHardwareDevice::vendor (empty if unavailable) +}; + +bool MatchEpDevice(const EpDeviceView& ep, + std::string_view target_type_str, + std::string_view target_specifier, + std::string_view target_full) { + // "cpu" + if (CaseInsensitiveCompare(target_type_str, "cpu")) { + return ep.ep_name == kCpuExecutionProvider || + ep.device_type == OrtDevice::CPU; + } + // "gpu" + if (CaseInsensitiveCompare(target_type_str, "gpu")) { + if (target_specifier.empty()) { + if (ep.device_type == OrtDevice::GPU) return true; + // Heuristic fallback for common GPU EPs if hardware info is missing + return ep.ep_name == kCudaExecutionProvider || ep.ep_name == kDmlExecutionProvider; + } + // "gpu:" or "gpu:" + if (ep.device_type == OrtDevice::GPU) { + uint32_t index = std::numeric_limits::max(); + if (TryParseIndex(std::string(target_specifier), index)) { + return ep.device_id == static_cast(index); + } + // gpu: + if (!ep.vendor_string.empty() && CaseInsensitiveCompare(ep.vendor_string, target_specifier)) { + return true; + } + if (CaseInsensitiveCompare(target_specifier, "nvidia") && + ep.vendor_id == OrtDevice::VendorIds::NVIDIA) return true; + if (CaseInsensitiveCompare(target_specifier, "amd") && + ep.vendor_id == OrtDevice::VendorIds::AMD) return true; + if (CaseInsensitiveCompare(target_specifier, "intel") && + ep.vendor_id == OrtDevice::VendorIds::INTEL) return true; + // Heuristic: gpu:nvidia -> CUDA + if 
(CaseInsensitiveCompare(target_specifier, "nvidia") && + ep.ep_name == kCudaExecutionProvider) return true; + } + return false; + } + // "accelerator" (not cpu) + if (CaseInsensitiveCompare(target_type_str, "accelerator")) { + // Match if the EP is not a known CPU provider and its device type + // is not definitively CPU. Unknown device type (no HW/mem info) + // is treated as a potential accelerator. + return ep.ep_name != kCpuExecutionProvider && ep.device_type != OrtDevice::CPU; + } + // "npu" + if (CaseInsensitiveCompare(target_type_str, "npu")) { + if (ep.device_type == OrtDevice::NPU) return true; + return ep.ep_name == kQnnExecutionProvider || ep.ep_name == kVitisAIExecutionProvider; + } + // "fpga" + if (CaseInsensitiveCompare(target_type_str, "fpga")) { + return ep.device_type == OrtDevice::FPGA; + } + // "cuda" + if (CaseInsensitiveCompare(target_type_str, "cuda")) { + return ep.ep_name == kCudaExecutionProvider; + } + // "dml" + if (CaseInsensitiveCompare(target_type_str, "dml")) { + return ep.ep_name == kDmlExecutionProvider; + } + // Fallback: exact EP name match + return ep.ep_name == target_full; +} + +void ParseDeviceTarget(const std::string& target_full, + std::string& target_type_str, + std::string& target_specifier) { + const auto colon_pos = target_full.find(':'); + target_type_str = (colon_pos == std::string::npos) ? target_full : target_full.substr(0, colon_pos); + target_specifier = (colon_pos != std::string::npos) ? target_full.substr(colon_pos + 1) : std::string(); +} + +} // namespace + +std::optional EpLayeringMatcher::Match(gsl::span ep_devices, + const LayerAnnotation& rule) { + std::string target_type_str, target_specifier; + ParseDeviceTarget(rule.device, target_type_str, target_specifier); + + for (const auto* ep_device_ptr : ep_devices) { + if (!ep_device_ptr) continue; + const OrtEpDevice& ep_device = *ep_device_ptr; + + // Build normalized view from OrtEpDevice. 
+ // Device type comes from either the hardware device or the memory info, + // with hardware device taking priority. If neither is available, + // device_type is set to kDeviceTypeUnknown. + OrtDevice::DeviceType device_type = kDeviceTypeUnknown; + bool has_hw = ep_device.device != nullptr; + if (has_hw) { + // Map OrtHardwareDeviceType to OrtDevice::DeviceType + switch (ep_device.device->type) { + case OrtHardwareDeviceType_GPU: + device_type = OrtDevice::GPU; + break; + case OrtHardwareDeviceType_NPU: + device_type = OrtDevice::NPU; + break; + case OrtHardwareDeviceType_CPU: + device_type = OrtDevice::CPU; + break; + default: + device_type = kDeviceTypeUnknown; + break; + } + } else if (ep_device.device_memory_info) { + device_type = ep_device.device_memory_info->device.Type(); + } + + EpDeviceView view{ + ep_device.ep_name, + device_type, + has_hw ? ep_device.device->vendor_id : 0u, + has_hw ? static_cast(ep_device.device->device_id) : OrtDevice::DeviceId{}, + has_hw ? std::string_view(ep_device.device->vendor) : std::string_view{}}; + + if (MatchEpDevice(view, target_type_str, target_specifier, rule.device)) { + return std::string(ep_device.ep_name); + } + } + return std::nullopt; +} + +std::optional EpLayeringMatcher::Match(const ExecutionProviders& providers, + const LayerAnnotation& rule) { + std::string target_type_str, target_specifier; + ParseDeviceTarget(rule.device, target_type_str, target_specifier); + + for (const auto& ep_shared_ptr : providers) { + if (!ep_shared_ptr) continue; + const IExecutionProvider& ep = *ep_shared_ptr; + const OrtDevice& device = ep.GetDevice(); + + EpDeviceView view{ + ep.Type(), + device.Type(), + device.Vendor(), + device.Id(), + {}}; // no vendor string available from IExecutionProvider + + if (MatchEpDevice(view, target_type_str, target_specifier, rule.device)) { + return std::string(ep.Type()); + } + } + return std::nullopt; +} + +LayeringIndex LayeringIndex::Create(const Graph& graph, + EpNameToLayeringIndices ep_map, 
+ LayeringIndexToEpName rule_map, + LayeringRules layering_rules) { + // 1. Create LayeringIndex instance with pre-computed maps + LayeringIndex index(std::move(layering_rules), std::move(ep_map), std::move(rule_map)); + + // 2. Traverse the graph and index nodes + index.ProcessGraph(graph, std::nullopt); + + return index; +} + +Status LayeringIndex::Create(const Graph& graph, + const std::string& config_string, + gsl::span ep_devices, + const ExecutionProviders& ep_providers, + const logging::Logger& logger, + std::optional& layering_index) { + LayeringRules rules; + ORT_RETURN_IF_ERROR(LayeringRules::FromConfigString(config_string, rules)); + + LOGS(logger, INFO) << "Parsed " << rules.rules.size() << " layering rules from config."; + + if (rules.rules.empty()) { + // Return no index indicating no layering + layering_index.reset(); + return Status::OK(); + } + + // Identify which EPs satisfy which rules + EpNameToLayeringIndices ep_map; + LayeringIndexToEpName rule_map; + + size_t matched_rule_count = 0; + + for (size_t i = 0, lim = rules.rules.size(); i < lim; ++i) { + const auto& rule = rules.rules[i]; + + // 1. Try matching against ep_devices (from session options) + std::optional matched_ep; + if (!ep_devices.empty()) { + matched_ep = EpLayeringMatcher::Match(ep_devices, rule); + } + + // 2. If not matched, try matching against Registered EPs + if (!matched_ep) { + matched_ep = EpLayeringMatcher::Match(ep_providers, rule); + } + + if (matched_ep) { + const std::string& ep_type = *matched_ep; + ep_map[ep_type].insert(i); + // Ensure 1:1 mapping from rule index to EP type + // Note: A rule index refers to a unique entry in LayeringRules::rules vector. + // So 'i' is unique. 
+ rule_map[i] = ep_type; + matched_rule_count++; + LOGS(logger, VERBOSE) << "Layering Rule " << i << " (" << rule.device << " -> " << rule.annotation + << ") mapped to EP: " << ep_type; + } else { + LOGS(logger, WARNING) << "Layering Rule " << i << " (" << rule.device << " -> " << rule.annotation + << ") could not be mapped to any available Execution Provider."; + } + } + + LOGS(logger, INFO) << "LayeringIndex created. Matched " << matched_rule_count + << " out of " << rules.rules.size() << " rules to available Execution Providers."; + + layering_index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + return Status::OK(); +} + +void LayeringIndex::ProcessGraph(const Graph& graph, std::optional parent_layer_id) { + // 3. Create entry for this graph instance + bool was_updated = false; + std::optional new_index; + GraphLayeringIndex* current_graph_index_ptr = nullptr; + auto found = graph_index_.find(&graph); + if (found != graph_index_.end()) { + current_graph_index_ptr = &found->second; + } else { + new_index.emplace(); + current_graph_index_ptr = &(*new_index); + } + GraphLayeringIndex& current_graph_index = *current_graph_index_ptr; + + for (auto& node : graph.Nodes()) { + std::optional matched_rule_idx = std::nullopt; + + // 4. For every node query its annotation + const std::string& annotation = node.GetLayeringAnnotation(); + if (!annotation.empty()) { + // If it has an annotation try to match it + matched_rule_idx = matcher_.Match(annotation); + } + + // 5. 
If node has no annotation, inherit from subgraph parent node + if (!matched_rule_idx && parent_layer_id) { + matched_rule_idx = parent_layer_id; + } + + // Record assignment if we have a match + if (matched_rule_idx) { + const size_t rule_idx = *matched_rule_idx; + + // Only assign if this rule maps to a valid EP in our configuration + if (layering_index_to_ep_name_.count(rule_idx)) { + ORT_IGNORE_RETURN_VALUE(current_graph_index.node_to_layering_index_.insert_or_assign(node.Index(), rule_idx)); + ORT_IGNORE_RETURN_VALUE(current_graph_index.layer_to_node_ids_[rule_idx].insert(node.Index())); + was_updated = true; + } else { + // reset since no valid EP mapping + matched_rule_idx = std::nullopt; + } + } + + // Recurse for subgraphs + if (node.ContainsSubgraph()) { + const std::optional subgraph_parent_assignment = matched_rule_idx; + for (auto& [attr_name, subgraph] : node.GetAttributeNameToSubgraphMap()) { + ProcessGraph(*subgraph, subgraph_parent_assignment); + } + } + } + if (was_updated && new_index) { + graph_index_.emplace(&graph, std::move(*new_index)); + } +} + +void LayeringIndex::Update(const Graph& graph, gsl::span nodes) { + // Ensure we have an entry for this graph (creating it if it doesn't exist, though typically it should) + bool was_updated = false; + std::optional new_index; + GraphLayeringIndex* current_graph_index_ptr = nullptr; + auto found = graph_index_.find(&graph); + if (found != graph_index_.end()) { + current_graph_index_ptr = &found->second; + } else { + new_index.emplace(); + current_graph_index_ptr = &(*new_index); + } + + auto& current_graph_index = *current_graph_index_ptr; + + for (NodeIndex node_index : nodes) { + // GetMutableNode because we want to ClearLayeringAnnotation if we use it + const Node* node = graph.GetNode(node_index); + if (!node) { + continue; + } + + const std::string& annotation = node->GetLayeringAnnotation(); + if (!annotation.empty()) { + auto matched_rule_idx = matcher_.Match(annotation); + + if 
(matched_rule_idx) { + const size_t rule_idx = *matched_rule_idx; + + // Only assign if this rule maps to a valid EP in our configuration + if (layering_index_to_ep_name_.count(rule_idx)) { + // Check if already assigned to a DIFFERENT rule, if so clean up old mapping + auto prev_assign = current_graph_index.node_to_layering_index_.find(node_index); + if (prev_assign != current_graph_index.node_to_layering_index_.end()) { + size_t old_rule = prev_assign->second; + if (old_rule != rule_idx) { + current_graph_index.layer_to_node_ids_[old_rule].erase(node_index); + } + } + + ORT_IGNORE_RETURN_VALUE(current_graph_index.node_to_layering_index_.insert_or_assign(node_index, rule_idx)); + ORT_IGNORE_RETURN_VALUE(current_graph_index.layer_to_node_ids_[rule_idx].insert(node_index)); + was_updated = true; + } + } + } + } + if (was_updated && new_index) { + graph_index_.emplace(&graph, std::move(*new_index)); + } +} + +void LayeringRuleMatcher::AddExactRule(const std::string& annotation, size_t index) { + // Only store the first occurrence (lowest index) + exact_match_rules_.insert({annotation, index}); +} + +void LayeringRuleMatcher::AddPrefixRule(const std::string& annotation, size_t index) { + TrieNode* current = &root_; + for (char c : annotation) { + auto p = current->children.insert({c, nullptr}); + if (p.second) { + p.first->second = std::make_unique(); + } + current = p.first->second.get(); + } + + // Only store if strictly better (lower index) or not set + // Since we iterate rules 0..N, if a rule index is already set for this node, + // it corresponds to a higher priority rule, so we skip overwriting it. 
+ if (!current->rule_index) { + current->rule_index = index; + } +} + +void LayeringRuleMatcher::UpdateBestMatch(std::optional& current_best, size_t candidate) const { + if (!current_best || candidate < *current_best) { + current_best = candidate; + } +} + +std::optional>> +LayeringIndex::GetLayeringRulesForThisEp(const std::string& ep_type) const { + auto hit = ep_name_to_layering_indices_.find(ep_type); + if (hit == ep_name_to_layering_indices_.end()) { + return {}; + } + return hit->second; +} + +std::optional LayeringIndex::GetNodeAssignment(const Graph& graph, NodeIndex node_id) const { + auto hit = graph_index_.find(&graph); + if (hit == graph_index_.end()) { + return {}; + } + + // Nodes in subgraph that were not annotated has already inherited their + // annotation if any from the parent node of the subgraph + const auto& graph_layering_index = hit->second; + auto layer_hit = graph_layering_index.node_to_layering_index_.find(node_id); + if (layer_hit != graph_layering_index.node_to_layering_index_.end()) { + return layer_hit->second; + } + return {}; +} + +void LayeringIndex::MakeNodeUnassigned(const Graph& graph, NodeIndex node_id) { + auto hit = graph_index_.find(&graph); + if (hit == graph_index_.end()) { + return; + } + auto& graph_layering_index = hit->second; + auto node_to_layer_hit = graph_layering_index.node_to_layering_index_.find(node_id); + std::optional layer_idx; + if (node_to_layer_hit != graph_layering_index.node_to_layering_index_.end()) { + // Get the layer index + layer_idx = node_to_layer_hit->second; + graph_layering_index.node_to_layering_index_.erase(node_to_layer_hit); + } + // Remove node from layer collection + if (layer_idx) { + auto layer_to_nodes_hit = graph_layering_index.layer_to_node_ids_.find(*layer_idx); + if (layer_to_nodes_hit != graph_layering_index.layer_to_node_ids_.end()) { + layer_to_nodes_hit->second.erase(node_id); + if (layer_to_nodes_hit->second.empty()) { + 
graph_layering_index.layer_to_node_ids_.erase(layer_to_nodes_hit); + } + } + } +} + +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/layering_annotations.h b/onnxruntime/core/framework/layering_annotations.h new file mode 100644 index 0000000000000..5d58e9ace2471 --- /dev/null +++ b/onnxruntime/core/framework/layering_annotations.h @@ -0,0 +1,213 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + +#include "core/common/inlined_containers.h" +#include "core/common/status.h" +#include "core/graph/basic_types.h" +#include "core/common/logging/logging.h" +#include "gsl/gsl" +#include +#include +#include +#include + +struct OrtEpDevice; + +namespace onnxruntime { +class ExecutionProviders; +class Graph; + +/// +/// Annotation extracted from kOrtSessionOptionsLayerAssignmentSettings session configuration option. +/// +struct LayerAnnotation { + std::string device; + std::string annotation; + bool prefix_match; +}; + +/// +/// This struct is a container for layering rules extracted from the kOrtSessionOptionsLayerAssignmentSettings +/// session configuration option. +/// +struct LayeringRules { + std::vector rules; + /// + /// Parses the layering rules from the given configuration string. + /// The configuration string is in the following format.: + /// 'cpu(L1,L2); gpu(L3,=L4)' where cpu or gpu denote the target EP. + /// L1, L2, L3 are annotations that can be matched to node annotations in the graph. The '=' prefix denotes + /// exact match. The position of the annotation (L1, L2, L3) in the list denotes its priority in matching (left to right). + /// However, the prefix annotations will always have higher priority than the exact match annotations regardless + /// of their position in the list. 
In the above example, L1 has the highest priority, followed by L2, + /// then L3 and finally L4. The rules are separated by ';' and there can be multiple rules for different EPs. + /// + /// The configuration string to parse. + /// Output parameter where the parsed rules will be stored. + /// Status indicating success or failure (e.g. due to format errors). + static common::Status FromConfigString(const std::string& config_value, LayeringRules& rules); +}; + +/// +/// This class matches node annotations against layering rules. +/// +class LayeringRuleMatcher { + public: + explicit LayeringRuleMatcher(const LayeringRules& rules); + + /// + /// The method returns the index of the best matching rule for the given annotation + /// if it exists + /// + /// annotation retrieved from protobuf node metadata + /// index of the matching LayeringRule if it exists + std::optional Match(const std::string& node_annotation) const; + + private: + struct TrieNode { + InlinedHashMap> children; + std::optional rule_index; + }; + + TrieNode root_; + InlinedHashMap exact_match_rules_; + + void AddExactRule(const std::string& annotation, size_t index); + + void AddPrefixRule(const std::string& annotation, size_t index); + + void UpdateBestMatch(std::optional& current_best, size_t candidate) const; +}; + +namespace EpLayeringMatcher { +/// +/// Matches a list of available OrtEpDevices against the device string specified in the LayerAnnotation. +/// Returns the EP Type string of the first device that matches the rule. +/// +/// The list of available EP devices. +/// The rule containing the device designator. +/// Optional containing the matched EP type, nullopt otherwise. +std::optional Match(gsl::span ep_devices, + const LayerAnnotation& rule); + +/// +/// Matches a collection of ExecutionProviders against the device string specified in the LayerAnnotation. +/// Returns the EP Type string of the first provider that matches the rule. 
+/// +/// The collection of available Execution Providers. +/// The rule containing the device designator. +/// Optional containing the matched EP type, nullopt otherwise. +std::optional Match(const ExecutionProviders& providers, const LayerAnnotation& rule); +} // namespace EpLayeringMatcher + +// This class contains indexing information about the entire graph +// per sub-graph info is stored in graph_index_ +class LayeringIndex { + public: + // mapping of EP name/type to a set of LayeringRule indices mapped to that EP. + using EpNameToLayeringIndices = InlinedHashMap>; + // mapping of LayeringRule index to EP name/type, reverse of the above + using LayeringIndexToEpName = InlinedHashMap; + + /// + /// Creates a fully initialized LayeringIndex. + /// + /// The graph to traverse and index. + /// Pre-populated mapping of EP names to their applicable rule indices. + /// Pre-populated mapping of rule indices to EP names. + /// Matcher to resolve node annotations to rule indices. + static LayeringIndex Create(const Graph& graph, + EpNameToLayeringIndices ep_map, + LayeringIndexToEpName rule_map, + LayeringRules layering_rules); + + /// + /// Factory method that creates a LayeringIndex by parsing configuration, matching rules against + /// available devices/providers, and indexing the graph. + /// + /// The graph to index. + /// The configuration string containing layering rules. + /// Available OrtEpDevices to match rules against. + /// Available ExecutionProviders to match rules against (fallback). + /// Logger for reporting information/errors. + /// Output parameter for the created LayeringIndex. Returns no index if + /// no valid layering rules discovered. + /// Status indicating success or failure. 
+ static Status Create(const Graph& graph, + const std::string& config_string, + gsl::span ep_devices, + const ExecutionProviders& ep_providers, + const logging::Logger& logger, + std::optional& layering_index); + + // Returns the Layering Rule indices mapped to the EP if any + std::optional>> + GetLayeringRulesForThisEp(const std::string& ep_type) const; + + // Returns the parsed layering rules + const LayeringRules& GetRules() const noexcept { return rules_; } + + // This function returns an index for the Layering rule the node is assigned to if any + std::optional GetNodeAssignment(const Graph& graph, NodeIndex node_id) const; + + // This is used when an EP fails to claim a node during partitioning so we make it + // available for other EPs + void MakeNodeUnassigned(const Graph& graph, NodeIndex node_id); + /// + /// Updates the layering index for a specific set of nodes in a graph. + /// This checks if the nodes have annotations, and if so, matches them against the rules + /// and updates the assignment. + /// + /// The graph containing the nodes. + /// Indices of nodes to check and update. + void Update(const Graph& graph, gsl::span nodes); + + private: + LayeringRules rules_; + LayeringRuleMatcher matcher_; + // These stay constant + EpNameToLayeringIndices ep_name_to_layering_indices_; + LayeringIndexToEpName layering_index_to_ep_name_; + + using SetOfNodes = InlinedHashSet; + using LayerIndexToNodes = InlinedHashMap; + using NodeIndexToLayeringIndex = InlinedHashMap; + + /// + /// This struct contains the result of layering assignment for a graph. + /// The struct first reflects pre-assignment according to the configuration. + /// However, as we partition the graph, some nodes may be moved to unassigned sections + /// to make them available to subsequent partitioning passes. 
+ /// + struct GraphLayeringIndex { + // Node to layering idx assignment map 1:1 + // If the node is not in this map, it is unassigned + NodeIndexToLayeringIndex node_to_layering_index_; + // This map contains mapping of LayeringRule index to the list of node ids + // Reverse from the above 1:M + LayerIndexToNodes layer_to_node_ids_; + }; + + LayeringIndex(LayeringRules layering_rules, EpNameToLayeringIndices ep_name_to_layering_indices, LayeringIndexToEpName layering_index_to_ep_name) + : rules_(std::move(layering_rules)), + matcher_(rules_), + ep_name_to_layering_indices_(std::move(ep_name_to_layering_indices)), + layering_index_to_ep_name_(std::move(layering_index_to_ep_name)) {} + + // Graph and sub-graphs mapping to their indices + InlinedHashMap graph_index_; + + void ProcessGraph(const Graph& graph, std::optional parent_layer_id); +}; + +} // namespace onnxruntime + +#else +namespace onnxruntime { +class LayeringIndex; +} +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 0665cc1951e60..68610ebb4be17 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -11,24 +11,31 @@ #include "core/framework/config_options.h" #include "core/framework/murmurhash3.h" +#include "core/framework/tensorprotoutils.h" #include "core/graph/constants.h" #include "core/graph/graph.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include +#include namespace onnxruntime { // Use this accountant if your resource can be counted with size_t type -class SizeTAccountant : public IResourceAccountant { +// This accountant uses NodeAllocationStats to compute resource consumption per node +// which can be collected and saved to a file OR loaded from a file and used for partitioning. +// This is currently used for CUDA EP. 
+class SizeBasedStatsAccountant : public IResourceAccountant { public: - SizeTAccountant() = default; - ~SizeTAccountant() = default; + SizeBasedStatsAccountant() = default; + ~SizeBasedStatsAccountant() = default; - SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + SizeBasedStatsAccountant(size_t threshold, InlinedHashMap&& node_stats) : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} - explicit SizeTAccountant(InlinedHashMap&& node_stats) + explicit SizeBasedStatsAccountant(size_t threshold) : IResourceAccountant(threshold) {} + + explicit SizeBasedStatsAccountant(InlinedHashMap&& node_stats) : IResourceAccountant(), node_stats_(std::move(node_stats)) {} ResourceCount GetConsumedAmount() const noexcept override { @@ -46,20 +53,99 @@ class SizeTAccountant : public IResourceAccountant { } } - ResourceCount ComputeResourceCount(const Node& node) const override { - const auto node_name = MakeUniqueNodeName(node); - auto hit = node_stats_.find(node_name); - if (hit != node_stats_.end()) { - const auto& stats = hit->second; - return stats.input_sizes + stats.initializers_sizes + - stats.total_dynamic_sizes + stats.total_temp_allocations; + ResourceCount ComputeResourceCount(const Node& node) override { + if (node_stats_) { + const auto node_name = MakeUniqueNodeName(node); + auto hit = node_stats_->find(node_name); + if (hit != node_stats_->end()) { + const auto& stats = hit->second; + return stats.input_sizes + stats.initializers_sizes + + stats.total_dynamic_sizes + stats.total_temp_allocations; + } + return static_cast(0U); + } else { + const auto* graph = node.GetContainingGraph(); + if (!graph) return static_cast(0); + + SafeInt total_size = 0; + for (const auto* input_def : node.InputDefs()) { + if (!input_def->Exists()) continue; + + const auto& name = input_def->Name(); + constexpr bool check_outer_scope = true; + const auto* tensor_proto = graph->GetInitializer(name, check_outer_scope); + + if (tensor_proto) { + // Skip if 
already committed from a previous partitioning iteration + if (committed_weights_.count(name) > 0) { + continue; + } + + // Skip if already pending from another node in this GetCapability pass + if (pending_weights_.count(name) > 0) { + continue; + } + + size_t size = 0; + auto status = utils::GetSizeInBytesFromTensorProto<0>(*tensor_proto, &size); + + if (status.IsOK()) { + total_size += size; + pending_weights_.insert(name); + pending_weights_by_node_[node.Index()].insert(name); + } + } + } + + // Account for intermediate output tensors when shape info is available. + // GetSizeInBytesFromTensorTypeProto will only succeed when all dims are known + // (static shape) and a valid element type is present, so dynamic outputs are + // naturally skipped. + SafeInt output_size = 0; + for (const auto* output_def : node.OutputDefs()) { + if (!output_def->Exists() || !output_def->HasTensorOrScalarShape()) continue; + const auto* type_proto = output_def->TypeAsProto(); + if (!type_proto || !utils::HasTensorType(*type_proto)) continue; + + size_t size = 0; + if (utils::GetSizeInBytesFromTensorTypeProto<0>(type_proto->tensor_type(), &size).IsOK()) { + output_size += size; + } + } + + // Apply a safety multiplier for workspace/temp allocations we can't see + constexpr size_t kAdHocSafetyMultiplierPercent = 150; // 1.5x + SafeInt estimated = total_size + output_size; + return static_cast(estimated * kAdHocSafetyMultiplierPercent / 100); + } + } + + void ResetPendingWeights() override { + pending_weights_.clear(); + pending_weights_by_node_.clear(); + } + + void CommitWeightsForNode(NodeIndex node_index) override { + auto it = pending_weights_by_node_.find(node_index); + if (it != pending_weights_by_node_.end()) { + for (const auto& name : it->second) { + pending_weights_.erase(name); + } + committed_weights_.insert(it->second.begin(), it->second.end()); + pending_weights_by_node_.erase(it); } - return static_cast(0U); } private: size_t consumed_amount_ = 0; - InlinedHashMap 
node_stats_; + std::optional> node_stats_; + // Weights committed from previous partitioning iterations. + // These persist across GetCapability passes. + InlinedHashSet committed_weights_; + // Flat set of all pending weight names for O(1) membership checks. + InlinedHashSet pending_weights_; + // Same pending weights keyed by node index, used by CommitWeightsForNode. + InlinedHashMap> pending_weights_by_node_; }; struct NodeStatsRecorder::Impl { @@ -155,10 +241,11 @@ static Status LoadNodeAllocationStats( return Status::OK(); } -Status NodeStatsRecorder::CreateAccountants( +Status CreateAccountants( const ConfigOptions& config_options, const std::filesystem::path& model_path, std::optional& acc_map) { + std::optional result; // Check if CUDA partitioning settings are provided const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( kOrtSessionOptionsResourceCudaPartitioningSettings, ""); @@ -166,29 +253,34 @@ Status NodeStatsRecorder::CreateAccountants( if (!resource_partitioning_settings.empty()) { auto splits = utils::SplitString(resource_partitioning_settings, ",", true); if (splits.size() == 2) { - if (splits[1].empty()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid resource partitioning settings"); - } - - InlinedHashMap loaded_stats; - ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], loaded_stats)); - - std::optional result; auto& map = result.emplace(); + std::optional cuda_memory_limit; if (!splits[0].empty()) { - size_t cuda_memory_limit = 0; - ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(std::string{splits[0]}, cuda_memory_limit)); - cuda_memory_limit = SafeInt(cuda_memory_limit) * 1024; // to bytes + cuda_memory_limit.emplace(0U); + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(std::string{splits[0]}, *cuda_memory_limit)); + cuda_memory_limit = SafeInt(*cuda_memory_limit) * 1024; // to bytes + } + + std::optional> loaded_stats; + if (!splits[1].empty()) { + 
loaded_stats.emplace(); + ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], *loaded_stats)); + } + + if (cuda_memory_limit && loaded_stats) { map.insert_or_assign(kCudaExecutionProvider, - std::make_unique(cuda_memory_limit, - std::move(loaded_stats))); - } else { + std::make_unique(*cuda_memory_limit, + std::move(*loaded_stats))); + } else if (cuda_memory_limit) { map.insert_or_assign(kCudaExecutionProvider, - std::make_unique(std::move(loaded_stats))); + std::make_unique(*cuda_memory_limit)); + } else if (loaded_stats) { + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(std::move(*loaded_stats))); + } else { + map.insert_or_assign(kCudaExecutionProvider, std::make_unique()); } - - acc_map = std::move(result); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid format for: ", kOrtSessionOptionsResourceCudaPartitioningSettings, @@ -196,6 +288,7 @@ Status NodeStatsRecorder::CreateAccountants( } } + acc_map = std::move(result); return Status::OK(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index bee7f048b7c6e..74fbe4d24de96 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -2531,5 +2531,18 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& initializer, std return UnpackInitializerData(initializer, std::filesystem::path(), unpacked_tensor); } +std::optional GetNodeProtoLayeringAnnotation(const ONNX_NAMESPACE::NodeProto& node_proto) { + std::optional result; + for (const auto& prop : node_proto.metadata_props()) { + if (prop.key() == kNodeProtoLayerAnnotation) { + if (!prop.value().empty()) { + result = prop.value(); + break; + } + } + } + return result; +} + } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index e7649c072416c..8b22e8d6d1c89 100644 --- 
a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -671,5 +671,15 @@ common::Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& initiali */ common::Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& initializer, std::vector& unpacked_tensor); + +constexpr const char* kNodeProtoLayerAnnotation = "layer_ann"; + +/** + * This function examines the given node proto and looks into its metadata_props. + * It returns the first non-empty value found for the key kNodeProtoLayerAnnotation. + * A node is expected to have only one such annotation. + * If no non-empty annotation is found, std::nullopt is returned. + */ +std::optional GetNodeProtoLayeringAnnotation(const ONNX_NAMESPACE::NodeProto& node_proto); } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3599edbfcd357..e7da5a16930c6 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3935,6 +3935,20 @@ Status Graph::RemovedUnusedInitializersOrtFormat() { auto result = ForThisAndAllSubgraphs(all_subgraphs, cleanup_func); return result; } + +Status Graph::RemoveAllLayeringAnnotations() { + std::vector all_subgraphs; + FindAllSubgraphs(all_subgraphs); + auto cleanup_func = [](Graph& graph) { + for (auto& node : graph.Nodes()) { + node.ClearLayeringAnnotation(); + } + return Status::OK(); + }; + + return ForThisAndAllSubgraphs(all_subgraphs, cleanup_func); +} + #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const std::string& Graph::Name() const noexcept { @@ -4371,6 +4385,13 @@ Node& Graph::AddNode(const Node& other) { &other.GetAttributes(), other.Domain()); + // Preserve layering annotation from the source node so that graph transformers + // that reconstruct nodes (or function inlining) retain the EP assignment hint. 
+ const auto& annotation = other.GetLayeringAnnotation(); + if (!annotation.empty()) { + new_node.SetLayeringAnnotation(annotation); + } + return new_node; } @@ -4396,6 +4417,13 @@ Node& Graph::AddNode(const NodeProto& node_proto, &attributes, node_proto.domain()); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + auto maybe_annotation = utils::GetNodeProtoLayeringAnnotation(node_proto); + if (maybe_annotation) { + new_node.SetLayeringAnnotation(std::move(*maybe_annotation)); + } +#endif // + // Perf optimization: temporarily set NodeProto in Node so we don't need to call Node::ToProto prior to // calling onnx::check_node // NOTE: We don't handle a node with kOnnxDomainAlias. The entry in schema_registry_ uses kOnnxDomain, @@ -4630,6 +4658,38 @@ Node& Graph::AddNode(const std::string& name, return *node; } +Node& Graph::AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + gsl::span input_args, + gsl::span output_args, + const Node& annotation_source, + const NodeAttributes* attributes, + const std::string& domain) { + auto& new_node = AddNode(name, op_type, description, input_args, output_args, attributes, domain); + const auto& annotation = annotation_source.GetLayeringAnnotation(); + if (!annotation.empty()) { + new_node.SetLayeringAnnotation(annotation); + } + return new_node; +} + +Node& Graph::AddNode(const std::string& name, + const std::string& op_type, + const std::string& description, + gsl::span input_args, + gsl::span output_args, + const Node& annotation_source, + NodeAttributes&& attributes, + const std::string& domain) { + auto& new_node = AddNode(name, op_type, description, input_args, output_args, std::move(attributes), domain); + const auto& annotation = annotation_source.GetLayeringAnnotation(); + if (!annotation.empty()) { + new_node.SetLayeringAnnotation(annotation); + } + return new_node; +} + bool Graph::RemoveNode(NodeIndex p_index) { auto node = GetNode(p_index); if 
(nullptr == node) { @@ -6074,7 +6134,8 @@ Status Graph::InlineIfSubgraph(bool condition_value, Node& if_node, const loggin return Status::OK(); } -Status Graph::InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline) { +Status Graph::InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline, + const std::string& parent_annotation) { auto to_node_arg = [this](const std::string& name) { return &this->GetOrCreateNodeArg(name, nullptr); }; @@ -6109,28 +6170,31 @@ Status Graph::InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_i for (const auto& node_attr : inlined_node->attribute()) { new_attr_map.insert_or_assign(node_attr.name(), node_attr); } - ORT_IGNORE_RETURN_VALUE(AddNode(inlined_node->name(), inlined_node->op_type(), - inlined_node->doc_string(), inputs, outputs, - &new_attr_map, inlined_node->domain())); + auto& new_node = AddNode(inlined_node->name(), inlined_node->op_type(), + inlined_node->doc_string(), inputs, outputs, + &new_attr_map, inlined_node->domain()); + + // Nodes that come from function_proto currently can not have any annotations. + // So we set it to parent. + if (!parent_annotation.empty()) { + new_node.SetLayeringAnnotation(parent_annotation); + } } return Status::OK(); } Status Graph::InlineFunction(Node& callnode) { - // Remove output edges. Requirement for RemoveNode() below. 
- auto output_edges = callnode.GetRelationships().output_edges; // copy so RemoveEdge doesn't invalidate iterator - for (const auto& output_edge : output_edges) { - RemoveEdge(callnode.Index(), output_edge.GetNode().Index(), output_edge.GetSrcArgIndex(), - output_edge.GetDstArgIndex()); - } - // create a uniq_identifier to append to every node name and intermediate input\outputs // to make sure there are no unintended duplicates std::string base_uniq_identifier{"_inlfunc_"}; base_uniq_identifier.append(callnode.OpType()); const auto uniq_identifier = GenerateNodeName(base_uniq_identifier); + // Capture the parent function node's layering annotation before inlining. + // Inlined nodes that don't already have their own annotation will inherit this. + const std::string parent_annotation = callnode.GetLayeringAnnotation(); + // Replace a (function-call) node by an inlined graph. if (!callnode.GetFunctionBody()) { // This is the normal use-case: inlining a FunctionProto (representing @@ -6142,7 +6206,7 @@ Status Graph::InlineFunction(Node& callnode) { function_utils::Specialize(inlined_fp, callnode, uniq_identifier); // In this case, global Resolve() will take care of everything. - ORT_RETURN_IF_ERROR(InlineFunctionProto(inlined_fp)); + ORT_RETURN_IF_ERROR(InlineFunctionProto(inlined_fp, parent_annotation)); } else { // Uncommon scenario. Inlining a node representing a fused sub-graph. // TODO: Unclear that this feature is needed. Can this be removed? 
@@ -6161,11 +6225,18 @@ Status Graph::InlineFunction(Node& callnode) { outputs.push_back(&n_output); } - AddNode(subgraph_node.Name() + uniq_identifier, subgraph_node.OpType(), subgraph_node.Description(), - inputs, - outputs, - &subgraph_node.GetAttributes(), - subgraph_node.Domain()); + auto& new_node = AddNode(subgraph_node.Name() + uniq_identifier, subgraph_node.OpType(), + subgraph_node.Description(), + inputs, + outputs, + &subgraph_node.GetAttributes(), + subgraph_node.Domain()); + if (!subgraph_node.GetLayeringAnnotation().empty()) { + new_node.SetLayeringAnnotation(subgraph_node.GetLayeringAnnotation()); + } else if (!parent_annotation.empty()) { + // If the subgraph node doesn't have its own annotation, use the parent function node's annotation. + new_node.SetLayeringAnnotation(parent_annotation); + } } } @@ -6192,9 +6263,15 @@ Status Graph::InlineFunction(Node& callnode) { } } - RemoveNode(callnode.Index()); + // Requirement for RemoveNode() below. + // copy so RemoveEdge doesn't invalidate iterator + auto output_edges = callnode.GetRelationships().output_edges; + for (const auto& output_edge : output_edges) { + RemoveEdge(callnode.Index(), output_edge.GetNode().Index(), output_edge.GetSrcArgIndex(), + output_edge.GetDstArgIndex()); + } - // std::cout << "Graph after inlining\n\n" << *this << std::endl << std::flush; + RemoveNode(callnode.Index()); return Status::OK(); } diff --git a/onnxruntime/core/graph/graph_utils.cc b/onnxruntime/core/graph/graph_utils.cc index 0480263befdd1..85de654581161 100644 --- a/onnxruntime/core/graph/graph_utils.cc +++ b/onnxruntime/core/graph/graph_utils.cc @@ -32,6 +32,154 @@ static int GetIndexFromName(const Node& node, const std::string& name, bool is_i return static_cast(index); } +Status CreateFilteredIndexedGraph(gsl::span nodes, const Graph& graph, + std::unique_ptr& result) { + // Following data structures help determine the final inputs/outputs of the subgraph. 
+ // Note: The 'subgraph' here refers to a graph that contains a subset of nodes in the 'src_graph'. + + // Pre-pass: Identify all outputs produced by nodes within the subgraph. + // This allows O(1) checks to determine if an input is internal or from the boundary. + InlinedHashSet node_set; + InlinedHashSet internal_outputs; + for (size_t i = 0, lim = nodes.size(); i < lim; i++) { + const auto& node = *nodes[i]; + node_set.insert(node.Index()); + for (const auto& output : node.OutputDefs()) { + internal_outputs.insert(output); + } + } + + // Source graph output names + InlinedHashSet graph_output_names; + for (const auto* output_arg : graph.GetOutputs()) { + graph_output_names.insert(output_arg->Name()); + } + + // These maps store the inputs and outputs of the subgraph. + // Value is order index to maintain deterministic order. + InlinedHashMap subgraph_inputs, subgraph_outputs; + + int input_order = 0; + int output_order = 0; + + std::unique_ptr indexed_sub_graph = std::make_unique(); + InlinedVector initializers; + + // Add nodes and identify boundary inputs/outputs + for (size_t i = 0, lim = nodes.size(); i < lim; i++) { + const auto& node = *nodes[i]; + indexed_sub_graph->nodes.push_back(node.Index()); + + // Process Inputs: If an input is not produced internally, it's a subgraph input. 
+ auto process_inputs = [&](gsl::span inputs) { + for (const auto& input : inputs) { + if (!input->Exists()) continue; + + const auto* tensor_proto = graph.GetConstantInitializer(input->Name(), true); + if (tensor_proto != nullptr) { + initializers.push_back(input->Name()); + continue; + } + + // If not produced by this subgraph, it's a boundary input + if (internal_outputs.count(input) == 0) { + // Use insert to keep the first occurrence's order + auto emplace_result = subgraph_inputs.emplace(input, input_order); + if (emplace_result.second) { + ++input_order; + } + } + } + }; + + process_inputs(gsl::make_span(node.InputDefs().data(), node.InputDefs().size())); + process_inputs(gsl::make_span(node.ImplicitInputDefs().data(), node.ImplicitInputDefs().size())); + + // Process Outputs: If an output is graph output OR consumed externally, it's a subgraph output. + for (const auto& output : node.OutputDefs()) { + if (!output->Exists()) continue; + + bool is_boundary_output = false; + + // 1. Is it a graph output? + if (graph_output_names.count(output->Name()) > 0) { + is_boundary_output = true; + } else { + // 2. Is it consumed by any node outside the subgraph? 
+ for (auto it = node.OutputEdgesBegin(), end = node.OutputEdgesEnd(); it != end; ++it) { + // Check if the edge uses this specific output + if (it->GetSrcArgIndex() < static_cast(node.OutputDefs().size()) && + node.OutputDefs()[it->GetSrcArgIndex()] == output) { + if (node_set.count(it->GetNode().Index()) == 0) { + is_boundary_output = true; + break; + } + } + } + } + + if (is_boundary_output) { + subgraph_outputs.insert({output, output_order++}); + } + } + } + + std::multimap inputs, outputs; + + // Get the input order of the original graph + InlinedHashMap original_inputs; + int order = 0; + for (const auto* input : graph.GetInputs()) { + original_inputs[input] = order++; + } + + // input order needs to be consistent with original graph's input order + for (const auto& [node_arg, subgraph_input_order] : subgraph_inputs) { + const auto original_input_it = original_inputs.find(node_arg); + + if (original_input_it != original_inputs.end()) { + inputs.emplace( + original_input_it->second, // input order from original graph + node_arg); + } else { + inputs.emplace( + subgraph_input_order, // input order from subgraph + node_arg); + } + } + + // Sort outputs by the order they were added + for (const auto& [node_arg, subgraph_output_order] : subgraph_outputs) { + outputs.emplace(subgraph_output_order, node_arg); + } + + std::unique_ptr meta_def = std::make_unique(); + meta_def->name = "sub_graph"; + meta_def->since_version = 1; + + // Assign inputs and outputs to subgraph's meta_def + for (const auto& input : inputs) { + if (input.second->Exists()) { + meta_def->inputs.push_back(input.second->Name()); + } + } + + for (const auto& initializer : initializers) { + meta_def->constant_initializers.push_back(initializer); + } + + for (const auto& output : outputs) { + if (output.second->Exists()) { + meta_def->outputs.push_back(output.second->Name()); + } + } + + indexed_sub_graph->SetMetaDef(std::move(meta_def)); + result = std::move(indexed_sub_graph); + + return 
Status::OK(); +} + #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) #if !defined(ORT_MINIMAL_BUILD) @@ -1010,6 +1158,5 @@ NodeArg& CreateNodeArg(Graph& graph, const NodeArg& base_arg) { } #endif // !defined(ORT_MINIMAL_BUILD) - } // namespace graph_utils } // namespace onnxruntime diff --git a/onnxruntime/core/graph/graph_utils.h b/onnxruntime/core/graph/graph_utils.h index 256a6fc81495d..2106da1a96327 100644 --- a/onnxruntime/core/graph/graph_utils.h +++ b/onnxruntime/core/graph/graph_utils.h @@ -475,5 +475,21 @@ NodeArg& CreateNodeArg(Graph& graph, const NodeArg& base_arg); #endif // !defined(ORT_MINIMAL_BUILD) +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + +/// +/// This function creates an indexed subgraph from a collection of nodes +/// using the graph instance. The IndexedSubgraph can then be used to create +/// a filtered GraphViewer instance that only contains the nodes in the collection. +/// +/// +/// +/// +/// +Status CreateFilteredIndexedGraph(gsl::span nodes, const Graph& graph, + std::unique_ptr& indexed_subgraph); + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + } // namespace graph_utils } // namespace onnxruntime diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.cpp b/onnxruntime/core/mlas/lib/qnbitgemm.cpp index e861a26f188ba..f649d8ab38648 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm.cpp @@ -31,10 +31,8 @@ enum QNBitGemmVariant { SQ4BitGemmVariant_CompFp32 = 0, SQ4BitGemmVariant_CompInt8, HQ4BitGemmVariant_CompFp16, - HQ4BitGemmVariant_CompInt8, SQ8BitGemmVariant_CompInt8, HQ8BitGemmVariant_CompFp16, - HQ8BitGemmVariant_CompInt8, // End of valid variants @@ -58,16 +56,12 @@ GetQNBitGemmVariant( return HQ4BitGemmVariant_CompFp16; } else if (ComputeType == SQNBIT_CompInt8) { return SQ4BitGemmVariant_CompInt8; - } else if (ComputeType == HQNBIT_CompInt8) { - return HQ4BitGemmVariant_CompInt8; } } else if 
(BlkBitWidth == 8) { if (ComputeType == SQNBIT_CompInt8) { return SQ8BitGemmVariant_CompInt8; } else if (ComputeType == HQNBIT_CompFp16) { return HQ8BitGemmVariant_CompFp16; - } else if (ComputeType == HQNBIT_CompInt8) { - return HQ8BitGemmVariant_CompInt8; } } } @@ -84,6 +78,12 @@ MlasIsQNBitGemmAvailable( MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { + // HQNBIT_CompInt8 uses the same MLAS kernels as SQNBIT_CompInt8. + // The operator handles fp16<->fp32 conversion and delegates to the SQ path. + if (ComputeType == HQNBIT_CompInt8) { + ComputeType = SQNBIT_CompInt8; + } + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return false; @@ -101,7 +101,7 @@ MlasIsQNBitGemmAvailable( Dispatch->HQ4BitGemmKernel_CompFp16 != nullptr && Dispatch->HQ4BitBlkDequantBForHgemm_CompFp16 != nullptr; } - case SQ4BitGemmVariant_CompInt8: { // SQ4BitGemmKernel_BlkSum_CompInt8 + case SQ4BitGemmVariant_CompInt8: { return (Dispatch->SQ4BitGemmKernel_Packed_CompInt8 != nullptr && Dispatch->QuantizeA_Packed_CompInt8 != nullptr) || (Dispatch->SQ4BitGemmKernel_CompInt8 != nullptr && Dispatch->QuantizeARow_CompInt8 != nullptr) || @@ -117,16 +117,6 @@ MlasIsQNBitGemmAvailable( Dispatch->HQ8BitBlkDequantBForHgemm_CompFp16 != nullptr && Dispatch->HQ4BitGemmKernel_CompFp16 != nullptr; } - case HQ4BitGemmVariant_CompInt8: { - return - (Dispatch->SQ4BitGemmKernel_CompInt8 != nullptr && Dispatch->QuantizeARow_CompInt8 != nullptr) || - (Dispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr && Dispatch->QuantizeARowComputeBlkSum_CompInt8 != nullptr); - } - case HQ8BitGemmVariant_CompInt8: { - return Dispatch->SQ8BitGemmPackQuantBDataAndBlkSum != nullptr && - Dispatch->SQ8BitGemmKernel_BlkSum_CompInt8 != nullptr && - Dispatch->QuantizeARowComputeBlkSum_CompInt8 != nullptr; - } default: { return false; } @@ -270,16 +260,6 @@ struct PerGemmQuantAWorkspace { size_t M_, BlockCountK_, BlkLen_; }; -// Workspace bundle for HQ8BitGemm_CompInt8. 
-// Contains QuantA workspace and pre-extracted float B pointers from PackedQuantBDataStruct. -struct HQ8BitCompInt8PerGemmWorkspace { - PerGemmQuantAWorkspace quant_a; - std::byte* PackedQuantBData; - float* PackedQuantBScale; - float* QuantBBlkSum; - float* BlkUnsignedQuantAZeroPointCorrection; -}; - void MLASCALL MlasQNBitGemmPackQuantBData( size_t N, @@ -318,20 +298,6 @@ MlasQNBitGemmPackQuantBData( ThreadPool, BackendKernelSelectorConfig ); - } else if (ComputeType == HQNBIT_CompInt8 && Dispatch->SQ4BitGemmPackQuantBData != nullptr) { - // Use SQ4BitGemmPackQuantBData directly with SQNBIT_CompInt8 to get the correct int8 - // sub-block packing format. Bypass SQ4BitGemmPackQuantBDataAndBlkSum to avoid KleidiAI - // path which would incorrectly interpret fp16 scales as float. - Dispatch->SQ4BitGemmPackQuantBData( - N, - K, - BlkLen, - SQNBIT_CompInt8, - static_cast(QuantBData), - static_cast(PackedQuantBDataAndOrBlkSumWorkspace), - ThreadPool, - BackendKernelSelectorConfig - ); } else if (ComputeType == HQNBIT_CompFp16 && Dispatch->HQ4BitGemmPackQuantBData != nullptr) { Dispatch->HQ4BitGemmPackQuantBData( N, @@ -371,7 +337,7 @@ MlasQNBitGemmPackQuantBData( ThreadPool, BackendKernelSelectorConfig ); - } else if ((ComputeType == SQNBIT_CompInt8 || ComputeType == HQNBIT_CompInt8) && Dispatch->SQ8BitGemmPackQuantBDataAndBlkSum != nullptr) { + } else if (ComputeType == SQNBIT_CompInt8 && Dispatch->SQ8BitGemmPackQuantBDataAndBlkSum != nullptr) { const size_t BlockCountK = MlasDivRoundup(K, BlkLen); PackedQuantBDataStruct packed_quant_b(PackedQuantBDataAndOrBlkSumWorkspace, N, BlockCountK, BlkLen, GetMlasPlatform().ArmNeonIsQuantActivationsUnsigned); @@ -716,213 +682,6 @@ HQ8BitGemm_CompFp16( } } -void -HQ8BitGemm_CompInt8( - const size_t BlkLen, - const size_t K, - const MLAS_QNBIT_GEMM_DATA_PARAMS* const DataParams, - void* const PerGemmWorkspace, - const size_t RangeStartM, - const size_t RangeCountM, - const size_t RangeStartN, - const size_t RangeCountN, - const 
MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* BackendKernelSelectorConfig -) -{ - MLAS_UNREFERENCED_PARAMETER(BackendKernelSelectorConfig); - constexpr size_t BlkBitWidth = 8; - - const size_t k_blks = MlasDivRoundup(K, BlkLen); - const size_t lda = k_blks * BlkLen; // separate scale array, not Q8BlkSize - const size_t ldc = DataParams->ldc; - const size_t ldb = k_blks * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - - auto* ws = static_cast(PerGemmWorkspace); - - const std::byte* QuantA = ws->quant_a.QuantData + RangeStartM * lda; - const float* QuantAScale = ws->quant_a.QuantScale + RangeStartM * k_blks; - const float* ABlockSum = ws->quant_a.BlockSum + RangeStartM * k_blks; - - const std::byte* QuantBData = ws->PackedQuantBData + RangeStartN * ldb; - const float* QuantBScale = ws->PackedQuantBScale + RangeStartN * k_blks; - const float* QuantBBlkSum = ws->QuantBBlkSum + RangeStartN * k_blks; - const float* BlkUnsignedQuantAZeroPointCorrection = - ws->BlkUnsignedQuantAZeroPointCorrection - ? ws->BlkUnsignedQuantAZeroPointCorrection + RangeStartN * k_blks - : nullptr; - - MLAS_FP16* C = DataParams->C + RangeStartM * ldc + RangeStartN; - - const MLAS_FP16* BiasFp16 = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias + RangeStartN; - - // Convert fp16 bias to fp32 - std::vector bias_fp32; - float* bias_fp32_ptr = nullptr; - if (BiasFp16 != nullptr) { - bias_fp32.resize(RangeCountN); - MlasConvertHalfToFloatBuffer(BiasFp16, bias_fp32.data(), RangeCountN); - bias_fp32_ptr = bias_fp32.data(); - } - - size_t CountN; - const size_t MaxCountN = std::min(RangeCountN, size_t{128}); - // Temporary fp32 C buffer reused across N-chunks to avoid per-iteration allocations. 
- std::vector c_temp(RangeCountM * MaxCountN); - - for (size_t n = 0; n < RangeCountN; n += CountN) { - CountN = std::min(RangeCountN - n, size_t{128}); - - const std::byte* b_col = QuantBData + n * ldb; - const float* b_col_scale = QuantBScale + n * k_blks; - const float* bias = (bias_fp32_ptr == nullptr) ? nullptr : bias_fp32_ptr + n; - const float* b_blk_sum = QuantBBlkSum + n * k_blks; - const float* blk_unsigned = - BlkUnsignedQuantAZeroPointCorrection - ? BlkUnsignedQuantAZeroPointCorrection + n * k_blks - : nullptr; - - GetMlasPlatform().QNBitGemmDispatch->SQ8BitGemmKernel_BlkSum_CompInt8( - BlkLen, - QuantA, - QuantAScale, - b_col, - b_col_scale, - nullptr, // zero points baked into BlkSum - c_temp.data(), - RangeCountM, - CountN, - K, - k_blks, - bias, - CountN, // ldc for temp buffer - ABlockSum, - b_blk_sum, - blk_unsigned - ); - - // Convert fp32 C output to fp16 and write to actual output - MLAS_FP16* c_out = C + n; - for (size_t m = 0; m < RangeCountM; m++) { - MlasConvertFloatToHalfBuffer( - c_temp.data() + m * CountN, - c_out + m * ldc, - CountN - ); - } - - if (DataParams->PostProcessor != nullptr) { - DataParams->PostProcessor->Process( - DataParams->C, RangeStartM, RangeStartN + n, - RangeCountM, CountN, ldc - ); - } - } -} - -void -HQ4BitGemm_CompInt8( - const size_t BlkLen, - const size_t K, - const MLAS_QNBIT_GEMM_DATA_PARAMS* const DataParams, - void* const PerGemmWorkspace, - const size_t RangeStartM, - const size_t RangeCountM, - const size_t RangeStartN, - const size_t RangeCountN, - const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* BackendKernelSelectorConfig -) -{ - MLAS_UNREFERENCED_PARAMETER(BackendKernelSelectorConfig); - constexpr size_t BlkBitWidth = 4; - - const size_t k_blks = MlasDivRoundup(K, BlkLen); - - const size_t lda = k_blks * Q8BlkSize(BlkLen); - const size_t ldc = DataParams->ldc; - const size_t ldb = k_blks * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - const size_t k_blks_zp_bytes = 
MlasQNBitZeroPointsForBlksSizeInBytes(k_blks); - - const std::byte* QuantA = static_cast(PerGemmWorkspace) + RangeStartM * lda; - - const std::byte* QuantBData = static_cast(DataParams->PackedQuantBData) + RangeStartN * ldb; - const MLAS_FP16* QuantBScaleFp16 = DataParams->QuantBScale + RangeStartN * k_blks; - const std::byte* QuantBZeroPoint = - (DataParams->QuantBZeroPoint == nullptr) - ? nullptr - : static_cast(DataParams->QuantBZeroPoint) + RangeStartN * k_blks_zp_bytes; - - MLAS_FP16* C = DataParams->C + RangeStartM * ldc + RangeStartN; - - const MLAS_FP16* BiasFp16 = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias + RangeStartN; - - if (GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_CompInt8 == nullptr) { - return; - } - - size_t CountN; - const size_t maxCountN = std::min(RangeCountN, size_t{128}); - // Pre-allocate reusable buffers sized for the maximum column chunk - std::vector b_col_scale_fp32(maxCountN * k_blks); - std::vector bias_fp32(maxCountN); - std::vector c_temp(RangeCountM * maxCountN); - - for (size_t n = 0; n < RangeCountN; n += CountN) { - CountN = std::min(RangeCountN - n, size_t{128}); - - const std::byte* a_row = QuantA; - const std::byte* b_col = QuantBData + n * ldb; - const std::byte* b_col_zp = - (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; - MLAS_FP16* c_blk = C + n; - const MLAS_FP16* bias_fp16 = (BiasFp16 == nullptr) ? 
nullptr : BiasFp16 + n; - - // Convert fp16 scales to fp32 for this column chunk - b_col_scale_fp32.resize(CountN * k_blks); - MlasConvertHalfToFloatBuffer(QuantBScaleFp16 + n * k_blks, b_col_scale_fp32.data(), CountN * k_blks); - - // Convert fp16 bias to fp32 - float* bias_fp32_ptr = nullptr; - if (bias_fp16 != nullptr) { - bias_fp32.resize(CountN); - MlasConvertHalfToFloatBuffer(bias_fp16, bias_fp32.data(), CountN); - bias_fp32_ptr = bias_fp32.data(); - } - - size_t RowsRemaining = RangeCountM; - size_t RowsProcessed = 0; - while (RowsRemaining > 0) { - const auto RowsHandled = GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_CompInt8( - BlkLen, - a_row, b_col, b_col_scale_fp32.data(), b_col_zp, - c_temp.data() + RowsProcessed * CountN, - RowsRemaining, CountN, K, k_blks, CountN, bias_fp32_ptr - ); - - // Convert fp32 C output to fp16 and write to actual output - for (size_t m = 0; m < RowsHandled; m++) { - MlasConvertFloatToHalfBuffer( - c_temp.data() + (RowsProcessed + m) * CountN, - c_blk + m * ldc, - CountN - ); - } - - if (DataParams->PostProcessor != nullptr) { - DataParams->PostProcessor->Process( - DataParams->C, RangeStartM + RowsProcessed, RangeStartN + n, - RowsHandled, CountN, ldc - ); - } - - c_blk += RowsHandled * ldc; - a_row += RowsHandled * lda; - - RowsProcessed += RowsHandled; - RowsRemaining -= RowsHandled; - } - } -} - void SQ4BitGemm_CompInt8( const size_t BlkLen, @@ -1305,86 +1064,6 @@ InitializeWorkspace_CompInt8( } } -template <> -void -InitializeWorkspace_CompInt8( - size_t M, - size_t N, - size_t K, - size_t BatchN, - size_t BlkLen, - const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, - void* Workspace, - size_t PerGemmWorkspaceStride, - MLAS_THREADPOOL* ThreadPool, - size_t BlkBitWidth, - const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* BackendKernelSelectorConfig -) { - MLAS_UNREFERENCED_PARAMETER(N); - MLAS_UNREFERENCED_PARAMETER(BackendKernelSelectorConfig); - - const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - - if (BlkBitWidth 
== 8) { - // For 8-bit, use QuantizeARowComputeBlkSum to produce separate QuantData, QuantScale, BlockSum. - // This matches the workspace layout expected by PerGemmQuantAWorkspace / HQ8BitCompInt8PerGemmWorkspace. - const auto QuantizeARow2 = GetMlasPlatform().QNBitGemmDispatch->QuantizeARowComputeBlkSum_CompInt8; - if (QuantizeARow2) { - MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t gemm_idx) { - const auto& data = DataParams[gemm_idx]; - - const MLAS_FP16* ARowPtr = data.A; - void* PerGemmWs = static_cast(Workspace) + gemm_idx * PerGemmWorkspaceStride; - PerGemmQuantAWorkspace quant_a_data(PerGemmWs, M, BlockCountK, BlkLen); - std::byte* QuantARowPtr = quant_a_data.QuantData; - float* QuantARowScalePtr = quant_a_data.QuantScale; - float* QuantARowBlkSum = quant_a_data.BlockSum; - - static thread_local std::vector a_row_fp32; - if (a_row_fp32.size() < K) { - a_row_fp32.resize(K); - } - - for (size_t m = 0; m < M; ++m) { - MlasConvertHalfToFloatBuffer(ARowPtr, a_row_fp32.data(), K); - QuantizeARow2(BlkLen, a_row_fp32.data(), K, QuantARowPtr, QuantARowScalePtr, QuantARowBlkSum); - - ARowPtr += data.lda; - QuantARowPtr += BlockCountK * BlkLen; - QuantARowScalePtr += BlockCountK; - QuantARowBlkSum += BlockCountK; - } - }); - } - } else { - // For 4-bit, use QuantizeARow to produce Q8BlkSize format (embedded scales). 
- const auto QuantizeARow = GetMlasPlatform().QNBitGemmDispatch->QuantizeARow_CompInt8; - const size_t QuantAStride = BlockCountK * Q8BlkSize(BlkLen); - - if (QuantizeARow) { - MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t gemm_idx) { - const auto& data = DataParams[gemm_idx]; - - const MLAS_FP16* ARowPtr = data.A; - std::byte* QuantARowPtr = static_cast(Workspace) + gemm_idx * PerGemmWorkspaceStride; - - static thread_local std::vector a_row_fp32; - if (a_row_fp32.size() < K) { - a_row_fp32.resize(K); - } - - for (size_t m = 0; m < M; ++m) { - MlasConvertHalfToFloatBuffer(ARowPtr, a_row_fp32.data(), K); - QuantizeARow(BlkLen, a_row_fp32.data(), K, QuantARowPtr); - - ARowPtr += data.lda; - QuantARowPtr += QuantAStride; - } - }); - } - } -} - template using InitializeWorkspaceFn = std::function InitializeWorkspaceFn GetInitializeWorkspace(QNBitGemmVariant variant) { - switch (variant) { - case HQ4BitGemmVariant_CompInt8: - case HQ8BitGemmVariant_CompInt8: - return InitializeWorkspace_CompInt8; - default: - return nullptr; - } + MLAS_UNREFERENCED_PARAMETER(variant); + return nullptr; } template @@ -1472,10 +1146,6 @@ GetQNBitGemm(QNBitGemmVariant variant) return HQ4BitGemm_CompFp16; case HQ8BitGemmVariant_CompFp16: return HQ8BitGemm_CompFp16; - case HQ4BitGemmVariant_CompInt8: - return HQ4BitGemm_CompInt8; - case HQ8BitGemmVariant_CompInt8: - return HQ8BitGemm_CompInt8; default: return nullptr; } @@ -1587,18 +1257,6 @@ MlasQNBitGemmBatch( PerGemmQuantAWorkspace per_gemm_quant_a_workspace(PerGemmWorkspace, M, BlockCountK, BlkLen); ComputeOperation(BlkLen, K, Data, &per_gemm_quant_a_workspace, 0, M, 0, N, BackendKernelSelectorConfig); - } else if (Variant == HQ8BitGemmVariant_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ8BitGemmKernel_BlkSum_CompInt8 != nullptr) { - // Use PackedQuantBDataStruct to extract float pointers from the packed workspace. - // The packed workspace was created with float scales during PrePack. 
- PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen, GetMlasPlatform().ArmNeonIsQuantActivationsUnsigned); - HQ8BitCompInt8PerGemmWorkspace hw{ - PerGemmQuantAWorkspace(PerGemmWorkspace, M, BlockCountK, BlkLen), - packed_quant_b.PackedQuantBData, - packed_quant_b.PackedQuantBScale, - packed_quant_b.QuantBBlkSum, - packed_quant_b.BlkUnsignedQuantAZeroPointCorrection - }; - ComputeOperation(BlkLen, K, Data, &hw, 0, M, 0, N, BackendKernelSelectorConfig); } else { ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, 0, M, 0, N, BackendKernelSelectorConfig); } @@ -1680,16 +1338,6 @@ MlasQNBitGemmBatch( PerGemmQuantAWorkspace per_gemm_quant_a_workspace(PerGemmWorkspace, M, BlockCountK, BlkLen); ComputeOperation(BlkLen, K, Data, &per_gemm_quant_a_workspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN, BackendKernelSelectorConfig); - } else if (Variant == HQ8BitGemmVariant_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ8BitGemmKernel_BlkSum_CompInt8 != nullptr) { - PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen, GetMlasPlatform().ArmNeonIsQuantActivationsUnsigned); - HQ8BitCompInt8PerGemmWorkspace hw{ - PerGemmQuantAWorkspace(PerGemmWorkspace, M, BlockCountK, BlkLen), - packed_quant_b.PackedQuantBData, - packed_quant_b.PackedQuantBScale, - packed_quant_b.QuantBBlkSum, - packed_quant_b.BlkUnsignedQuantAZeroPointCorrection - }; - ComputeOperation(BlkLen, K, Data, &hw, RangeStartM, RangeCountM, RangeStartN, RangeCountN, BackendKernelSelectorConfig); } else { ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN, BackendKernelSelectorConfig); } diff --git a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp index ac42ced83f36c..5a3c8005d8318 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp 
@@ -90,7 +90,7 @@ QNBitGemmPackQuantBDataSize( const size_t BlockCountK = MlasDivRoundup(K, BlkLen); size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - if (ComputeType == SQNBIT_CompInt8 || ComputeType == HQNBIT_CompInt8) { + if (ComputeType == SQNBIT_CompInt8) { const size_t ScaleSize = N * BlockCountK * sizeof(float); size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(float); @@ -132,7 +132,7 @@ SQ4BitGemmPackQuantBData( const size_t BlkDataSize = MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); const size_t Iterations = N * BlockCountK; // one iteration per block - const size_t SubBlkLen = (ComputeType == SQNBIT_CompInt8 || ComputeType == HQNBIT_CompInt8) + const size_t SubBlkLen = (ComputeType == SQNBIT_CompInt8) ? ((BlkLen == 16) ? 16 : 32) : 16; @@ -488,12 +488,6 @@ QNBitGemmPerGemmWorkspaceSize( return PerGemmWorkspaceSize; } } - case HQNBIT_CompInt8: { - // Same workspace layout as SQNBIT_CompInt8 for block quantization of A to int8 - const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - const size_t PerGemmWorkspaceSize = M * BlockCountK * (Q8BlkSize(BlkLen) + sizeof(float)); - return PerGemmWorkspaceSize; - } default: { return 0; } @@ -509,8 +503,7 @@ QNBitGemmPerGemmWorkspaceAlignment( MLAS_UNREFERENCED_PARAMETER(BlkLen); switch (ComputeType) { - case SQNBIT_CompInt8: - case HQNBIT_CompInt8: { + case SQNBIT_CompInt8: { return Q8BlkAlignment(); } default: { diff --git a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc index f9ae13808cf2c..f3956d5e9e0f3 100644 --- a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc +++ b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc @@ -605,7 +605,7 @@ void ApplyReshapeTransposeFusions( graph.GenerateNodeName("DQFusedMatMulNBits"), "MatMulNBits", "Fused from DQ+Reshape+Transpose+MatMul", - mnb_inputs, mnb_outputs, &mnb_attrs, kMSDomain); + mnb_inputs, mnb_outputs, *mm_node, &mnb_attrs, 
kMSDomain); mnb_node.SetExecutionProviderType(mm_node->GetExecutionProviderType()); graph_utils::RemoveNodeOutputEdges(graph, *graph.GetNode(match.matmul_idx)); @@ -784,7 +784,7 @@ void ApplyDirectDQFusions( graph.GenerateNodeName("DirectDQFusedMatMulNBits"), "MatMulNBits", "Fused from direct DQ(axis=0)+MatMul", - mnb_inputs, mnb_outputs, &mnb_attrs, kMSDomain); + mnb_inputs, mnb_outputs, *mm_node, &mnb_attrs, kMSDomain); mnb_node.SetExecutionProviderType(mm_node->GetExecutionProviderType()); graph_utils::RemoveNodeOutputEdges(graph, *graph.GetNode(match.matmul_idx)); diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 9e35550e2f845..606e91ce91bbb 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -17,7 +17,7 @@ using namespace ONNX_NAMESPACE; using namespace onnxruntime::common; namespace onnxruntime { // Add a Cast to convert Input from int64 to int32. -static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_type) { +static NodeArg* CastToInt32(Graph& graph, NodeArg* input, const Node& source_node) { auto data_type = input->TypeAsProto()->tensor_type().elem_type(); if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { return input; @@ -36,13 +36,13 @@ static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_ "Cast Input from int64 to int32", std::array{input}, std::array{&cast32}, + source_node, nullptr, kOnnxDomain); // Add attribute: "to" = 6 node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32}); - - node.SetExecutionProviderType(provider_type); + node.SetExecutionProviderType(source_node.GetExecutionProviderType()); return &cast32; } @@ -487,9 +487,9 @@ static void CreateEmbedLayernormNode(Graph& graph, NodeArg* segment_embedding, Node& layer_norm_node) { // Cast input_ids and segment_ids to int32 if needed. 
- input_ids = CastToInt32(graph, input_ids, layer_norm_node.GetExecutionProviderType()); + input_ids = CastToInt32(graph, input_ids, layer_norm_node); if (segment_ids != nullptr && segment_embedding != nullptr) { - segment_ids = CastToInt32(graph, segment_ids, layer_norm_node.GetExecutionProviderType()); + segment_ids = CastToInt32(graph, segment_ids, layer_norm_node); } NodeArg place_holder("", nullptr); @@ -514,7 +514,7 @@ static void CreateEmbedLayernormNode(Graph& graph, "fused EmbedLayerNorm subgraphs ", embed_layer_norm_input_defs, std::array{layer_norm_node.MutableOutputDefs()[0], &mask_index}, - {}, kMSDomain); + layer_norm_node, nullptr, kMSDomain); // Get attribute "epsilon" from "LayerNormalization" node if available. Else, default value // will be used. diff --git a/onnxruntime/core/optimizer/gelu_fusion.cc b/onnxruntime/core/optimizer/gelu_fusion.cc index 641bfbf388623..e2f448bf70734 100644 --- a/onnxruntime/core/optimizer/gelu_fusion.cc +++ b/onnxruntime/core/optimizer/gelu_fusion.cc @@ -178,7 +178,7 @@ Status GeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, cons "Gelu", "fused Gelu subgraphs ", gelu_input_defs, - {}, {}, op_domain); + {}, div, nullptr, op_domain); // Assign provider to this new node. Provider should be same as the provider for old node. 
gelu_node.SetExecutionProviderType(div.GetExecutionProviderType()); diff --git a/onnxruntime/core/optimizer/gemm_sum_fusion.cc b/onnxruntime/core/optimizer/gemm_sum_fusion.cc index be3c90a822fe2..c84e34a6d0dbe 100644 --- a/onnxruntime/core/optimizer/gemm_sum_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_sum_fusion.cc @@ -41,7 +41,8 @@ Status GemmSumFusion::Apply(Graph& graph, Node& gemm_node, RewriteRuleEffect& mo "Fused Gemm with Sum", new_gemm_input_defs, new_gemm_output_defs, - {}, + gemm_node, + nullptr, gemm_node.Domain()); new_gemm_node.AddAttribute("transA", static_cast(transA)); new_gemm_node.AddAttribute("transB", static_cast(transB)); diff --git a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc index da454b67aecf4..a66ad987cfaef 100644 --- a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc @@ -80,7 +80,8 @@ Status GemmTransposeFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& m "Fused Gemm with Transpose", new_gemm_input_defs, {}, - {}, + gemm_node, + nullptr, gemm_node.Domain()); new_gemm_node.AddAttribute("transA", static_cast(transA)); new_gemm_node.AddAttribute("transB", static_cast(transB)); diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index 3ade3864255ea..c10e070ef8f09 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -474,7 +474,7 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, "LayerNormalization", "fused LayerNorm subgraphs ", layer_norm_input_defs, - {}, {}, kOnnxDomain); + {}, mul_node, nullptr, kOnnxDomain); // Get constant "epsilon" from "Add2" node if available. Else, default value will be used. 
const ONNX_NAMESPACE::TensorProto* tensor_proto = graph_utils::GetConstantInitializer(graph, add2_node.MutableInputDefs()[1]->Name()); @@ -719,7 +719,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr InlinedVector layer_norm_input_defs{x_input, scale}; Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/SimplifiedLayerNormFusion/"), "SimplifiedLayerNormalization", - "fused LayerNorm subgraphs ", layer_norm_input_defs, {}, {}, kOnnxDomain); + "fused LayerNorm subgraphs ", layer_norm_input_defs, {}, mul_node, nullptr, kOnnxDomain); // Get constant "epsilon" from "Add" node if available. Else, default value will be used. const ONNX_NAMESPACE::TensorProto* tensor_proto = diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc index 5db61877811aa..f567609c979a9 100644 --- a/onnxruntime/core/optimizer/matmul_add_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc @@ -7,6 +7,7 @@ #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/initializer.h" #include "core/optimizer/matmul_add_fusion.h" +#include "core/optimizer/utils.h" #include #include @@ -204,7 +205,8 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, NodeArg* new_arg = &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(name + "_reshape_arg"), &new_arg_type); Node& reshape_node = graph.AddNode(graph.GenerateNodeName(name + "_reshape"), "Reshape", "Reshape for " + name, {is_input ? gemm_input_defs[0] : new_arg, shape_arg}, - {is_input ? new_arg : gemm_output_defs[0]}); + {is_input ? 
new_arg : gemm_output_defs[0]}, + matmul_node); reshape_node.SetExecutionProviderType(matmul_node.GetExecutionProviderType()); return &reshape_node; }; @@ -217,7 +219,8 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, } Node& gemm_node = graph.AddNode(graph.GenerateNodeName(matmul_node.Name() + "/MatMulAddFusion"), "Gemm", - "fused Matmul and Add", gemm_input_defs, gemm_output_defs); + "fused Matmul and Add", gemm_input_defs, gemm_output_defs, + matmul_node); gemm_node.SetExecutionProviderType(matmul_node.GetExecutionProviderType()); if (need_reshape) { diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.cc b/onnxruntime/core/optimizer/matmul_bn_fusion.cc index 871571ea64881..be52e26a2901f 100644 --- a/onnxruntime/core/optimizer/matmul_bn_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.cc @@ -227,6 +227,7 @@ Status MatmulBNFusion::Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& "Generated from Matmul BatchNormalization fusion", {matmul_node.MutableInputDefs()[0], &new_gemm_b_node_arg, &new_gemm_bias_node_arg}, matmul_node.MutableOutputDefs(), + matmul_node, nullptr, kOnnxDomain); diff --git a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc index 9d53e28921784..c79e4142a9ee2 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc @@ -10,6 +10,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/optimizer/qdq_transformer/qdq_util.h" +#include "core/optimizer/utils.h" namespace onnxruntime { @@ -53,6 +54,7 @@ Status DuplicateDQForOutputEdge(const graph_utils::GraphEdge& original_dq_output MakeString("Added by ", kTransformerName), dq_inputs, {&new_dq_output_nodearg}, + original_dq_node, &original_dq_node.GetAttributes(), 
original_dq_node.Domain()); diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc index b8252bc7a75b4..0d732a71b7ed0 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc @@ -194,6 +194,8 @@ Status InsertQDQPairs(Graph& graph, gsl::span insertion } } + optimizer_utils::DuplicateNodeAnnotation(*src_node, q_node); + // Add edge from src to Q node. src_node->MutableOutputDefs()[first_edge.src->arg_idx] = &pre_q_nodearg; graph.AddEdge(src_node->Index(), q_node.Index(), first_edge.src->arg_idx, 0); @@ -221,6 +223,10 @@ Status InsertQDQPairs(Graph& graph, gsl::span insertion &dq_attrs, // attributes qdq_domain); + if (src_node) { + optimizer_utils::DuplicateNodeAnnotation(*src_node, dq_node); + } + ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(dq_node), "Failed to set op schema for added DQ node."); Node* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc index 5a6eb82c3e6c0..ba3ea09564c17 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc @@ -189,14 +189,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_weight_q"), &weight_q_type_proto); Node& weight_q_node = graph.AddNode( graph.GenerateNodeArgName(node.Name() + "_weight_q"), QDQ::QOpName, "Weight Q node", - {node.MutableInputDefs()[1], weight_scale_arg, &weight_zp_arg}, {&weight_q_arg}, nullptr, node.Domain()); + {node.MutableInputDefs()[1], weight_scale_arg, &weight_zp_arg}, {&weight_q_arg}, node, nullptr, node.Domain()); // DQ from 
int8 to float32. NodeArg& weight_dq_arg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_weight_dq"), weight_arg->TypeAsProto()); Node& weight_dq_node = graph.AddNode(graph.GenerateNodeArgName(node.Name() + "_weight_dq"), QDQ::DQOpName, "Weight DQ node", - {&weight_q_arg, weight_scale_arg, &weight_zp_arg}, {&weight_dq_arg}, nullptr, node.Domain()); + {&weight_q_arg, weight_scale_arg, &weight_zp_arg}, {&weight_dq_arg}, node, nullptr, node.Domain()); graph.AddEdge(weight_q_node.Index(), weight_dq_node.Index(), 0, 0); node.MutableInputDefs()[1] = &weight_dq_arg; graph.AddEdge(weight_dq_node.Index(), node.Index(), 0, 1); @@ -211,14 +211,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph weight_scale_arg->TypeAsProto()); Node& mul_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "_scale"), "Mul", "Bias scale node", - {dq_0.MutableInputDefs()[1], weight_scale_arg}, {&bias_scale_arg}, nullptr, node.Domain()); + {dq_0.MutableInputDefs()[1], weight_scale_arg}, {&bias_scale_arg}, node, nullptr, node.Domain()); // fp_bias / scale. NodeArg& bias_div_arg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_bias_div"), bias_arg->TypeAsProto()); Node& div_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "_bias_div"), "Div", "Bias div node", - {node.MutableInputDefs()[2], &bias_scale_arg}, {&bias_div_arg}, nullptr, node.Domain()); + {node.MutableInputDefs()[2], &bias_scale_arg}, {&bias_div_arg}, node, nullptr, node.Domain()); graph.AddEdge(mul_node.Index(), div_node.Index(), 0, 1); // Round(fp_bias / scale). 
@@ -226,7 +226,7 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_bias_div_round"), bias_arg->TypeAsProto()); Node& round_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "_bias_div_round"), "Round", "Bias div round node", - {&bias_div_arg}, {&bias_div_round_arg}, nullptr, node.Domain()); + {&bias_div_arg}, {&bias_div_round_arg}, node, nullptr, node.Domain()); graph.AddEdge(div_node.Index(), round_node.Index(), 0, 0); // Cast(Round(fp_bias / scale)) to int32. @@ -236,7 +236,7 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph NodeArg& bias_int32_arg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_bias_int32"), &bias_int32_type_proto); Node& cast_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "_bias_int32"), "Cast", "Bias INT32 node", - {&bias_div_round_arg}, {&bias_int32_arg}, nullptr, node.Domain()); + {&bias_div_round_arg}, {&bias_int32_arg}, node, nullptr, node.Domain()); cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_INT32)); graph.AddEdge(round_node.Index(), cast_node.Index(), 0, 0); @@ -245,7 +245,7 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(node.Name() + "_bias_dq"), bias_arg->TypeAsProto()); Node& bias_dq_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "_bias_dq"), QDQ::DQOpName, "Bias DQ node", - {&bias_int32_arg, &bias_scale_arg}, {&bias_dq_arg}, nullptr, node.Domain()); + {&bias_int32_arg, &bias_scale_arg}, {&bias_dq_arg}, node, nullptr, node.Domain()); if (!is_per_tensor_scale) { bias_dq_node.AddAttribute("axis", static_cast(0)); } diff --git a/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc b/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc index 9bd91e7916ecb..94fc7f6c03fa1 100644 --- 
a/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc @@ -134,6 +134,7 @@ Status WhereDummyDq::InsertDummyDQ(Node& node, Graph& graph, bool& modified, con "DeQuantizeLinear from WhereDummyDq GraphTransformer", {&dummy_data_arg, &dummy_scale_arg, &dummy_zp_arg}, {&dummy_dq_arg}, + node, nullptr, dq_node->Domain()); diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 6a2b4295093d8..167952356ff58 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -495,7 +495,8 @@ bool ReshapeFusion::FuseContiguousReshapes(Node& reshape, Graph& graph) { NodeArg* shape_arg = &graph_utils::AddInitializerWithOrtValue(graph, shape_initializer_proto); Node& reshape_node = graph.AddNode(graph.GenerateNodeName(name + "_new_reshape"), "Reshape", "Reshape for " + name, {contiguous_reshapes[0].get().MutableInputDefs()[0], shape_arg}, - {contiguous_reshapes.back().get().MutableOutputDefs()[0]}); + {contiguous_reshapes.back().get().MutableOutputDefs()[0]}, + reshape); reshape_node.SetExecutionProviderType(contiguous_reshapes[0].get().GetExecutionProviderType()); graph_utils::FinalizeNodeFusion(graph, contiguous_reshapes, reshape_node); diff --git a/onnxruntime/core/optimizer/slice_concat_to_space_to_depth_fusion.cc b/onnxruntime/core/optimizer/slice_concat_to_space_to_depth_fusion.cc index f72f74e3b4a5c..8caea2c150990 100644 --- a/onnxruntime/core/optimizer/slice_concat_to_space_to_depth_fusion.cc +++ b/onnxruntime/core/optimizer/slice_concat_to_space_to_depth_fusion.cc @@ -492,6 +492,7 @@ bool FuseSliceConcatToSpaceToDepth(Node& concat, Graph& graph, const logging::Lo : "Fused Slice*4 + Concat into SpaceToDepth + channel permutation", {space_to_depth_input}, space_to_depth_outputs, + concat, nullptr, kOnnxDomain); space_to_depth.AddAttribute("blocksize", kBlockSize); @@ -517,6 +518,7 @@ bool 
FuseSliceConcatToSpaceToDepth(Node& concat, Graph& graph, const logging::Lo "Reorder SpaceToDepth channels to preserve Slice+Concat block order", {space_to_depth.MutableOutputDefs()[0], gather_indices_arg}, {}, + concat, nullptr, kOnnxDomain); gather.AddAttribute("axis", static_cast(kChannelAxis)); diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc index 60ab064465f2f..c84e60e64bd2d 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -58,27 +58,43 @@ NodeArg* AddShapeInitializer(Graph& graph, const char* name, const int64_t (&sha std::pair AddNode(Graph& graph, const char* op_type, ProviderType execution_provider_type, - gsl::span inputs) { + gsl::span inputs, + const Node* annotation_source = nullptr) { auto def_name = graph.GenerateNodeArgName(op_type); auto node_arg = &graph.GetOrCreateNodeArg(def_name, nullptr); - Node& node = graph.AddNode(graph.GenerateNodeName(op_type), - op_type, - "", - inputs, - {node_arg}); + Node& node = annotation_source + ? graph.AddNode(graph.GenerateNodeName(op_type), + op_type, + "", + inputs, + {node_arg}, + *annotation_source) + : graph.AddNode(graph.GenerateNodeName(op_type), + op_type, + "", + inputs, + {node_arg}); node.SetExecutionProviderType(execution_provider_type); return std::make_pair(&node, node_arg); } std::pair AddNodeCast(Graph& graph, NodeArg* in, - ONNX_NAMESPACE::TensorProto_DataType data_type) { + ONNX_NAMESPACE::TensorProto_DataType data_type, + const Node* annotation_source = nullptr) { auto def_name = graph.GenerateNodeArgName("Cast"); auto node_arg = &graph.GetOrCreateNodeArg(def_name, nullptr); - Node& node = graph.AddNode(graph.GenerateNodeName("Cast"), - "Cast", - "", - {in}, - {node_arg}); + Node& node = annotation_source + ? 
graph.AddNode(graph.GenerateNodeName("Cast"), + "Cast", + "", + {in}, + {node_arg}, + *annotation_source) + : graph.AddNode(graph.GenerateNodeName("Cast"), + "Cast", + "", + {in}, + {node_arg}); node.AddAttribute("to", static_cast(data_type)); node.SetExecutionProviderType(kCpuExecutionProvider); return std::make_pair(&node, node_arg); @@ -238,7 +254,7 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve Node* reshape_signal_node = nullptr; NodeArg* reshape_output = nullptr; std::tie(reshape_signal_node, reshape_output) = - AddNode(graph, "Reshape", stft.GetExecutionProviderType(), signal_reshaped_inputs); + AddNode(graph, "Reshape", stft.GetExecutionProviderType(), signal_reshaped_inputs, &stft); NodeArg* real_weights_final = real_weights; NodeArg* imag_weights_final = imaginary_weights; @@ -246,11 +262,11 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve // When we are missing a window function if (real_weights_final->TypeAsProto()->tensor_type().elem_type() != data_type) { std::tie(std::ignore, real_weights_final) = - AddNodeCast(graph, real_weights_final, data_type); + AddNodeCast(graph, real_weights_final, data_type, &stft); } if (imag_weights_final->TypeAsProto()->tensor_type().elem_type() != data_type) { std::tie(std::ignore, imag_weights_final) = - AddNodeCast(graph, imag_weights_final, data_type); + AddNodeCast(graph, imag_weights_final, data_type, &stft); } } else { // When we have a window function @@ -261,7 +277,7 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve if (window->TypeAsProto()->tensor_type().elem_type() != GetDataType()) { Node* window_cast_node = nullptr; std::tie(window_cast_node, window_final) = - AddNodeCast(graph, window, GetDataType()); + AddNodeCast(graph, window, GetDataType(), &stft); window_recipient = window_cast_node; } @@ -269,7 +285,7 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve Node* 
window_reshape_node; NodeArg* window_reshaped = nullptr; std::tie(window_reshape_node, window_reshaped) = - AddNode(graph, "Reshape", kCpuExecutionProvider, window_reshaped_inputs); + AddNode(graph, "Reshape", kCpuExecutionProvider, window_reshaped_inputs, &stft); if (!window_recipient) { window_recipient = window_reshape_node; } @@ -277,17 +293,17 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve NodeArg* scale_real_weights_inputs[] = {real_weights, window_reshaped}; NodeArg* windowed_real_weights_output = nullptr; std::tie(std::ignore, windowed_real_weights_output) = - AddNode(graph, "Mul", kCpuExecutionProvider, scale_real_weights_inputs); + AddNode(graph, "Mul", kCpuExecutionProvider, scale_real_weights_inputs, &stft); NodeArg* scale_imag_weights_inputs[] = {imaginary_weights, window_reshaped}; NodeArg* windowed_imag_weights_output = nullptr; std::tie(std::ignore, windowed_imag_weights_output) = - AddNode(graph, "Mul", kCpuExecutionProvider, scale_imag_weights_inputs); + AddNode(graph, "Mul", kCpuExecutionProvider, scale_imag_weights_inputs, &stft); std::tie(std::ignore, real_weights_final) = - AddNodeCast(graph, windowed_real_weights_output, data_type); + AddNodeCast(graph, windowed_real_weights_output, data_type, &stft); std::tie(std::ignore, imag_weights_final) = - AddNodeCast(graph, windowed_imag_weights_output, data_type); + AddNodeCast(graph, windowed_imag_weights_output, data_type, &stft); } // Add Convolution (reals) @@ -295,7 +311,7 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve Node* real_conv_node = nullptr; NodeArg* real_conv_output = nullptr; std::tie(real_conv_node, real_conv_output) = - AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_real_inputs); + AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_real_inputs, &stft); real_conv_node->AddAttribute("strides", std::vector{1, frame_step_value}); // Add Convolution (imaginary) @@ -303,7 +319,7 @@ Status 
STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve Node* imag_conv_node = nullptr; NodeArg* imag_conv_output = nullptr; std::tie(imag_conv_node, imag_conv_output) = - AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_imag_inputs); + AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_imag_inputs, &stft); imag_conv_node->AddAttribute("strides", std::vector{1, frame_step_value}); // Concatenate @@ -311,21 +327,21 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve Node* concat_node = nullptr; NodeArg* concatenated_conv_output = nullptr; std::tie(concat_node, concatenated_conv_output) = - AddNode(graph, "Concat", stft.GetExecutionProviderType(), concatenate_inputs); + AddNode(graph, "Concat", stft.GetExecutionProviderType(), concatenate_inputs, &stft); concat_node->AddAttribute("axis", static_cast(0)); // Unsqueeze Reshape NodeArg* unsqueeze_reshape_inputs[] = {concatenated_conv_output, unsqueezed_shape}; NodeArg* unsqueezed_output = nullptr; std::tie(std::ignore, unsqueezed_output) = - AddNode(graph, "Reshape", stft.GetExecutionProviderType(), unsqueeze_reshape_inputs); + AddNode(graph, "Reshape", stft.GetExecutionProviderType(), unsqueeze_reshape_inputs, &stft); // Transpose NodeArg* transpose_inputs[] = {unsqueezed_output}; Node* transpose_node = nullptr; NodeArg* transpose_output = nullptr; std::tie(transpose_node, transpose_output) = - AddNode(graph, "Transpose", stft.GetExecutionProviderType(), transpose_inputs); + AddNode(graph, "Transpose", stft.GetExecutionProviderType(), transpose_inputs, &stft); transpose_node->AddAttribute("perm", std::vector{1, 3, 2, 0}); signal_recipient = reshape_signal_node; diff --git a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc index 29b603da56e29..467d0c090070f 100755 --- 
a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc @@ -531,6 +531,7 @@ static bool MakeQDQNodeUnit(api::GraphRef& graph, const api::NodeRef& dq_node) { // Add Q auto new_q_node = MakeQuantizeOp(graph, dq_domain, inputs, axis, dq_node.GetAttributeInt("block_size"), dq_node.GetAttributeInt("output_dtype"), dq_node.GetAttributeInt("saturate")); + new_q_node->SetLayeringAnnotation(dq_node.GetLayeringAnnotation()); auto q_node_outputs = new_q_node->Outputs(); // copy value info from the dq input for the type information, and update the shape to match next_node's output @@ -543,6 +544,7 @@ static bool MakeQDQNodeUnit(api::GraphRef& graph, const api::NodeRef& dq_node) { // Add DQ auto new_dq_node = MakeDequantizeOp(graph, dq_domain, inputs, axis, dq_node.GetAttributeInt("block_size")); + new_dq_node->SetLayeringAnnotation(dq_node.GetLayeringAnnotation()); auto dq_node_outputs = new_dq_node->Outputs(); // straight copy of value info as the type and shape are the same as next_node's output @@ -1007,6 +1009,7 @@ static void UnsqueezeInput(OptimizerCtx& ctx, api::NodeRef& node, size_t i, cons // (see Case 2). if (consumers->nodes.size() > 0) { auto squeeze_ptr = MakeSqueezeOrUnsqueeze(ctx.opset, ctx.graph, "Squeeze", value_to_modify, axes); + squeeze_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& squeeze = *squeeze_ptr; std::string_view sq_out = squeeze.Outputs()[0]; ctx.graph.CopyValueInfo(value_to_modify, sq_out); @@ -1075,6 +1078,7 @@ static void UnsqueezeInput(OptimizerCtx& ctx, api::NodeRef& node, size_t i, cons // Case 3: Add an Unsqueeze node. 
auto unsqueeze_ptr = MakeSqueezeOrUnsqueeze(ctx.opset, ctx.graph, "Unsqueeze", input, axes); + unsqueeze_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& unsqueeze = *unsqueeze_ptr; std::string_view unsq_out = unsqueeze.Outputs()[0]; ctx.graph.CopyValueInfo(input, unsq_out); @@ -1207,6 +1211,7 @@ static void TransposeInputImpl(api::GraphRef& graph, api::NodeRef& node, size_t // Transpose the initializer. If there are existing consumers, add Transpose nodes to them using perm_inv // to counteract the effect. These Transposes will hopefully be optimized out later. auto transpose_inv_ptr = MakeTranspose(graph, constant_to_modify, perm_inv); + transpose_inv_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& transpose_inv = *transpose_inv_ptr; std::string_view transpose_out = transpose_inv.Outputs()[0]; graph.CopyValueInfo(constant_to_modify, transpose_out); @@ -1267,6 +1272,7 @@ static void TransposeInputImpl(api::GraphRef& graph, api::NodeRef& node, size_t // the other Transpose. 
const std::vector& perm_combined = ComposePerm(*perm2, perm); auto transpose_ptr = MakeTranspose(graph, inp_node->Inputs()[0], perm_combined); + transpose_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& transpose = *transpose_ptr; std::string_view transpose_out = transpose.Outputs()[0]; graph.CopyValueInfo(input, transpose_out); @@ -1301,6 +1307,7 @@ static void TransposeInputImpl(api::GraphRef& graph, api::NodeRef& node, size_t // Case 4: Add a new Transpose op auto transpose_ptr = MakeTranspose(graph, input, perm); + transpose_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& transpose = *transpose_ptr; std::string_view transpose_out = transpose.Outputs()[0]; graph.CopyValueInfo(input, transpose_out); @@ -1376,6 +1383,7 @@ std::string_view TransposeOutput(api::GraphRef& graph, api::NodeRef& node, size_ // X -> Node -> Y, Transpose auto transpose = MakeTranspose(graph, "", perm); + transpose->SetLayeringAnnotation(node.GetLayeringAnnotation()); // X -> Node -> *Y', Transpose -> Y *shape/dtype not set graph.MoveOutput(node, i, *transpose, 0); @@ -1730,6 +1738,7 @@ static bool HandleShape(HandlerArgs& args) { // X -> Shape -> Y, Gather std::vector gather_inputs{"", perm_const}; auto gather_ptr = args.ctx.graph.AddNode("Gather", "Gather", gather_inputs, /*num_outputs*/ 1); + gather_ptr->SetLayeringAnnotation(args.node.GetLayeringAnnotation()); api::NodeRef& gather = *gather_ptr; gather.SetAttributeInt("axis", 0); @@ -1773,6 +1782,7 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con std::string_view gather_indices_const = AddInitializerInt64(graph, /*shape*/ {rank_int}, perm); std::vector gather_inputs{input_name, gather_indices_const}; auto gather_ptr = graph.AddNode("Gather", "Gather", gather_inputs, /*num_outputs*/ 1); + gather_ptr->SetLayeringAnnotation(node.GetLayeringAnnotation()); api::NodeRef& gather = *gather_ptr; std::string_view gather_output = gather.Outputs()[0]; 
graph.CopyValueInfo(input_name, gather_output); @@ -2221,6 +2231,7 @@ static bool HandleTile(HandlerArgs& args) { std::string_view perm_inv_const = AddInitializerInt64(args.ctx.graph, perm_shape, args.perm_inv); std::vector gather_inputs{repeats_inp, perm_inv_const}; auto gather_node_ptr = args.ctx.graph.AddNode("Gather", "Gather", gather_inputs, /*num_outputs*/ 1); + gather_node_ptr->SetLayeringAnnotation(args.node.GetLayeringAnnotation()); api::NodeRef& gather_node = *gather_node_ptr; std::string_view gather_output = gather_node.Outputs()[0]; args.ctx.graph.CopyValueInfo(repeats_inp, gather_output); @@ -2271,6 +2282,7 @@ static void RemoveCancelingTransposeNodes(HandlerArgs& args) { // despite computing the same value. Use an Identity op instead. std::vector single_empty_input{""}; auto identity_ptr = args.ctx.graph.AddNode("Identity", "Identity", single_empty_input, /*num_outputs*/ 1); + identity_ptr->SetLayeringAnnotation(args.node.GetLayeringAnnotation()); api::NodeRef& identity = *identity_ptr; args.ctx.graph.MoveOutput(args.node, 0, identity, 0); identity.SetInput(0, transpose_input); @@ -2303,6 +2315,7 @@ static bool HandleTransposeImpl(HandlerArgs& args, const std::vector& n // use the same input as the 1st Transpose, move the output from the Reshape to the new Transpose node, // and remove the Reshape node. 
new_node = args.ctx.graph.AddNode("Transpose", "Transpose", {args.transpose.Inputs()[0]}, 1); + new_node->SetLayeringAnnotation(args.node.GetLayeringAnnotation()); args.ctx.graph.MoveOutput(args.node, 0, *new_node, 0); args.ctx.graph.RemoveNode(args.node); } else { @@ -2973,6 +2986,7 @@ static bool TryFixTransposeMissingDQ(OptimizerCtx& ctx, api::NodeRef& transpose_ // Add Q auto new_q_node = MakeQuantizeOp(ctx.graph, q_domain, inputs, axis, q_node.GetAttributeInt("block_size"), q_node.GetAttributeInt("output_dtype"), q_node.GetAttributeInt("saturate")); + new_q_node->SetLayeringAnnotation(transpose_node.GetLayeringAnnotation()); auto new_q_node_output = new_q_node->Outputs()[0]; // Copy value info from the q output for the type information, and update the shape to match Transpose's input @@ -2985,6 +2999,7 @@ static bool TryFixTransposeMissingDQ(OptimizerCtx& ctx, api::NodeRef& transpose_ // Add new DQ. auto new_dq_node = MakeDequantizeOp(ctx.graph, q_domain, inputs, axis, q_node.GetAttributeInt("block_size")); + new_dq_node->SetLayeringAnnotation(transpose_node.GetLayeringAnnotation()); auto new_dq_node_output = new_dq_node->Outputs()[0]; ctx.graph.CopyValueInfo(transpose_input_name, new_dq_node_output); diff --git a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h index 6ff4da05fbf57..4ee5a65b9b9fb 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h +++ b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h @@ -258,6 +258,18 @@ class NodeRef { /// Id virtual int64_t Id() const = 0; + /// + /// Get the layering annotation of the node. 
+ /// + /// annotation + virtual std::string_view GetLayeringAnnotation() const = 0; + + /// + /// Set layering annotation + /// + /// + virtual void SetLayeringAnnotation(std::string_view annotation) = 0; + virtual ~NodeRef() {}; }; diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 6a02ca3578da2..5d5ed663cca05 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -105,6 +105,14 @@ class ApiNode final : public api::NodeRef { int SinceVersion() const override; int64_t Id() const override; + std::string_view GetLayeringAnnotation() const override { + return node_.GetLayeringAnnotation(); + } + + void SetLayeringAnnotation(std::string_view annotation) override { + node_.SetLayeringAnnotation(std::string(annotation)); + } + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ApiNode); }; @@ -763,6 +771,9 @@ std::unique_ptr ApiGraph::CopyNode(const api::NodeRef& source_node source_node.Outputs().size(), domain, new_node_since_version, source_node.GetExecutionProviderType()); + const auto& layering_annotation = source_node.GetLayeringAnnotation(); + node.SetLayeringAnnotation(std::string(layering_annotation)); + std::unique_ptr new_node = std::make_unique(node, graph_); new_node->CopyAttributes(source_node); diff --git a/onnxruntime/core/optimizer/utils.cc b/onnxruntime/core/optimizer/utils.cc index 4a323eefe1fe7..6d40b389d5fa3 100644 --- a/onnxruntime/core/optimizer/utils.cc +++ b/onnxruntime/core/optimizer/utils.cc @@ -495,6 +495,13 @@ bool IsScalar(const NodeArg& input_arg) { return dim_size == 0 || (dim_size == 1 && shape->dim(0).has_dim_value() && shape->dim(0).dim_value() == 1); } +void DuplicateNodeAnnotation(const Node& src, Node& dst) { + const auto& src_annotation = src.GetLayeringAnnotation(); + if (!src_annotation.empty()) { 
+ dst.SetLayeringAnnotation(src_annotation); + } +} + template bool GetScalarInitializerValue(const onnxruntime::Graph& graph, const onnxruntime::NodeArg& input_arg, T& value, bool is_constant) { diff --git a/onnxruntime/core/optimizer/utils.h b/onnxruntime/core/optimizer/utils.h index 857640f861238..2f9b48df7a75f 100644 --- a/onnxruntime/core/optimizer/utils.h +++ b/onnxruntime/core/optimizer/utils.h @@ -175,6 +175,8 @@ bool CheckOutputEdges(const Graph& graph, const Node& node, size_t expected_outp // Check if NodeArg takes in a scalar tensor. bool IsScalar(const NodeArg& input_arg); +void DuplicateNodeAnnotation(const Node& src, Node& dst); + #endif // #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) } // namespace optimizer_utils diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc index 4bfb0f673404a..6725c92b09f82 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc @@ -72,6 +72,40 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) ORT_ENFORCE(classlabels_strings_.size() > 0 || classlabels_ints_.size() > 0); ORT_ENFORCE(proba_.size() == probb_.size()); ORT_ENFORCE(coefficients_.size() > 0); + + // Validate attribute array sizes against the declared dimensions to prevent + // out-of-bounds reads from crafted models. 
+ if (mode_ == SVM_TYPE::SVM_SVC) { + // SVC mode: coefficients layout is [class_count - 1, vector_count] + const size_t expected_coefficients = static_cast(class_count_ - 1) * static_cast(vector_count_); + ORT_ENFORCE(coefficients_.size() >= expected_coefficients, + "coefficients attribute size (", coefficients_.size(), + ") is smaller than expected (", expected_coefficients, + ") for the given class_count and vector_count."); + + // rho needs one entry per classifier pair: class_count * (class_count - 1) / 2 + const size_t num_classifiers = static_cast(class_count_) * static_cast(class_count_ - 1) / 2; + ORT_ENFORCE(rho_.size() >= num_classifiers, + "rho attribute size (", rho_.size(), + ") is smaller than expected (", num_classifiers, + ") for the given number of classes."); + + // prob_a and prob_b, when provided, need one entry per classifier pair + if (!proba_.empty()) { + ORT_ENFORCE(proba_.size() >= num_classifiers, + "prob_a attribute size (", proba_.size(), + ") is smaller than expected (", num_classifiers, + ") for the given number of classes."); + ORT_ENFORCE(probb_.size() >= num_classifiers, + "prob_b attribute size (", probb_.size(), + ") is smaller than expected (", num_classifiers, + ") for the given number of classes."); + } + } else { + // Linear mode: coefficients layout is [class_count, feature_count] + ORT_ENFORCE(rho_.size() >= 1, "rho attribute must have at least one entry."); + } + weights_are_all_positive_ = std::all_of(coefficients_.cbegin(), coefficients_.cend(), [](float value) { return value >= 0.f; }); } diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 953858dbfde6f..59cd42c72b951 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -3110,16 +3110,20 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, } auto threshold = 
resource_accountant->GetThreshold(); - if (!threshold.has_value()) { + if (!threshold) { // info_.gpu_mem_limit is for BFC arena size_t free_memory, total_memory; if (0 != cudaMemGetInfo(&free_memory, &total_memory)) { memory_threshold = info_.gpu_mem_limit; + LOGS(logger, INFO) + << "CUDA_EP failed to get available GPU memory info. Using info_.gpu_mem_limit instead: " << info_.gpu_mem_limit; } else { memory_threshold = std::min(free_memory, info_.gpu_mem_limit); + LOGS(logger, VERBOSE) + << "CUDA_EP Using threshold: " << memory_threshold << " Free memory reported: " << free_memory; } } else { - memory_threshold = std::get<0>(threshold.value()); + memory_threshold = std::get<0>(*threshold); } consumed_memory = std::get<0>(resource_accountant->GetConsumedAmount()); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp index 353f698bb6f2c..076027dd3672f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp @@ -504,7 +504,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( InferAndVerifyOutputSizes(node, &defaultAttributesCapture, shapeInferrerCapture.Get(), constantCpuInputCapture, constantInputGetter, inputShapesOverrides, *outputShapes); // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &protoHelper, executionHandle, true, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 18b4b4593f537..ed99ac0fc7fc2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -132,7 +132,7 @@ namespace Dml assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize); assert(resourceWrapper != nullptr); - ComPtr allocInfo = wil::MakeOrThrow( + ComPtr allocInfo = Dml::SafeMakeOrThrow( this, ++m_currentAllocationId, resourceId, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp index 54393e9bf1539..2934fd0c11516 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp @@ -22,7 +22,7 @@ namespace Dml )); ComPtr resourceWrapper; - wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + Dml::SafeMakeOrThrow(std::move(resource)).As(&resourceWrapper); return resourceWrapper; } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h index c99d686349e94..158c102d69ee7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h @@ -48,9 +48,9 @@ namespace Dml constexpr uint64_t pooledResourceId = 0; // Not a pooled resource Microsoft::WRL::ComPtr resourceWrapper; - wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + Dml::SafeMakeOrThrow(std::move(resource)).As(&resourceWrapper); - Microsoft::WRL::ComPtr allocInfo = wil::MakeOrThrow( + Microsoft::WRL::ComPtr allocInfo = Dml::SafeMakeOrThrow( nullptr, 0, pooledResourceId, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp 
index 6bd7de0fba5cb..4ddf8b8640376 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -232,8 +232,6 @@ namespace DmlGraphFusionHelper } } - // Tensor sizes in DML must be a multiple of 4 bytes large. - tensorByteSize = AlignToPow2(tensorByteSize, 4); if(graphSerializationEnabled) { WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast(tensorPtr), tensorByteSize); @@ -264,9 +262,10 @@ namespace DmlGraphFusionHelper initializeInputBuffer = CreateCpuResource(providerImpl, tensorPtr, tensorByteSize); } - // Set the binding for operator initialization to the buffer + // Set the binding for operator initialization to the buffer. + // DML requires buffer binding sizes to be a multiple of 4 bytes. initInputBindings[i].Buffer = initializeInputBuffer.Get(); - initInputBindings[i].SizeInBytes = tensorByteSize; + initInputBindings[i].SizeInBytes = AlignToPow2(tensorByteSize, 4); initializeResourceRefs.push_back(std::move(initializeInputBuffer)); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 6d8d5453b9fc0..cd7dfd46485af 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -55,7 +55,7 @@ namespace Dml _Out_ std::shared_ptr* registry, _Out_ std::shared_ptr* internalRegInfoMap) { - ComPtr abiRegistry = wil::MakeOrThrow(); + ComPtr abiRegistry = Dml::SafeMakeOrThrow(); Dml::RegisterDmlOperators(abiRegistry.Get()); assert(abiRegistry->GetRegistries().size() == 1); @@ -88,7 +88,7 @@ namespace Dml ComPtr device; GRAPHICS_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf()))); - m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), 
executionContext, enableMetacommands, + m_impl = Dml::SafeMakeOrThrow(dmlDevice, device.Get(), executionContext, enableMetacommands, enableGraphCapture, enableSyncSpinning, disableMemoryArena); } @@ -1298,9 +1298,9 @@ namespace Dml uint64_t pooledResourceId = 0; // Not a pooled resource ComPtr resourceWrapper; - wil::MakeOrThrow(pResource).As(&resourceWrapper); + Dml::SafeMakeOrThrow(pResource).As(&resourceWrapper); - ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); + ComPtr allocInfo = Dml::SafeMakeOrThrow(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); return allocInfo.Detach(); } void FreeGPUAllocation(void* ptr) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index 22de743f6e718..51c25d6d40c5b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -291,7 +291,7 @@ namespace Dml::GraphDescBuilder if (iter != isInitializerTransferable.end()) { // Using const_cast here is simpler than making surrounding code const correct. 
- tensorWrapper = wil::MakeOrThrow(const_cast(iter->second.first), modelPath); + tensorWrapper = Dml::SafeMakeOrThrow(const_cast(iter->second.first), modelPath); } return tensorWrapper; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index fe52f27b35bb8..13ce9afa99b1e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -868,7 +868,7 @@ namespace Windows::AI::MachineLearning::Adapter const onnx::TensorProto* tensorProto = &attributeProto->t(); // An empty path is used as external weights are not currently supported in this case - Microsoft::WRL::ComPtr tensorWrapper = wil::MakeOrThrow(const_cast(tensorProto), std::filesystem::path()); + Microsoft::WRL::ComPtr tensorWrapper = Dml::SafeMakeOrThrow(const_cast(tensorProto), std::filesystem::path()); *tensor = tensorWrapper.Detach(); return S_OK; } @@ -1977,7 +1977,7 @@ namespace Windows::AI::MachineLearning::Adapter auto inputTensor = m_impl->Input(gsl::narrow_cast(inputIndex)); if (inputTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(inputTensor), IsAllocationInterface(inputTensor->Location()), m_winmlProvider.Get(), @@ -2019,7 +2019,7 @@ namespace Windows::AI::MachineLearning::Adapter auto elemTensor = const_cast(&inputTensorSeq->Get(sequenceIndex)); if (elemTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( elemTensor, IsAllocationInterface(elemTensor->Location()), m_winmlProvider.Get(), @@ -2119,7 +2119,7 @@ namespace Windows::AI::MachineLearning::Adapter auto elemTensor = const_cast(&outputTensorSeq->Get(sequenceIndex)); if (elemTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = 
Dml::SafeMakeOrThrow( elemTensor, IsAllocationInterface(elemTensor->Location()), m_winmlProvider.Get(), @@ -2212,7 +2212,7 @@ namespace Windows::AI::MachineLearning::Adapter auto outputTensor = m_impl->Output(outputIndex, shape); if (outputTensor) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(outputTensor), IsAllocationInterface(outputTensor->Location()), m_winmlProvider.Get(), @@ -2377,7 +2377,7 @@ namespace Windows::AI::MachineLearning::Adapter const onnxruntime::Tensor* tensor = nullptr; if (kerneInfo.TryGetConstantInput(index, &tensor)) { - tensorWrapper = wil::MakeOrThrow( + tensorWrapper = Dml::SafeMakeOrThrow( const_cast(tensor), IsAllocationInterface(tensor->Location()), winmlProviderCapture.Get(), @@ -2396,7 +2396,7 @@ namespace Windows::AI::MachineLearning::Adapter } // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &kerneInfo, m_abiExecutionObject.Get(), nullptr, @@ -2443,7 +2443,7 @@ namespace Windows::AI::MachineLearning::Adapter const auto* tensor = context->Input(gsl::narrow_cast(index)); if (tensor != nullptr) { - tensorWrapper = wil::MakeOrThrow( + tensorWrapper = Dml::SafeMakeOrThrow( const_cast(tensor), IsAllocationInterface(tensor->Location()), winmlProviderCapture.Get(), @@ -2464,7 +2464,7 @@ namespace Windows::AI::MachineLearning::Adapter for (uint32_t sequenceIndex = 0; sequenceIndex < tensorSequence->Size(); ++sequenceIndex) { auto& tensor = tensorSequence->Get(sequenceIndex); - auto tensorWrapper = wil::MakeOrThrow( + auto tensorWrapper = Dml::SafeMakeOrThrow( const_cast(&tensor), IsAllocationInterface(tensor.Location()), winmlProviderCapture.Get(), @@ -2491,7 +2491,7 @@ namespace Windows::AI::MachineLearning::Adapter } // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = 
wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &Info(), m_abiExecutionObject.Get(), &inputShapes, @@ -2569,7 +2569,7 @@ namespace Windows::AI::MachineLearning::Adapter EdgeShapes localInferredOutputShapes; ComPtr localKernel = inferShapesAndCreateKernel(local_input_shapes, localInferredOutputShapes); - ComPtr kernelContextWrapper = wil::MakeOrThrow( + ComPtr kernelContextWrapper = Dml::SafeMakeOrThrow( context, Info().GetExecutionProvider(), m_internalOperator, @@ -2588,7 +2588,7 @@ namespace Windows::AI::MachineLearning::Adapter } } - ComPtr kernelContextWrapper = wil::MakeOrThrow( + ComPtr kernelContextWrapper = Dml::SafeMakeOrThrow( context, Info().GetExecutionProvider(), m_internalOperator, @@ -2811,7 +2811,7 @@ namespace Windows::AI::MachineLearning::Adapter onnxruntime::ProtoHelperNodeContext protoContext(node); onnxruntime::OpNodeProtoHelper info(&protoContext); - ComPtr inferenceContext = wil::MakeOrThrow(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter); + ComPtr inferenceContext = Dml::SafeMakeOrThrow(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter); outputShapes.Reset(info.GetOutputCount()); @@ -2865,13 +2865,13 @@ namespace Windows::AI::MachineLearning::Adapter [ctx](uint32_t index) { // An empty path is used as external weights are not currently supported in this case - Microsoft::WRL::ComPtr tensorWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(ctx->getInputData(index)), std::filesystem::path()); return tensorWrapper; } ); - return wil::MakeOrThrow(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter); + return Dml::SafeMakeOrThrow(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter); } MLSchemaInferenceContext::MLSchemaInferenceContext( @@ -2952,7 +2952,7 @@ namespace Windows::AI::MachineLearning::Adapter const AttributeMap* defaultAttributes) { 
MLOperatorTensorGetter mLOperatorTensorGetter = MLOperatorTensorGetter(); - return wil::MakeOrThrow(info, defaultAttributes, mLOperatorTensorGetter); + return Dml::SafeMakeOrThrow(info, defaultAttributes, mLOperatorTensorGetter); } MLSupportQueryContext::MLSupportQueryContext( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 1de88a61a0d77..25210c146a6b6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -1097,7 +1097,7 @@ class GpuDFTOperatorFactory : public WRL::Base version = 20; } - auto dftOperator = wil::MakeOrThrow(context, version); + auto dftOperator = Dml::SafeMakeOrThrow(context, version); dftOperator.CopyTo(kernel); return S_OK; } @@ -1177,8 +1177,8 @@ class GpuDFTOperatorFactory : public WRL::Base kernelDescription.options = MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); std::array requiredConstantCpuInputs = { 1, 2 }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 5ba936ddf3976..6d7a089103c9b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -747,7 +747,7 @@ class DmlGridSampleOperatorFactory : public WRL::Base { try { - auto dftOperator = wil::MakeOrThrow(context); + auto dftOperator = Dml::SafeMakeOrThrow(context); dftOperator.CopyTo(kernel); return S_OK; } @@ -832,8 +832,8 @@ class DmlGridSampleOperatorFactory : public WRL::Base kernelDescription.options 
= MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); ComPtr registryPrivate; ORT_THROW_IF_FAILED(registry->QueryInterface(IID_PPV_ARGS(®istryPrivate))); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 287f1e5b6dfe7..2ee85b01a9a2e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -907,4 +907,71 @@ namespace Dml bufferTensorDesc->TotalTensorSizeInBytes = (elementSize + 3) & ~3; } + void DmlOperator::BroadcastQuantizationParameters( + const MLOperatorKernelCreationContext& kernelInfo, + gsl::span outputShape + ) + { + const uint32_t outputShapeDimCount = gsl::narrow_cast(outputShape.size()); + + uint32_t axis = 0; + + // If an axis was explicitly passed (or the default value 1 is set from the schema), + // then other inputs are broadcasting to the shape of the input data tensor. + if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int)) + { + // Avoid validating the axis until later because the axis parameter is ignorable unless + // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the + // "axis" attribute even when the attribute doesn't actually exist in the model, which + // would cause a validation failure here. + const int32_t signedAxis = gsl::narrow_cast(kernelInfo.GetAttribute(AttrName::Axis)); + axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false); + } + + // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor). 
+ for (uint32_t index = 1, inputCount = gsl::narrow_cast(m_inputTensorDescs.size()); index < inputCount; ++index) + { + if (!kernelInfo.IsInputValid(index)) + { + continue; + } + + auto edgeDesc = kernelInfo.GetInputEdgeDescription(index); + assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor); + + // Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2] + // becomes scale[2,1], so that broadcasting works correctly. + std::vector inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index); + + // If the input tensor is a 1D vector, then extra massaging is needed to project their + // 1D vectors back to the full shape for broadcasting along the given axis. + // The 1D vector should have a length equal to the output tensor's dimension on that axis. + if (inputTensorShape.size() == 1 && inputTensorShape != std::vector(outputShape.begin(), outputShape.end())) + { + ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount); + uint32_t broadcastAxisLength = outputShape[axis]; + ML_CHECK_VALID_ARGUMENT( + (inputTensorShape[0] == broadcastAxisLength) || + // Treat as broadcast dimension to match CPU behavior. + (inputTensorShape[0] == 1) + ); + inputTensorShape.insert(inputTensorShape.begin(), axis, 1); + inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1); + } + // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor + // will apply broadcasting with standard elementwise alignment. 
+ + m_inputTensorDescs[index] = TensorDesc( + edgeDesc.tensorDataType, + outputShape, + gsl::make_span(inputTensorShape), + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + NchwDimensionCount, // minDimensionCount + 0 // guaranteedBaseOffsetAlignment + ); + } + } + } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index fa54d4b041b5f..002541e23c47c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -149,6 +149,15 @@ namespace Dml uint32_t minDimensionCount = NchwDimensionCount ) const; + // Reshapes scale and zero_point tensor descriptors (inputs after index 0) so that their + // dimension count matches the output shape, enabling correct broadcasting in DML. + // For 1D per-axis tensors, the shape is projected along the given axis (e.g. scale[6] + // with axis=0 on a 5D output becomes [6,1,1,1,1]). 
+ void BroadcastQuantizationParameters( + const MLOperatorKernelCreationContext& kernelInfo, + gsl::span outputShape + ); + static void TryConvertTensorToBroadcastScalar( const MLOperatorKernelCreationContext& kernelInfo, const DML_TENSOR_DESC* tensor, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp index d4d7ee1311874..b64a5265f56e3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp @@ -542,64 +542,7 @@ class DmlOperatorElementwiseQLinear : public DmlOperator const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType(); bool hasZeroPointTensor = kernelInfo.IsInputValid(2); - uint32_t axis = 0; - - // If an axis was given explicitly passed (or the default value 1 is set from the schema), - // then other inputs are broadcasting to the shape of the input data tensor. - if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int)) - { - // Avoid validating the axis until later because the axis parameter is ignorable unless - // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the - // "axis" attribute even when the attribute doesn't actually exist in the model, which - // would cause a validation failure here. - const int32_t signedAxis = gsl::narrow_cast(kernelInfo.GetAttribute(AttrName::Axis)); - axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false); - } - - // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor). 
- for (uint32_t index = 1, inputCount = gsl::narrow_cast(m_inputTensorDescs.size()); index < inputCount; ++index) - { - if (!kernelInfo.IsInputValid(index)) - { - continue; - } - - auto edgeDesc = kernelInfo.GetInputEdgeDescription(index); - assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor); - - // Fix up the the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2] - // becomes scale[2,1], so that broadcasting works correctly. - std::vector inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index); - - // If the input tensor is a 1D vector, then extra massaging is needed to project their - // 1D vectors back to the full shape for broadcasting along the given axis. - // The 1D vector should have a length equal to the output tensor's dimension on that axis. - if (inputTensorShape.size() == 1 && inputTensorShape != outputShape) - { - ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount); - uint32_t broadcastAxisLength = outputShape[axis]; - ML_CHECK_VALID_ARGUMENT( - (inputTensorShape[0] == broadcastAxisLength) || - // Treat as broadcast dimension to match CPU behavior. - (inputTensorShape[0] == 1) - ); - inputTensorShape.insert(inputTensorShape.begin(), axis, 1); - inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1); - } - // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor - // will apply broadcasting with standard elementwise alignment. 
- - m_inputTensorDescs[index] = TensorDesc( - edgeDesc.tensorDataType, - gsl::make_span(outputShape), - gsl::make_span(inputTensorShape), - TensorAxis::DoNotCoerce, - TensorAxis::W, - TensorAxis::RightAligned, - NchwDimensionCount, // minDimensionCount - 0 // guaranteedBaseOffsetAlignment - ); - } + BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape)); std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); @@ -630,6 +573,8 @@ class DmlOperatorQuantization21 : public DmlOperator const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType(); bool hasZeroPointTensor = kernelInfo.IsInputValid(2); + BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape)); + std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp index bc29256dd2e28..83e35ae89282d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp @@ -76,7 +76,7 @@ class DmlOperatorNonZero: public DmlOperator // Create the DML output tensor for the number of nonzero elements onnxruntime::Tensor outputCountDml(onnxruntime::DataTypeImpl::GetType(), m_outputCountShape, executionProvider->GetGpuAllocator()); - Microsoft::WRL::ComPtr outputCountDmlWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr outputCountDmlWrapper = Dml::SafeMakeOrThrow( &outputCountDml, true, executionProvider, @@ -84,7 +84,7 @@ class DmlOperatorNonZero: public DmlOperator // Create the DML output tensor for the coordinates (not cropped) onnxruntime::Tensor intermediateCoordinatesDml(onnxruntime::DataTypeImpl::GetType(), m_outputCoordinatesShape, executionProvider->GetGpuAllocator()); - 
Microsoft::WRL::ComPtr intermediateCoordinatesDmlWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr intermediateCoordinatesDmlWrapper = Dml::SafeMakeOrThrow( &intermediateCoordinatesDml, true, executionProvider, @@ -105,7 +105,7 @@ class DmlOperatorNonZero: public DmlOperator // Copy the number of nonzero elements back to the CPU onnxruntime::Tensor outputCountCpu(onnxruntime::DataTypeImpl::GetType(), {1}, executionProvider->GetCpuInputAllocator()); - Microsoft::WRL::ComPtr outputCountCpuWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr outputCountCpuWrapper = Dml::SafeMakeOrThrow( &outputCountCpu, false, executionProvider, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index e2f38231f7295..091a82daefbdc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -238,7 +238,7 @@ class DmlSTFTOperator : public WRL::Base constexpr uint32_t dftAxis = 1; constexpr bool dftIsInverse = false; - m_dftOperator.op = wil::MakeOrThrow( + m_dftOperator.op = Dml::SafeMakeOrThrow( m_d3dDevice.Get(), dftAxis, params.isOnesided, @@ -516,7 +516,7 @@ class DmlSTFTOperatorFactory : public WRL::Base { try { - auto dftOperator = wil::MakeOrThrow(context); + auto dftOperator = Dml::SafeMakeOrThrow(context); dftOperator.CopyTo(kernel); return S_OK; } @@ -574,8 +574,8 @@ class DmlSTFTOperatorFactory : public WRL::Base kernelDescription.options = MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); std::array requiredConstantCpuInputs = { /*frame_step*/1, /*frame_length*/3 }; diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index b0b37d01370bc..26f998c7521a2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -1314,18 +1314,18 @@ void RegisterDmlOperators(IMLOperatorRegistry* registry) totalTypeCount += typeConstraints[i].allowedTypeCount; } - ComPtr factory = wil::MakeOrThrow(information.creationFunction); + ComPtr factory = Dml::SafeMakeOrThrow(information.creationFunction); ComPtr shapeInferrer; if (information.shapeInferenceFunction) { - shapeInferrer = wil::MakeOrThrow(information.shapeInferenceFunction); + shapeInferrer = Dml::SafeMakeOrThrow(information.shapeInferenceFunction); } ComPtr supportQuery; if (information.supportQueryFunction) { - supportQuery = wil::MakeOrThrow(information.supportQueryFunction); + supportQuery = Dml::SafeMakeOrThrow(information.supportQueryFunction); } ORT_THROW_IF_FAILED(registryPrivate->RegisterOperatorKernel( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h new file mode 100644 index 0000000000000..c2740470cbc0a --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +// Drop-in replacement for wil::MakeOrThrow that avoids an ASan false positive. +// WRL's MakeAllocator stores its buffer as char*, so if the constructor throws, +// ~MakeAllocator calls delete on a char* — passing sizeof(char)=1 to sized +// operator delete instead of sizeof(T). 
With the default MSVC allocator, this is +// benign (sized delete ignores the size), but ASan flags it as +// new-delete-type-mismatch. This helper uses placement new with correctly-sized +// cleanup to avoid the issue. +namespace Dml +{ + template + Microsoft::WRL::ComPtr SafeMakeOrThrow(TArgs&&... args) + { + void* buffer = ::operator new(sizeof(T)); + T* raw = nullptr; + try + { + raw = new (buffer) T(std::forward(args)...); + } + catch (...) + { + ::operator delete(buffer, sizeof(T)); + throw; + } + Microsoft::WRL::ComPtr result; + result.Attach(raw); + return result; + } +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h index e9df3fd20aff9..b9febb8171e0d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h @@ -25,6 +25,7 @@ #include #include +#include "SafeMakeOrThrow.h" #include diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index ac77616cb96f0..dec84d9945569 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -5,6 +5,7 @@ #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" #include "MLOperatorAuthorPrivate.h" +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" #include "core/framework/int4.h" #include #include @@ -972,7 +973,7 @@ class MLOperatorKernel : public Microsoft::WRL::RuntimeClass< { ORT_TRY { - Microsoft::WRL::ComPtr kernel = wil::MakeOrThrow(MLOperatorKernelCreationContext(&info)); + Microsoft::WRL::ComPtr kernel = Dml::SafeMakeOrThrow(MLOperatorKernelCreationContext(&info)); *opKernel = kernel.Detach(); return S_OK; diff --git 
a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h index fa04bcf6edf41..597780a9f448b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h @@ -5,6 +5,7 @@ #include "OperatorHelper.h" #include "OperatorVersions.h" +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" namespace SchemaInferenceOverrider { @@ -21,7 +22,7 @@ namespace SchemaInferenceOverrider ) { Microsoft::WRL::ComPtr shapeInferrer = - wil::MakeOrThrow(OperatorHelper::ShapeInferenceFunction); + Dml::SafeMakeOrThrow(OperatorHelper::ShapeInferenceFunction); auto schema = const_cast(onnx::OpSchemaRegistry::Schema(name, version)); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index c72ce205e5fbb..c0ddc44d0ca57 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -21,6 +21,8 @@ using Microsoft::WRL::ComPtr; #include #include +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" + #include "core/providers/dml/dml_provider_factory.h" #include "core/providers/dml/dml_provider_factory_creator.h" #include "core/session/abi_session_options_impl.h" @@ -89,11 +91,11 @@ std::unique_ptr DMLProviderFactory::CreateProvider() { // First, check if an I/O binding API that was used before this session or another session has already created a queue if (FAILED(d3d12_device->GetPrivateData(dml_execution_context_guid, &execution_context_ptr_size, execution_context.GetAddressOf()))) { - execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true); + execution_context = Dml::SafeMakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true); 
ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_execution_context_guid, execution_context.Get())); } } else { - execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false); + execution_context = Dml::SafeMakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false); } auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), execution_context.Get(), metacommands_enabled_, graph_capture_enabled_, cpu_sync_spinning_enabled_, disable_memory_arena_); diff --git a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc index 2cf0f11ce46f2..1bd313053ed09 100644 --- a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc +++ b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc @@ -5,6 +5,7 @@ #include "core/util/math.h" #include "core/providers/webgpu/quantization/quantize_linear.h" +#include "core/framework/int4.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" #include "core/providers/webgpu/webgpu_utils.h" @@ -22,8 +23,21 @@ Status DequantizeLinearProgram::GenerateShaderCode(ShaderHelper& shader) const { << "let output_indices = " << output.OffsetToIndices("global_idx") << ";\n"; // Get x input - if (packed_) { - std::string unpack = (signed_) ? 
"unpack4xI8(x)" : "unpack4xU8(x)"; + if (packing_ == PackingMode::Packed4) { + // 4-bit packing: 8 elements per u32 + shader.MainFunctionBody() + << "let x = " << x.GetByOffset("global_idx / 8") << ";\n" + << "let x_raw = (x >> ((global_idx % 8u) * 4u)) & 0xFu;\n"; + if (packed_signed_) { + shader.MainFunctionBody() + << "let x_value = select(input_element_t(x_raw), input_element_t(x_raw) - 16, x_raw >= 8u);\n"; + } else { + shader.MainFunctionBody() + << "let x_value = input_element_t(x_raw);\n"; + } + } else if (packing_ == PackingMode::Packed8) { + // 8-bit packing: 4 elements per u32 + std::string unpack = (packed_signed_) ? "unpack4xI8(x)" : "unpack4xU8(x)"; if (output.NumComponents() == 1) { shader.MainFunctionBody() << "let x = " << x.GetByOffset("global_idx / 4") << ";\n" @@ -51,10 +65,14 @@ Status DequantizeLinearProgram::GenerateShaderCode(ShaderHelper& shader) const { << "let scale_value = " << scale.GetByOffset("scale_index") << ";\n"; } else { // Block quantization. Scale input rank is same as input/output rank. + // On the block axis, divide by block_size; on other axes, use output index directly. 
+ shader.MainFunctionBody() << "var scale_indices: scale_indices_t;\n"; + for (int i = 0; i < rank_; i++) { + std::string idx = output.IndicesGet("output_indices", i); + std::string value_expr = "select(" + idx + ", " + idx + " / uniforms.block_size, " + std::to_string(i) + "u == uniforms.axis)"; + shader.MainFunctionBody() << scale.IndicesSet("scale_indices", i, value_expr) << "\n"; + } shader.MainFunctionBody() - << "var scale_indices: scale_indices_t = output_indices;\n" - << "let index = " << scale.IndicesGet("scale_indices", "uniforms.axis") << "/ uniforms.block_size;\n" - << scale.IndicesSet("scale_indices", "uniforms.axis", "index") << ";\n" << "let scale_value = " << scale.GetByIndices("scale_indices") << ";\n"; } @@ -62,43 +80,64 @@ Status DequantizeLinearProgram::GenerateShaderCode(ShaderHelper& shader) const { if (has_zeropoint_) { const auto& zero_point = shader.AddInput("zero_point", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - std::string unpack = (signed_) ? "unpack4xI8(zero_point_input)" : "unpack4xU8(zero_point_input)"; - if (per_layer_) { - // zero-point input is a scalar - if (packed_) { + if (packing_ == PackingMode::Packed4) { + // 4-bit zero-point: 8 elements per u32, with sign extension for signed types + std::string sign_extend_prefix = packed_signed_ ? "let zp_raw = " : "let zero_point_value = input_element_t("; + std::string sign_extend_suffix = packed_signed_ ? 
";\nlet zero_point_value = select(input_element_t(zp_raw), input_element_t(zp_raw) - 16, zp_raw >= 8u);\n" + : ");\n"; + if (per_layer_) { shader.MainFunctionBody() - << "let zero_point_input = " << zero_point.GetByOffset("0") << ";\n" - << "let zero_point_vec = " << unpack << ";\n" - << "let zero_point_value = zero_point_vec[0];\n"; - } else { - shader.MainFunctionBody() - << "let zero_point_value = " << zero_point.GetByOffset("0") << ";\n"; - } - } else if (per_axis_) { - // zero-point input is a 1D tensor - if (packed_) { + << sign_extend_prefix << zero_point.GetByOffset("0") << " & 0xFu" << sign_extend_suffix; + } else if (per_axis_) { shader.MainFunctionBody() << "let zero_point_index = " << output.IndicesGet("output_indices", "uniforms.axis") << ";\n" - << "let zero_point_input = " << zero_point.GetByOffset("zero_point_index / 4") << ";\n" - << "let zero_point_vec = " << unpack << ";\n" - << "let zero_point_value = zero_point_vec[zero_point_index % 4];\n"; + << "let zero_point_packed = " << zero_point.GetByOffset("zero_point_index / 8") << ";\n" + << sign_extend_prefix << "(zero_point_packed >> ((zero_point_index % 8u) * 4u)) & 0xFu" << sign_extend_suffix; } else { shader.MainFunctionBody() - << "let zero_point_index = " << output.IndicesGet("output_indices", "uniforms.axis") << ";\n" - << "let zero_point_value = " << zero_point.GetByOffset("zero_point_index") << ";\n"; + << "let zero_point_offset = " << scale.IndicesToOffset("scale_indices") << ";\n" + << "let zero_point_packed = " << zero_point.GetByOffset("zero_point_offset / 8") << ";\n" + << sign_extend_prefix << "(zero_point_packed >> ((zero_point_offset % 8u) * 4u)) & 0xFu" << sign_extend_suffix; } } else { - // BlockedQuantization. The zero-point input shape is the same as the scale input shape. 
- if (packed_) { - shader.MainFunctionBody() - << "let zero_point_offset = " << scale.IndicesToOffset("scale_indices") << ";\n" - << "let zero_point_input = " << zero_point.GetByOffset("zero_point_offset / 4") << ";\n" - << "let zero_point_vec = " << unpack << ";\n" - << "let zero_point_value = zero_point_vec[zero_point_offset % 4];\n"; + std::string unpack = (packed_signed_) ? "unpack4xI8(zero_point_input)" : "unpack4xU8(zero_point_input)"; + if (per_layer_) { + // zero-point input is a scalar + if (packing_ == PackingMode::Packed8) { + shader.MainFunctionBody() + << "let zero_point_input = " << zero_point.GetByOffset("0") << ";\n" + << "let zero_point_vec = " << unpack << ";\n" + << "let zero_point_value = zero_point_vec[0];\n"; + } else { + shader.MainFunctionBody() + << "let zero_point_value = " << zero_point.GetByOffset("0") << ";\n"; + } + } else if (per_axis_) { + // zero-point input is a 1D tensor + if (packing_ == PackingMode::Packed8) { + shader.MainFunctionBody() + << "let zero_point_index = " << output.IndicesGet("output_indices", "uniforms.axis") << ";\n" + << "let zero_point_input = " << zero_point.GetByOffset("zero_point_index / 4") << ";\n" + << "let zero_point_vec = " << unpack << ";\n" + << "let zero_point_value = zero_point_vec[zero_point_index % 4];\n"; + } else { + shader.MainFunctionBody() + << "let zero_point_index = " << output.IndicesGet("output_indices", "uniforms.axis") << ";\n" + << "let zero_point_value = " << zero_point.GetByOffset("zero_point_index") << ";\n"; + } } else { - shader.MainFunctionBody() - << "let zero_point_offset = " << scale.IndicesToOffset("scale_indices") << ";\n" - << "let zero_point_value = " << zero_point.GetByOffset("zero_point_offset") << ";\n"; + // BlockedQuantization. The zero-point input shape is the same as the scale input shape. 
+ if (packing_ == PackingMode::Packed8) { + shader.MainFunctionBody() + << "let zero_point_offset = " << scale.IndicesToOffset("scale_indices") << ";\n" + << "let zero_point_input = " << zero_point.GetByOffset("zero_point_offset / 4") << ";\n" + << "let zero_point_vec = " << unpack << ";\n" + << "let zero_point_value = zero_point_vec[zero_point_offset % 4];\n"; + } else { + shader.MainFunctionBody() + << "let zero_point_offset = " << scale.IndicesToOffset("scale_indices") << ";\n" + << "let zero_point_value = " << zero_point.GetByOffset("zero_point_offset") << ";\n"; + } } } } else { @@ -122,11 +161,15 @@ Status DequantizeLinear::ComputeInternal(ComputeContext& context) const { auto* output_tensor = context.Output(0, x_shape); int64_t x_scale_rank = x_scale->Shape().NumDimensions(); - // Currently only INT8, UINT8, and INT32 are registered. auto x_type = x->GetElementType(); - bool packed = x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 || x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; - bool is_signed = x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; + PackingMode packing = (x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4 || x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4) + ? PackingMode::Packed4 + : (x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 || x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8) + ? PackingMode::Packed8 + : PackingMode::None; + bool packed = packing != PackingMode::None; + bool is_packed_signed = x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 || x_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4; int64_t axis = (axis_ >= 0) ? axis_ : axis_ + x_shape.NumDimensions(); int max_components = GetMaxComponents(x_size); @@ -137,26 +180,80 @@ Status DequantizeLinear::ComputeInternal(ComputeContext& context) const { // 1D tensor - 1 scaler for per axis bool per_axis = per_layer == false && x_scale_rank == 1; - bool use_components = per_layer && (!packed || max_components == 4); + // Compute effective block_size. 
When block_size_ is 0 (default) but scale is 1D with + // fewer elements than the input dimension on the axis, infer block_size from the ratio. + int64_t block_size = block_size_; + if (per_axis && block_size == 0) { + int64_t input_dim = x_shape[onnxruntime::narrow(axis)]; + int64_t scale_dim = x_scale->Shape()[0]; + if (scale_dim < input_dim) { + block_size = input_dim / scale_dim; + per_axis = false; // treat as block quantization + } + } + + // When scale is N-D (block quantization) and block_size is 0, infer axis and block_size + // from the shapes. Find the dimension where scale is smaller than input to determine axis, + // then compute block_size from the ratio. + if (!per_layer && !per_axis && block_size == 0) { + const auto& scale_shape = x_scale->Shape(); + for (size_t i = 0; i < x_shape.NumDimensions(); i++) { + if (scale_shape[i] < x_shape[i]) { + axis = static_cast(i); + block_size = x_shape[i] / scale_shape[i]; + break; + } + } + if (block_size == 0) { + block_size = 1; // all dims match, default to block_size=1 + } + } + + // Validate shapes for blocked quantization. 
+ if (!per_layer && !per_axis && block_size > 0) { + const auto& scale_shape = x_scale->Shape(); + ORT_RETURN_IF(scale_shape.NumDimensions() != x_shape.NumDimensions(), + "x_scale and x must have the same rank for blocked quantization"); + for (size_t i = 0; i < x_shape.NumDimensions(); i++) { + if (static_cast(i) == axis) { + ORT_RETURN_IF(scale_shape[i] != (x_shape[i] + block_size - 1) / block_size, + "x_scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization"); + } else { + ORT_RETURN_IF(scale_shape[i] != x_shape[i], + "x_scale and x must have the same shape on non-quantize axes for blocked quantization"); + } + } + if (x_zeropoint != nullptr) { + for (size_t i = 0; i < x_shape.NumDimensions(); i++) { + ORT_RETURN_IF(x_zeropoint->Shape()[i] != scale_shape[i], + "x_zero_point and x_scale must have the same shape for blocked quantization"); + } + } + } + + bool use_components = per_layer && packing != PackingMode::Packed4 && (!packed || max_components == 4); int components = use_components ? max_components : 1; int input_component = use_components ? max_components : 1; + // For 4-bit types, each u32 holds 8 elements; for 8-bit types, 4 elements. + int pack_factor = (packing == PackingMode::Packed4) ? 8 : 4; - DequantizeLinearProgram program{packed, is_signed, per_layer, per_axis, x_zeropoint != nullptr}; + DequantizeLinearProgram program{packing, is_packed_signed, per_layer, per_axis, x_zeropoint != nullptr, + static_cast(x_shape.NumDimensions())}; program - .AddInputs({{x, ProgramTensorMetadataDependency::TypeAndRank, ProgramInput::Flatten, packed ? 4 : input_component}}) + .AddInputs({{x, ProgramTensorMetadataDependency::TypeAndRank, ProgramInput::Flatten, packed ? pack_factor : input_component}}) .AddInputs({{x_scale, ProgramTensorMetadataDependency::TypeAndRank}}) .AddOutput(use_components ? 
ProgramOutput{output_tensor, ProgramTensorMetadataDependency::Rank, ProgramOutput::Flatten, components} : ProgramOutput{output_tensor, ProgramTensorMetadataDependency::Rank, components}) .SetDispatchGroupSize((x_size / components + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({{static_cast(axis)}}) - .AddUniformVariables({{static_cast(block_size_)}}) + .AddUniformVariables({{static_cast(block_size)}}) .AddUniformVariables({{static_cast(x_size / components)}}) - .CacheHint(std::to_string(axis), std::to_string(is_signed), std::to_string(per_layer), std::to_string(per_axis), std::to_string(block_size_)); + .CacheHint(std::to_string(axis), std::to_string(is_packed_signed), std::to_string(per_layer), std::to_string(per_axis), std::to_string(block_size), std::to_string(static_cast(packing))); if (x_zeropoint != nullptr) { - program.AddInputs({{x_zeropoint, ProgramTensorMetadataDependency::None, ProgramInput::Flatten, packed ? 4 : 1}}); + program.AddInputs({{x_zeropoint, ProgramTensorMetadataDependency::None, ProgramInput::Flatten, packed ? pack_factor : 1}}); } return context.RunProgram(program); @@ -167,7 +264,9 @@ const std::vector& DequantizeLinearConstraints() { static std::vector types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}; + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}; return types; } } // namespace diff --git a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.h b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.h index 95614998017e9..31484ac040d85 100644 --- a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.h +++ b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.h @@ -8,15 +8,24 @@ namespace onnxruntime { namespace webgpu { +// How the quantized input is packed into u32 words. +enum class PackingMode { + None, // no packing (e.g. 
int32) + Packed8, // 8-bit: 4 elements per u32, uses unpack4x[I/U]8 + Packed4, // 4-bit: 8 elements per u32, manual bit extraction +}; + class DequantizeLinearProgram final : public Program { public: - DequantizeLinearProgram(const bool packed, const bool issigned, const bool per_layer, - const bool per_axis, bool has_zeropoint) : Program{"DequantizeLinear"}, - packed_{packed}, - signed_{issigned}, - per_layer_{per_layer}, - per_axis_{per_axis}, - has_zeropoint_{has_zeropoint} {} + DequantizeLinearProgram(PackingMode packing, bool is_packed_signed, bool per_layer, + bool per_axis, bool has_zeropoint, int rank = 0) + : Program{"DequantizeLinear"}, + packing_{packing}, + packed_signed_{is_packed_signed}, + per_layer_{per_layer}, + per_axis_{per_axis}, + has_zeropoint_{has_zeropoint}, + rank_{rank} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -25,11 +34,12 @@ class DequantizeLinearProgram final : public Program { {"output_size", ProgramUniformVariableDataType::Uint32}); private: - bool packed_; - bool signed_; + PackingMode packing_; + bool packed_signed_; bool per_layer_; bool per_axis_; bool has_zeropoint_; + int rank_; }; class DequantizeLinear final : public WebGpuKernel { @@ -38,6 +48,7 @@ class DequantizeLinear final : public WebGpuKernel { axis_ = info.GetAttrOrDefault("axis", 1); block_size_ = info.GetAttrOrDefault("block_size", 0); output_dtype_ = info.GetAttrOrDefault("output_dtype", 0); + ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative."); } Status ComputeInternal(ComputeContext& context) const override; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index c61d5826cb885..ec20bf2fdbdfb 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -124,6 +124,12 @@ void WebGpuContext::Initialize(const WebGpuContextConfig& config) { device_queue_ = device_.GetQueue(); // cache device 
limits ORT_ENFORCE(Device().GetLimits(&device_limits_)); + // Align maxStorageBufferBindingSize down to minStorageBufferOffsetAlignment so that + // buffer segment offsets are always properly aligned for WebGPU bind group creation. + if (device_limits_.minStorageBufferOffsetAlignment > 0) { + device_limits_.maxStorageBufferBindingSize -= + (device_limits_.maxStorageBufferBindingSize % device_limits_.minStorageBufferOffsetAlignment); + } // cache device features wgpu::SupportedFeatures supported_features; Device().GetFeatures(&supported_features); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index b873c95b496bb..2ba52a3e989bd 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -29,6 +29,7 @@ #include "core/framework/kernel_registry.h" #include "core/framework/kernel_type_str_resolver.h" #include "core/framework/kernel_type_str_resolver_utils.h" +#include "core/framework/layering_annotations.h" #include "core/framework/mldata_type_utils.h" #include "core/framework/TensorSeq.h" #include "core/framework/tensorprotoutils.h" @@ -1518,11 +1519,33 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool } } + LayeringIndex* layering_index = nullptr; +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + std::optional layering_index_storage; + const auto layering_config = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsLayerAssignmentSettings, ""); + if (!layering_config.empty()) { + ORT_RETURN_IF_ERROR_SESSIONID_(LayeringIndex::Create(graph, layering_config, {}, execution_providers_, + *session_logger_, layering_index_storage)); + if (layering_index_storage) { + layering_index = &layering_index_storage.value(); + } + } +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Do partitioning based on execution providers' capabilities. 
ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn, - session_options_.config_options, *session_logger_, + session_options_.config_options, *session_logger_, layering_index, mode, session_options_.GetEpContextGenerationOptions(), debug_graph_fn)); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + if (layering_index) { + // Layering annotations maybe present even if index is not built although unlikely. + ORT_RETURN_IF_ERROR_SESSIONID_(graph.RemoveAllLayeringAnnotations()); + // We are currently not using it beyond this point. Clear it to free up memory. + layering_index = nullptr; + layering_index_storage.reset(); + } +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + // Get graph optimizations loop level from session config, if not present, set to default value of 1 as per // the definition of kOrtSessionOptionsGraphOptimizationsLoopLevel. unsigned int graph_optimizations_loop_level = static_cast(std::stoi( @@ -2039,6 +2062,7 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, transform_layout_fn, sess_options.config_options, logger, + nullptr /*layering_index*/, GraphPartitioner::Mode::kOrtFormatLoad)); #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 37a74a5de22a6..9834902cea2b1 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3031,7 +3031,23 @@ ORT_API_STATUS_IMPL(OrtApis::Graph_GetGraphView, _In_ const OrtGraph* src_graph, "src_graph is a ModelEditorGraph which doesn't support Graph_GetGraphView."); } const GraphViewer& graph_viewer = ep_graph->GetGraphViewer(); - const Graph& graph = graph_viewer.GetGraph(); + + // Create subgraph's node set and convert them to internal Node + InlinedHashSet node_set; + InlinedVector internal_nodes; + 
internal_nodes.reserve(num_nodes); + for (size_t i = 0; i < num_nodes; i++) { + const EpNode* ep_node = EpNode::ToInternal(nodes[i]); + if (ep_node != nullptr) { + const Node& node = ep_node->GetInternalNode(); + node_set.insert(node.Index()); + internal_nodes.push_back(&node); + } else { + std::ostringstream oss; + oss << "node indexed [" << i << "] appears to be a ModelEditorNode"; + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, oss.str().c_str()); + } + } // Create a GraphViewer with filtered info // TODO: Investigate whether utils::MakeComputeCapability can be extended and reused instead @@ -3040,178 +3056,93 @@ ORT_API_STATUS_IMPL(OrtApis::Graph_GetGraphView, _In_ const OrtGraph* src_graph, // Following data structures help determine the final inputs/outputs of the subgraph. // Note: The 'subgraph' here refers to a graph contains a subset of nodes in the 'src_graph'. - // Subgraph's node set - const std::unordered_set node_set = [&]() { - std::unordered_set node_set; - for (size_t i = 0; i < num_nodes; i++) { - const OrtNode* ort_node = nodes[i]; - const EpNode* ep_node = EpNode::ToInternal(ort_node); - if (ep_node != nullptr) { - node_set.insert(ep_node->GetInternalNode().Index()); - } + // Pre-pass: Identify all outputs produced by nodes within the subgraph. + // This allows O(1) checks to determine if an input is internal or from the boundary. + InlinedHashSet internal_outputs; + for (size_t i = 0, lim = internal_nodes.size(); i < lim; i++) { + const auto& node = *internal_nodes[i]; + for (const auto& output : node.OutputDefs()) { + internal_outputs.insert(output); } - - return node_set; - }(); + } // Source graph output names - std::unordered_set graph_output_names; + InlinedHashSet graph_output_names; for (const auto* output_arg : graph_viewer.GetOutputs()) { graph_output_names.insert(output_arg->Name()); } // These maps store the inputs and outputs of the subgraph. 
- // Please note that the inputs and outputs of the maps will be dynamically updated during node iteration - // to determine the final inputs and outputs of the subgraph. - std::unordered_map subgraph_inputs, subgraph_outputs; - - // This map stores the node's output that will be consumed by another node outside of this subgraph. - // So the node's output should be put into the subgraph's output list. - std::unordered_map subgraph_outputs_to_add; - - // This map stores the node's output that is original graph's output. - // So the node's output should be put into the subgraph's output list. - std::unordered_map graph_outputs_to_add; + // Value is order index to maintain deterministic order. + InlinedHashMap subgraph_inputs, subgraph_outputs; - std::unordered_set erased; - - // This is the relative ordering that ensures node's input or output being added to the 'subgraph_inputs', - // 'subgraph_outputs', 'subgraph_outputs_to_add' and 'graph_outputs_to_add' maps is associated with a relative order index. - // Items added earlier receive a smaller order index than items added later. - // When constructing the final subgraph's input or output lists, entries with smaller - // order indices will appear before those with larger indices. int input_order = 0; int output_order = 0; - // node arg to its consumer nodes. - // Note: graph.GetConsumerNodes() is not available in minimal build, in order to use unified implementation across - // all builds, this map is needed to determine if node arg is consumed by other nodes. 
- std::unordered_map> node_arg_to_consumer_nodes; - - std::vector initializers; + InlinedVector initializers; - // Add nodes - for (size_t i = 0; i < num_nodes; i++) { - const OrtNode* ort_node = nodes[i]; - const EpNode* ep_node = EpNode::ToInternal(ort_node); - if (ep_node == nullptr) { - return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, - "node is a ModelEditorNode which doesn't support Graph_GetGraphView."); - } - const Node& node = ep_node->GetInternalNode(); + // Add nodes and identify boundary inputs/outputs + for (size_t i = 0, lim = internal_nodes.size(); i < lim; i++) { + const auto& node = *internal_nodes[i]; indexed_sub_graph->nodes.push_back(node.Index()); - for (const auto& input : node.InputDefs()) { - if (!input->Exists()) { - continue; - } + // Process Inputs: If an input is not produced internally, it's a subgraph input. + auto process_inputs = [&](gsl::span inputs) { + for (const auto& input : inputs) { + if (!input->Exists()) continue; - if (graph_viewer.IsConstantInitializer(input->Name(), true)) { - initializers.push_back(input->Name()); - continue; - } - const auto& it = subgraph_outputs.find(input); - if (it != subgraph_outputs.end()) { - subgraph_outputs.erase(it); - erased.insert(input); - } else if (erased.find(input) == erased.end()) { - // Only when input is neither in output list nor erased list, add the input to input list - subgraph_inputs.insert({input, input_order++}); - } - } + if (graph_viewer.IsConstantInitializer(input->Name(), true)) { + initializers.push_back(input->Name()); + continue; + } - for (const auto& input : node.ImplicitInputDefs()) { - if (!input->Exists()) { - continue; + // If not produced by this subgraph, it's a boundary input + if (internal_outputs.count(input) == 0) { + // Use insert to keep the first occurrence's order + auto p = subgraph_inputs.emplace(input, input_order); + if (p.second) { + input_order++; + } + } } + }; - if (graph_viewer.IsConstantInitializer(input->Name(), true)) { - 
initializers.push_back(input->Name()); - continue; - } - const auto& it = subgraph_outputs.find(input); - if (it != subgraph_outputs.end()) { - subgraph_outputs.erase(it); - erased.insert(input); - } else if (erased.find(input) == erased.end()) { - // Only when input is neither in output list nor erased list, add the input to input list - subgraph_inputs.insert({input, input_order++}); - } - } + process_inputs(gsl::make_span(node.InputDefs().data(), node.InputDefs().size())); + process_inputs(gsl::make_span(node.ImplicitInputDefs().data(), node.ImplicitInputDefs().size())); - // For output searching, there are two special cases, - // One is, if subgraph's node output is parent graph's output. the node output should - // be also added to the subgraph's output list - // The other one is, if node's OutputEdges are more than its outputs, meaning certain output is used more than once, - // if the output is connected to nodes that don't belong to the subgraph, the output need to be added - // to the output list + // Process Outputs: If an output is graph output OR consumed externally, it's a subgraph output. for (const auto& output : node.OutputDefs()) { - if (!output->Exists()) { - continue; - } + if (!output->Exists()) continue; + + bool is_boundary_output = false; - const auto& it = subgraph_inputs.find(output); - if (it != subgraph_inputs.end()) { - subgraph_inputs.erase(it); - erased.insert(output); - } else if (erased.find(output) == erased.end()) { - auto has_consumer_nodes = [&](const std::string& node_arg_str) -> bool { - // Same implementation as Graph::PopulateNodeArgToProducerConsumerLookupsFromNodes() - if (node_arg_to_consumer_nodes.empty()) { - for (const auto& node : graph.Nodes()) { - node.ForEachDef([&](const NodeArg& node_arg, bool is_input) { - if (is_input) { - node_arg_to_consumer_nodes[node_arg.Name()].insert(node.Index()); - } - }); + // 1. Is it a graph output? 
+ if (graph_output_names.count(output->Name()) > 0) { + is_boundary_output = true; + } else { + // 2. Is it consumed by any node outside the subgraph? + for (auto it = node.OutputEdgesBegin(), end = node.OutputEdgesEnd(); it != end; ++it) { + // Check if the edge uses this specific output + if (it->GetSrcArgIndex() < static_cast(node.OutputDefs().size()) && + node.OutputDefs()[it->GetSrcArgIndex()] == output) { + if (node_set.count(it->GetNode().Index()) == 0) { + is_boundary_output = true; + break; } } - return node_arg_to_consumer_nodes.find(node_arg_str) != node_arg_to_consumer_nodes.end(); - }; - - if (has_consumer_nodes(output->Name())) { - // Only when output is neither in input list nor erased list, - // and the output is consumed by another node, add the output to output list - subgraph_outputs.insert({output, output_order++}); } } - if (graph_output_names.find(output->Name()) != graph_output_names.end()) { - // This output is the graph's output. - // So the output should be put into the subgraph's output list. - graph_outputs_to_add.insert({output, output_order++}); - } - } - - if (node.GetOutputEdgesCount() > node.OutputDefs().size()) { - for (auto it = node.OutputEdgesBegin(), end = node.OutputEdgesEnd(); it != end; ++it) { - const auto& node_idx = it->GetNode().Index(); - - if (node_set.find(node_idx) == node_set.end()) { - // This output will be consumed by another node outside of this subgraph. - // So the output should be put into the subgraph's output list. - const NodeArg* output = nullptr; - - // The dst_arg_index from GetDstArgIndex() could be the index for explicit/implicit input defs of the node. - // We need to get the correct input index accordingly. 
(See Graph::BuildConnections() in graph.cc for more details) - if (it->GetDstArgIndex() < static_cast(it->GetNode().InputDefs().size())) { - output = (it->GetNode()).InputDefs()[it->GetDstArgIndex()]; - } else { - output = (it->GetNode()).ImplicitInputDefs()[it->GetDstArgIndex() - it->GetNode().InputDefs().size()]; - } - subgraph_outputs_to_add.insert({output, output_order++}); - } + if (is_boundary_output) { + subgraph_outputs.insert({output, output_order++}); } } } - subgraph_outputs.insert(subgraph_outputs_to_add.begin(), subgraph_outputs_to_add.end()); - subgraph_outputs.insert(graph_outputs_to_add.begin(), graph_outputs_to_add.end()); - std::multimap inputs, outputs; // Get the input order of the original graph - std::unordered_map original_inputs; + InlinedHashMap original_inputs; int order = 0; for (const auto* input : graph_viewer.GetInputs()) { original_inputs[input] = order++; @@ -3219,22 +3150,22 @@ ORT_API_STATUS_IMPL(OrtApis::Graph_GetGraphView, _In_ const OrtGraph* src_graph, // input order needs to be consistent with original graph's input order for (const auto& [node_arg, subgraph_input_order] : subgraph_inputs) { - const auto& original_input_it = original_inputs.find(node_arg); + const auto original_input_it = original_inputs.find(node_arg); if (original_input_it != original_inputs.end()) { - inputs.insert(std::make_pair( + inputs.emplace( original_input_it->second, // input order from original graph - node_arg)); + node_arg); } else { - inputs.insert(std::make_pair( + inputs.emplace( subgraph_input_order, // input order from subgraph - node_arg)); + node_arg); } } // Sort outputs by the order they were added - for (auto it = subgraph_outputs.begin(), end = subgraph_outputs.end(); it != end; ++it) { - outputs.insert(std::pair(it->second, it->first)); + for (const auto& [node_arg, subgraph_output_order] : subgraph_outputs) { + outputs.emplace(subgraph_output_order, node_arg); } std::unique_ptr meta_def = std::make_unique(); @@ -3259,7 +3190,8 @@ 
ORT_API_STATUS_IMPL(OrtApis::Graph_GetGraphView, _In_ const OrtGraph* src_graph, } indexed_sub_graph->SetMetaDef(std::move(meta_def)); - auto new_graph_viewer = std::make_unique(graph, *indexed_sub_graph.get()); + const Graph& graph = graph_viewer.GetGraph(); + auto new_graph_viewer = std::make_unique(graph, *indexed_sub_graph); std::unique_ptr result; ORT_API_RETURN_IF_STATUS_NOT_OK(EpGraph::Create(std::move(new_graph_viewer), std::move(indexed_sub_graph), result)); diff --git a/onnxruntime/python/tools/layering/layer_annotate.py b/onnxruntime/python/tools/layering/layer_annotate.py new file mode 100644 index 0000000000000..738c528b28754 --- /dev/null +++ b/onnxruntime/python/tools/layering/layer_annotate.py @@ -0,0 +1,165 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import argparse +import logging +import pathlib + +import onnx + + +def get_logger(name, level=logging.DEBUG): + logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s") + logger = logging.getLogger(name) + logger.setLevel(level) + return logger + + +def getargs(): + argparser = argparse.ArgumentParser( + description="Read a config file with a list of node annotations and apply them to an ONNX model.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + argparser.add_argument( + "--config_file_path", + type=pathlib.Path, + required=True, + help="Path to the configuration file with node annotations.", + ) + argparser.add_argument( + "--model_path", + type=pathlib.Path, + required=True, + help="Path to a single model to process.", + ) + argparser.add_argument( + "--annotated_model", + type=pathlib.Path, + required=True, + help="Path to write the annotated model to.", + ) + + return argparser.parse_args() + + +def read_annotation_config(config_file_path): + """ + Reads a configuration file to map substrings to annotations. + + The file format is expected to be: + annotation_string: substring1, substring2, ... 
+ + The same annotation string can appear multiple times. + The node names in the configuration are treated as substrings. + + Args: + config_file_path (str or Path): Path to the configuration file. + + Returns: + list: A list of tuples (substring, annotation_string). + """ + substring_annotations = [] + with open(config_file_path) as f: + for unstripped_line in f: + line = unstripped_line.strip() + if not line: + continue + parts = line.split(":", 1) + if len(parts) < 2: + continue + annotation = parts[0].strip() + substrings = parts[1].split(",") + for substr in substrings: + substring = substr.strip() + if substring: + substring_annotations.append((substring, annotation)) + return substring_annotations + + +def process_nodes(nodes, substring_annotations): + """ + Helper function to process a list of nodes sequentially. + """ + logger = get_logger("annotate_model") + logger.info(f"Processing {len(nodes)} nodes.") + + for node in nodes: + matched_annotation = None + for substring, annotation in substring_annotations: + if substring in node.name: + matched_annotation = annotation + + if matched_annotation: + # Check if annotation already exists + entry = None + for prop in node.metadata_props: + if prop.key == "layer_ann": + entry = prop + break + + if entry: + entry.value = matched_annotation + else: + entry = node.metadata_props.add() + entry.key = "layer_ann" + entry.value = matched_annotation + + # Recurse into subgraphs for control flow nodes + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + annotate_graph(attr.g, substring_annotations) + elif attr.type == onnx.AttributeProto.GRAPHS: + for sub_graph in attr.graphs: + annotate_graph(sub_graph, substring_annotations) + + +def annotate_graph(graph, substring_annotations): + """ + Recursively applies annotations to nodes where a configured substring appears in the node name. + + This function iterates over all nodes in the given graph. 
It checks if any + substring from the configuration appears in the node's name. If matched, + it adds or updates a metadata property with key 'layer_ann' containing + the annotation string. If multiple substrings match, the last one defined + in the configuration list applies. + + It also handles control flow nodes (like 'If' or 'Loop') by recursively + processing their subgraphs (attributes of type GRAPH or GRAPHS). + + Args: + graph (onnx.GraphProto): The ONNX graph to process. + substring_annotations (list): A list of tuples (substring, annotation_string). + """ + process_nodes(graph.node, substring_annotations) + + +def annotate_model(model, substring_annotations): + """ + Annotates an ONNX model with metadata based on a provided mapping. + + This function serves as the entry point to annotate the model's graph. + It delegates the work to `annotate_graph`, which recursively processes + all nodes in the main graph and any nested subgraphs. + + Args: + model (onnx.ModelProto): The ONNX model to annotate. + substring_annotations (list): A list of tuples (substring, annotation_string). 
+ """ + annotate_graph(model.graph, substring_annotations) + + +if __name__ == "__main__": + args = getargs() + logger = get_logger("annotate_model") + + # Read the mapping from the configuration file + substring_annotations = read_annotation_config(args.config_file_path) + + logger.info(f"Loading model from {args.model_path}") + onnx_model = onnx.load(args.model_path, load_external_data=False) + + logger.info(f"Applying annotations from {args.config_file_path}") + annotate_model(onnx_model, substring_annotations) + + logger.info(f"Saving annotated model to {args.annotated_model}") + onnx.save_model(onnx_model, args.annotated_model) diff --git a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py index 2b7fbffa842f7..af14dedd005b8 100644 --- a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py @@ -32,8 +32,14 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [1, None, 0, 0, 0], ) if qkv_nodes is None: - logger.debug("fuse_conformer_attention: failed to match qkv path") - return + qkv_nodes = self.model.match_parent_path( + normalize_node, + ["MatMul", "Reshape", "Transpose", "MatMul"], + [1, 0, 0, 0], + ) + if qkv_nodes is None: + logger.debug("fuse_conformer_attention: failed to match qkv path") + return reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes[-3], qkv_nodes[-2], qkv_nodes[-1] @@ -50,15 +56,22 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [1, 0, 0, 0], ) if v_nodes is None: - logger.debug("fuse_conformer_attention: failed to match v path") - return + v_nodes = self.model.match_parent_path( + matmul_qkv, + ["Transpose", "Reshape", "MatMul"], + [1, 0, 0], + ) + if v_nodes is None: + logger.debug("fuse_conformer_attention: failed to match v path") + return else: concat_v = v_nodes[0] concat_parent = 
self.model.get_parent(concat_v, 0, None) present_v = concat_v.output[0] past_v = concat_parent.output[0] - add_v, matmul_v = v_nodes[-2], v_nodes[-1] + add_v = v_nodes[-2] if len(v_nodes) >= 2 and v_nodes[-2].op_type == "Add" else None + matmul_v = v_nodes[-1] attn_mask = "" qk_nodes = self.model.match_parent_path( @@ -66,6 +79,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ["Softmax", "Add", "MatMul"], [0, 0, 0], ) + where_qk = None if qk_nodes is None: qk_nodes = self.model.match_parent_path( matmul_qkv, @@ -73,10 +87,19 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [0, 2, 0, 2, 0], ) if qk_nodes is None: - logger.debug("fuse_conformer_attention: failed to match qk path") - return + qk_nodes = self.model.match_parent_path( + matmul_qkv, + ["Where", "Softmax", "Where", "Div", "Add", "MatMul"], + [0, 2, 0, 2, 0, 0], + ) + if qk_nodes is None: + logger.debug("fuse_conformer_attention: failed to match qk path") + return + where_qk = qk_nodes[2] + else: + where_qk = qk_nodes[2] - where_qk = qk_nodes[2] + if where_qk is not None: mask_nodes = self.model.match_parent_path( where_qk, ["Equal", "Unsqueeze", "Cast"], @@ -99,20 +122,46 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [0, 0, 0, 0, 0], ) if q_nodes is None: - logger.debug("fuse_conformer_attention: failed to match q path") - return + q_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Add", "Reshape", "MatMul"], + [0, 0, 0, 1], + ) + if q_nodes is None: + q_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Add", "Reshape", "MatMul"], + [0, 0, 0, 0], + ) + if q_nodes is None: + logger.debug("fuse_conformer_attention: failed to match q path") + return - reshape_q, add_q, matmul_q = q_nodes[-3], q_nodes[-2], q_nodes[-1] + reshape_q = next((node for node in q_nodes if node.op_type == "Reshape"), None) + add_q = next((node for node in q_nodes if node.op_type == "Add"), None) + matmul_q = 
next((node for node in reversed(q_nodes) if node.op_type == "MatMul"), None) + if reshape_q is None or add_q is None or matmul_q is None: + logger.debug("fuse_conformer_attention: failed to identify q reshape/add/matmul nodes") + return extra_q_nodes = self.model.match_parent_path( add_qk, ["Reshape", "Transpose", "MatMul", "Transpose", "Reshape", "Div"], [1, 0, 0, 0, 0, 0], ) - if extra_q_nodes is not None and q_nodes[0] != extra_q_nodes[-1]: + if extra_q_nodes is not None and q_nodes[0].op_type in ["Div", "Mul"] and q_nodes[0] != extra_q_nodes[-1]: logger.debug("fuse_conformer_attention: failed to match extra q path") return + if extra_q_nodes is None: + nemotron_extra_q_nodes = self.model.match_parent_path( + add_qk, + ["Slice", "Reshape", "Slice", "Reshape", "Pad", "MatMul", "Transpose", "Add"], + [1, 0, 0, 0, 0, 0, 0, 0], + ) + if nemotron_extra_q_nodes is not None: + extra_q_nodes = nemotron_extra_q_nodes + past_k, present_k = "", "" k_nodes = self.model.match_parent_path( matmul_qk, @@ -132,24 +181,50 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [1, 0, 0, 0], ) if k_nodes is None: - logger.debug("fuse_conformer_attention: failed to match k path") - return + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Reshape", "MatMul"], + [1, 0, 0], + ) + if k_nodes is None: + logger.debug("fuse_conformer_attention: failed to match k path") + return else: concat_k = k_nodes[1] concat_parent = self.model.get_parent(concat_k, 0, None) past_k = concat_parent.output[0] present_k = concat_k.output[0] - add_k, matmul_k = k_nodes[-2], k_nodes[-1] + add_k = k_nodes[-2] if len(k_nodes) >= 2 and k_nodes[-2].op_type == "Add" else None + matmul_k = k_nodes[-1] num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) if num_heads <= 0 or hidden_size <= 0 or (hidden_size % num_heads) != 0: logger.debug("fuse_conformer_attention: failed to detect num_heads or hidden_size") return + # Validate attention_bias: the 
Attention and MultiHeadAttention kernels require a 4-D + # tensor with shape [batch_size or 1, num_heads or 1, sequence_length, total_sequence_length]. + # Scalar or 1-D initializers (e.g. a plain QK scaling constant) must not be forwarded as + # attention_bias. Non-initializer values (computed positional-bias outputs) are kept as-is. + attention_bias = add_qk.input[1] + bias_init = self.model.get_initializer(attention_bias) + if bias_init is not None and len(bias_init.dims) != 4: + logger.debug( + "fuse_conformer_attention: skipping attention_bias %s with dims %s (expected 4-D)", + attention_bias, + list(bias_init.dims), + ) + attention_bias = "" + new_node = None use_packed_attention_op = ( - matmul_q.input[0] == matmul_k.input[0] and matmul_k.input[0] == matmul_v.input[0] and extra_q_nodes is None + matmul_q.input[0] == matmul_k.input[0] + and matmul_k.input[0] == matmul_v.input[0] + and extra_q_nodes is None + and add_q is not None + and add_k is not None + and add_v is not None ) if use_packed_attention_op: # Self-attention, use Attention op @@ -165,7 +240,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): hidden_size=hidden_size, first_input=matmul_q.input[0], output=reshape_qkv.output[0], - add_qk_str=add_qk.input[1], + add_qk_str=attention_bias, past_k=past_k, past_v=past_v, present_k=present_k, @@ -183,7 +258,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): hidden_size=hidden_size, output=reshape_qkv.output[0], key_padding_mask=attn_mask, - add_qk=add_qk.input[1], + add_qk=attention_bias, past_k=past_k, past_v=past_v, present_k=present_k, diff --git a/onnxruntime/test/framework/function_test.cc b/onnxruntime/test/framework/function_test.cc index 699d1b1a2c27a..9e28882b9a65d 100644 --- a/onnxruntime/test/framework/function_test.cc +++ b/onnxruntime/test/framework/function_test.cc @@ -662,5 +662,161 @@ TEST(FunctionTest, Test_GH_issue_16438) { status = session_object.Initialize(); 
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); } + +// Verify that when a function node with a layering annotation is inlined, +// the inlined nodes inherit the parent function node's annotation. +TEST(FunctionTest, InlinedNodesInheritLayeringAnnotation) { + // Parse and build a Model with a local function (multi-node body: Constant + Mul). + ONNX_NAMESPACE::OnnxParser parser(basic_code); + ONNX_NAMESPACE::ModelProto model_proto; + auto parse_status = parser.Parse(model_proto); + ASSERT_TRUE(parse_status.IsOK()) << parse_status.ErrorMessage(); + ASSERT_TRUE(parser.EndOfInput()) << "Extra unparsed input unexpected."; + + auto& logger = DefaultLoggingManager().DefaultLogger(); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, logger)); + + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Find the function call node (local.myfun) and annotate it. + Node* func_node = nullptr; + for (auto& node : graph.Nodes()) { + if (node.OpType() == "myfun") { + func_node = &node; + break; + } + } + ASSERT_NE(func_node, nullptr) << "Could not find function call node 'myfun'"; + ASSERT_TRUE(func_node->CanBeInlined()); + + const std::string annotation = "TestLayerAnnotation"; + func_node->SetLayeringAnnotation(annotation); + + // Inline the function node. + ASSERT_STATUS_OK(graph.InlineFunction(*func_node)); + ASSERT_STATUS_OK(graph.Resolve()); + + // After inlining, the original function call node is removed and replaced + // by the function body nodes (a Mul node; the Constant becomes an initializer). + // Verify every remaining node inherited the annotation. 
+ int node_count = 0; + for (const auto& node : graph.Nodes()) { + ++node_count; + EXPECT_EQ(node.GetLayeringAnnotation(), annotation) + << "Node '" << node.Name() << "' (op: " << node.OpType() + << ") did not inherit the parent function's layering annotation."; + } + EXPECT_GT(node_count, 0) << "Expected at least one inlined node in the graph."; +} + +// Verify that when a function node with no layering annotation is inlined, +// the inlined nodes remain unannotated. +TEST(FunctionTest, InlinedNodesNoAnnotationWhenParentUnannotated) { + ONNX_NAMESPACE::OnnxParser parser(basic_code); + ONNX_NAMESPACE::ModelProto model_proto; + auto parse_status = parser.Parse(model_proto); + ASSERT_TRUE(parse_status.IsOK()) << parse_status.ErrorMessage(); + ASSERT_TRUE(parser.EndOfInput()) << "Extra unparsed input unexpected."; + + auto& logger = DefaultLoggingManager().DefaultLogger(); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, logger)); + + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + Node* func_node = nullptr; + for (auto& node : graph.Nodes()) { + if (node.OpType() == "myfun") { + func_node = &node; + break; + } + } + ASSERT_NE(func_node, nullptr); + // Do NOT set any annotation on the function node. + ASSERT_TRUE(func_node->GetLayeringAnnotation().empty()); + + ASSERT_STATUS_OK(graph.InlineFunction(*func_node)); + ASSERT_STATUS_OK(graph.Resolve()); + + for (const auto& node : graph.Nodes()) { + EXPECT_TRUE(node.GetLayeringAnnotation().empty()) + << "Node '" << node.Name() << "' should not have a layering annotation " + << "when the parent function node was unannotated."; + } +} + +// Verify annotation inheritance with two calls to the same function, +// where each call has a different annotation. 
+TEST(FunctionTest, InlinedNodesInheritDistinctAnnotationsPerCallSite) { + const char* code = R"( + < + ir_version: 8, + opset_import: [ "" : 16, "local" : 1 ] + > + agraph (float[N] x) => (float[N] y) + { + y1 = local.myfun (x) + y = local.myfun (y1) + } + + < + opset_import: [ "" : 16 ], + domain: "local" + > + myfun (lx) => (ly) { + two = Constant () + ly = Mul (lx, two) + } + )"; + + ONNX_NAMESPACE::OnnxParser parser(code); + ONNX_NAMESPACE::ModelProto model_proto; + auto parse_status = parser.Parse(model_proto); + ASSERT_TRUE(parse_status.IsOK()) << parse_status.ErrorMessage(); + ASSERT_TRUE(parser.EndOfInput()); + + auto& logger = DefaultLoggingManager().DefaultLogger(); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, logger)); + + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Collect the two function call nodes in graph order. + std::vector func_nodes; + for (auto& node : graph.Nodes()) { + if (node.OpType() == "myfun") { + func_nodes.push_back(&node); + } + } + ASSERT_EQ(func_nodes.size(), 2u); + + // Annotate each call site differently. + func_nodes[0]->SetLayeringAnnotation("AnnotationA"); + func_nodes[1]->SetLayeringAnnotation("AnnotationB"); + + // Inline the first call, then the second. + ASSERT_STATUS_OK(graph.InlineFunction(*func_nodes[0])); + ASSERT_STATUS_OK(graph.InlineFunction(*func_nodes[1])); + ASSERT_STATUS_OK(graph.Resolve()); + + // After inlining both calls, the graph should have nodes from both expansions. + // Each group should carry its respective annotation. 
+ bool found_a = false; + bool found_b = false; + for (const auto& node : graph.Nodes()) { + const auto& ann = node.GetLayeringAnnotation(); + EXPECT_TRUE(ann == "AnnotationA" || ann == "AnnotationB") + << "Node '" << node.Name() << "' has unexpected annotation: '" << ann << "'"; + if (ann == "AnnotationA") found_a = true; + if (ann == "AnnotationB") found_b = true; + } + EXPECT_TRUE(found_a) << "No node found with AnnotationA"; + EXPECT_TRUE(found_b) << "No node found with AnnotationB"; +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/layering_annotations_test.cc b/onnxruntime/test/framework/layering_annotations_test.cc new file mode 100644 index 0000000000000..f865be7bfc686 --- /dev/null +++ b/onnxruntime/test/framework/layering_annotations_test.cc @@ -0,0 +1,1763 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + +#include "core/framework/execution_providers.h" +#include "core/framework/ortmemoryinfo.h" +#include "core/framework/layering_annotations.h" +#include "core/session/abi_devices.h" +#include "core/framework/execution_provider.h" +#include "core/framework/ortdevice.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" // For Model, Graph +#include "gtest/gtest.h" + +#include "test/util/include/asserts.h" +#include "test/util/include/test_environment.h" + +namespace onnxruntime { +namespace test { + +TEST(LayeringRuleMatcherTest, ExactMatches) { + LayeringRules rules; + rules.rules.push_back({"Device1", "Annotation1", false}); // Index 0 + rules.rules.push_back({"Device2", "Annotation2", false}); // Index 1 + + LayeringRuleMatcher matcher(rules); + + { + auto result = matcher.Match("Annotation1"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); + } + { + auto result = matcher.Match("Annotation2"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 1u); + } 
+ { + auto result = matcher.Match("Annotation3"); + EXPECT_FALSE(result.has_value()); + } +} + +TEST(LayeringRuleMatcherTest, PrefixMatches) { + LayeringRules rules; + rules.rules.push_back({"Device1", "Prefix1", true}); // Index 0: =Prefix1 + rules.rules.push_back({"Device2", "Pre", true}); // Index 1: =Pre + + LayeringRuleMatcher matcher(rules); + + // "Prefix1Suffix" matches "Prefix1" (idx 0) and "Pre" (idx 1). 0 < 1, so 0. + { + auto result = matcher.Match("Prefix1Suffix"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); + } + + // "PreSuffix" matches "Pre" (idx 1). "Prefix1" does not match. + { + auto result = matcher.Match("PreSuffix"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 1u); + } + + // "Other" matches nothing + { + auto result = matcher.Match("Other"); + EXPECT_FALSE(result.has_value()); + } +} + +TEST(LayeringRuleMatcherTest, PriorityPrefixOverExact) { + // Prefix matches should take precedence over exact matches regardless of order. + + // Case 1: Prefix rule comes before Exact rule + { + LayeringRules rules; + rules.rules.push_back({"Device1", "A", true}); // Index 0: =A (Prefix) + rules.rules.push_back({"Device2", "AB", false}); // Index 1: AB (Exact) + + LayeringRuleMatcher matcher(rules); + // "AB" matches prefix "A" (idx 0) and exact "AB" (idx 1). + // Since prefix matches are checked first and returned if found, we expect 0. + auto result = matcher.Match("AB"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); + } + + // Case 2: Exact rule comes before Prefix rule + { + LayeringRules rules; + rules.rules.push_back({"Device1", "AB", false}); // Index 0: AB (Exact) + rules.rules.push_back({"Device2", "A", true}); // Index 1: =A (Prefix) + + LayeringRuleMatcher matcher(rules); + // "AB" matches exact "AB" (idx 0) and prefix "A" (idx 1). + // Priority says Prefix matches are returned first. 
+ auto result = matcher.Match("AB"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 1u); + } +} + +TEST(LayeringRuleMatcherTest, LongestOrShortestPrefixPriority) { + // If multiple prefix rules match, the one with the lowest index (earliest in config) wins. + + // Case 1: Shorter prefix first + { + LayeringRules rules; + rules.rules.push_back({"Device1", "A", true}); // Index 0 + rules.rules.push_back({"Device2", "AB", true}); // Index 1 + + LayeringRuleMatcher matcher(rules); + // "ABC" matches "A" (0) and "AB" (1). Since 0 < 1, best match is 0. + auto result = matcher.Match("ABC"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); + } + + // Case 2: Longer prefix first + { + LayeringRules rules; + rules.rules.push_back({"Device1", "AB", true}); // Index 0 + rules.rules.push_back({"Device2", "A", true}); // Index 1 + + LayeringRuleMatcher matcher(rules); + // "ABC" matches "AB" (0) and "A" (1). Since 0 < 1, best match is 0. + auto result = matcher.Match("ABC"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); + } +} + +TEST(LayeringRuleMatcherTest, OverlappingExactMatchPriority) { + // If duplicates exist, first one wins. + LayeringRules rules; + rules.rules.push_back({"Device1", "A", false}); // Index 0 + rules.rules.push_back({"Device2", "A", false}); // Index 1 + + LayeringRuleMatcher matcher(rules); + auto result = matcher.Match("A"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); +} + +TEST(LayeringRuleMatcherTest, OverlappingPrefixMatchPriority) { + // If duplicates exist, first one wins. 
+ LayeringRules rules; + rules.rules.push_back({"Device1", "A", true}); // Index 0 + rules.rules.push_back({"Device2", "A", true}); // Index 1 + + LayeringRuleMatcher matcher(rules); + auto result = matcher.Match("AB"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, 0u); +} + +namespace { + +// Helper to construct OrtEpDevice wrappers for testing +struct TestEpDevice { + std::string ep_name; + OrtHardwareDevice hw_device; + bool has_hw_device = false; + OrtMemoryInfo mem_info; + bool has_mem_info = false; + + // We need to keep the structures alive while OrtEpDevice points to them + OrtEpDevice Get() const { + OrtEpDevice ep; + ep.ep_name = ep_name; + ep.device = has_hw_device ? &hw_device : nullptr; + ep.device_memory_info = has_mem_info ? &mem_info : nullptr; + return ep; + } +}; + +TestEpDevice CreateEp(const std::string& name) { + TestEpDevice ep; + ep.ep_name = name; + return ep; +} + +TestEpDevice CreateHwEp(const std::string& name, OrtHardwareDeviceType type, uint32_t vendor_id = 0, + uint32_t device_id = 0, const std::string& vendor_str = std::string()) { + TestEpDevice ep; + ep.ep_name = name; + ep.hw_device = {type, vendor_id, device_id, vendor_str, {}}; + ep.has_hw_device = true; + return ep; +} + +TestEpDevice CreateMemEp(const std::string& name, OrtDevice::DeviceType type, int device_id = 0) { + TestEpDevice ep; + ep.ep_name = name; + // Note: OrtMemoryInfo name doesn't matter for logic now, but required for ctor + ep.mem_info = OrtMemoryInfo("TestMem", OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(type, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, + static_cast(device_id)), + OrtMemType::OrtMemTypeDefault); + ep.has_mem_info = true; + return ep; +} + +} // namespace + +TEST(EpLayeringMatcherTest, MatchCPU) { + LayerAnnotation rule = {"CPU", "Anno1", false}; + + // Case 1: EP Name kCpuExecutionProvider + { + auto test_ep = CreateEp(kCpuExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = 
{&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCpuExecutionProvider); + } + + // Case 2: Hardware Device CPU + { + auto test_ep = CreateHwEp("SomeCPU_EP", OrtHardwareDeviceType_CPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "SomeCPU_EP"); + } + + // Case 3: Memory Info CPU + { + auto test_ep = CreateMemEp("MemCPU_EP", OrtDevice::CPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MemCPU_EP"); + } +} + +TEST(EpLayeringMatcherTest, MatchGPU) { + LayerAnnotation rule = {"GPU", "Anno1", false}; + + // Case 1: Hardware Device GPU + { + auto test_ep = CreateHwEp("MyGPU_EP", OrtHardwareDeviceType_GPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyGPU_EP"); + } + + // Case 2: Memory Info GPU + { + auto test_ep = CreateMemEp("MemGPU_EP", OrtDevice::GPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MemGPU_EP"); + } + + // Case 3: Heuristic kCudaExecutionProvider + { + auto test_ep = CreateEp(kCudaExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCudaExecutionProvider); + } + + // Case 4: Heuristic kDmlExecutionProvider + { + auto test_ep = CreateEp(kDmlExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; 
+ auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kDmlExecutionProvider); + } +} + +TEST(EpLayeringMatcherTest, MatchSpecificGPU_VendorString) { + LayerAnnotation rule = {"gpu:nvidia", "Anno1", false}; + + // Case 1: Vendor String Match + { + auto test_ep = CreateHwEp("MyNvidia_EP", OrtHardwareDeviceType_GPU, 0, 0, "NVIDIA"); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyNvidia_EP"); + } + + // Case 2: Vendor String Mismatch + { + auto test_ep = CreateHwEp("MyAMD_EP", OrtHardwareDeviceType_GPU, 0, 0, "AMD"); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + EXPECT_FALSE(result.has_value()); + } +} + +TEST(EpLayeringMatcherTest, MatchSpecificGPU_VendorId) { + LayerAnnotation rule_intel = {"gpu:intel", "Anno1", false}; + LayerAnnotation rule_nvidia = {"gpu:nvidia", "Anno2", false}; + LayerAnnotation rule_amd = {"gpu:amd", "Anno3", false}; + + // Case 1: Vendor ID Match Intel + { + auto test_ep = CreateHwEp("Intel_EP", OrtHardwareDeviceType_GPU, OrtDevice::VendorIds::INTEL); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule_intel); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "Intel_EP"); + } + + // Case 2: Vendor ID Match Nvidia + { + auto test_ep = CreateHwEp("Nvidia_EP", OrtHardwareDeviceType_GPU, OrtDevice::VendorIds::NVIDIA); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule_nvidia); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "Nvidia_EP"); + } + + // Case 3: Vendor ID Match AMD + { + auto test_ep = CreateHwEp("AMD_EP", OrtHardwareDeviceType_GPU, OrtDevice::VendorIds::AMD); + 
OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule_amd); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "AMD_EP"); + } +} + +TEST(EpLayeringMatcherTest, MatchSpecificGPU_Heuristic) { + LayerAnnotation rule = {"gpu:nvidia", "Anno1", false}; + + // Case 1: kCudaExecutionProvider -> nvidia + { + // Need an EP with GPU HW type but generic vendor info to trigger the heuristic + auto test_ep_hw = CreateHwEp(kCudaExecutionProvider, OrtHardwareDeviceType_GPU); + OrtEpDevice ep_device = test_ep_hw.Get(); + std::vector devices = {&ep_device}; + + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCudaExecutionProvider); + } +} + +TEST(EpLayeringMatcherTest, MatchSpecificGPU_Index) { + LayerAnnotation rule = {"gpu:1", "Anno1", false}; + + // Case 1: ID Match + { + auto test_ep = CreateHwEp("GPU1", OrtHardwareDeviceType_GPU, 0, 1); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "GPU1"); + } + + // Case 2: ID Mismatch + { + auto test_ep = CreateHwEp("GPU0", OrtHardwareDeviceType_GPU, 0, 0); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + EXPECT_FALSE(result.has_value()); + } +} + +TEST(EpLayeringMatcherTest, MatchAccelerator) { + LayerAnnotation rule = {"accelerator", "Anno1", false}; + + // Case 1: CPU EP should NOT match + { + auto test_ep = CreateEp(kCpuExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + EXPECT_FALSE(result.has_value()); + } + + // Case 2: Custom EP, No HW/Mem info, considered accelerator + { + auto test_ep = CreateEp("MyCustomAccel"); + OrtEpDevice ep_device = 
test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyCustomAccel"); + } + + // Case 3: GPU HW is an accelerator + { + auto test_ep = CreateHwEp("MyGPU", OrtHardwareDeviceType_GPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyGPU"); + } +} + +TEST(EpLayeringMatcherTest, MatchNPU) { + LayerAnnotation rule = {"npu", "Anno1", false}; + + // Case 1: Hardware NPU + { + auto test_ep = CreateHwEp("MyNPU", OrtHardwareDeviceType_NPU); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyNPU"); + } + + // Case 2: QNN Heuristic + { + auto test_ep = CreateEp(kQnnExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kQnnExecutionProvider); + } +} + +TEST(EpLayeringMatcherTest, MatchFPGA) { + LayerAnnotation rule = {"fpga", "Anno1", false}; + + // Case 1: MemInfo says FPGA + { + auto test_ep = CreateMemEp("MyFPGA", OrtDevice::FPGA); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyFPGA"); + } +} + +TEST(EpLayeringMatcherTest, MatchDirectDesignators) { + LayerAnnotation rule_cuda = {"cuda", "A", false}; + LayerAnnotation rule_dml = {"dml", "B", false}; + + { + auto test_ep = CreateEp(kCudaExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule_cuda); + ASSERT_TRUE(result.has_value()); 
+ EXPECT_EQ(*result, kCudaExecutionProvider); + } + { + auto test_ep = CreateEp(kDmlExecutionProvider); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule_dml); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kDmlExecutionProvider); + } +} + +TEST(EpLayeringMatcherTest, MatchExactEPName) { + LayerAnnotation rule = {"MyCustomEP", "Anno1", false}; + + { + auto test_ep = CreateEp("MyCustomEP"); + OrtEpDevice ep_device = test_ep.Get(); + std::vector devices = {&ep_device}; + auto result = EpLayeringMatcher::Match(devices, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyCustomEP"); + } +} + +namespace { + +// Minimal concrete implementation of IExecutionProvider for testing +class MockExecutionProvider : public IExecutionProvider { + public: + MockExecutionProvider(const std::string& type, OrtDevice device) + : IExecutionProvider(type, device) {} + + std::shared_ptr GetKernelRegistry() const override { return nullptr; } +}; + +} // namespace + +TEST(EpLayeringMatcherTest, MatchExecutionProviders_CPU) { + LayerAnnotation rule = {"CPU", "Anno1", false}; + ExecutionProviders providers; + + // Add CPU provider + auto cpu_ep = std::make_shared(kCpuExecutionProvider, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCpuExecutionProvider, cpu_ep)); + + // Add a GPU provider (should be skipped for CPU rule) + auto gpu_ep = std::make_shared(kCudaExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCudaExecutionProvider, gpu_ep)); + + auto result = EpLayeringMatcher::Match(providers, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCpuExecutionProvider); +} + +TEST(EpLayeringMatcherTest, MatchExecutionProviders_GPU) { + LayerAnnotation rule = {"GPU", "Anno1", false}; + ExecutionProviders providers; + + // Add CPU provider (should be skipped) + auto 
cpu_ep = std::make_shared(kCpuExecutionProvider, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCpuExecutionProvider, cpu_ep)); + + // Add CUDA provider (GPU) + auto gpu_ep = std::make_shared(kCudaExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCudaExecutionProvider, gpu_ep)); + + auto result = EpLayeringMatcher::Match(providers, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCudaExecutionProvider); +} + +TEST(EpLayeringMatcherTest, MatchExecutionProviders_GPU_Specific) { + LayerAnnotation rule = {"gpu:nvidia", "Anno1", false}; // Assumes heuristics or vendor ID logic + ExecutionProviders providers; + + // Add CPU provider + auto cpu_ep = std::make_shared(kCpuExecutionProvider, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCpuExecutionProvider, cpu_ep)); + + // Add CUDA provider (NVIDIA vendor ID) + auto gpu_ep = std::make_shared(kCudaExecutionProvider, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, 0)); + ASSERT_STATUS_OK(providers.Add(kCudaExecutionProvider, gpu_ep)); + + auto result = EpLayeringMatcher::Match(providers, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, kCudaExecutionProvider); +} + +TEST(EpLayeringMatcherTest, MatchExecutionProviders_NoMatch) { + LayerAnnotation rule = {"GPU", "Anno1", false}; + ExecutionProviders providers; + + // Only CPU provider available + auto cpu_ep = std::make_shared(kCpuExecutionProvider, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCpuExecutionProvider, cpu_ep)); + + auto result = EpLayeringMatcher::Match(providers, rule); + EXPECT_FALSE(result.has_value()); +} + +TEST(EpLayeringMatcherTest, MatchExecutionProviders_Accelerator) { + LayerAnnotation rule = {"accelerator", "Anno1", false}; + ExecutionProviders providers; + + // Add CPU 
+ auto cpu_ep = std::make_shared(kCpuExecutionProvider, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add(kCpuExecutionProvider, cpu_ep)); + + // Add custom accelerator + auto accel_ep = std::make_shared("MyAccel", OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, 0, 0)); + ASSERT_STATUS_OK(providers.Add("MyAccel", accel_ep)); + + auto result = EpLayeringMatcher::Match(providers, rule); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(*result, "MyAccel"); +} + +TEST(LayeringIndexTest, AssignNodesBasedOnAnnotations) { + // 1. Setup Graph + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + // Create nodes + // Node 0: "AnnotatedNode" -> Annotated with "RuleA" + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg0 = &graph.GetOrCreateNodeArg("output0", &type_proto); + Node& node0 = graph.AddNode("node0", "Abs", "Node 0", {input_arg}, {output_arg0}); + node0.SetLayeringAnnotation("RuleA"); + + // Node 1: "UnannotatedNode" -> No annotation + NodeArg* output_arg1 = &graph.GetOrCreateNodeArg("output1", &type_proto); + Node& node1 = graph.AddNode("node1", "Abs", "Node 1", {output_arg0}, {output_arg1}); + // No annotation + + // Node 2: "AnnotatedNode2" -> Annotated with "RuleB" + NodeArg* output_arg2 = &graph.GetOrCreateNodeArg("output2", &type_proto); + Node& node2 = graph.AddNode("node2", "Abs", "Node 2", {output_arg1}, {output_arg2}); + node2.SetLayeringAnnotation("RuleB"); + + ASSERT_STATUS_OK(graph.Resolve()); + + // 2. 
Setup Rules and Matcher + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + rules.rules.push_back({"DeviceB", "RuleB", false}); // Index 1 + LayeringRuleMatcher matcher(rules); + + // 3. Setup Pre-computed Mappings (simulating Partitioning Manager) + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + ep_map["DeviceB"].insert(1); + + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + rule_map[1] = "DeviceB"; + + // 4. Create LayeringIndex + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // 5. Verify Assignments + // Node 0: Annotated "RuleA" -> Index 0 -> DeviceA + auto assign0 = index.GetNodeAssignment(graph, node0.Index()); + ASSERT_TRUE(assign0.has_value()); + EXPECT_EQ(*assign0, 0u); + + // Node 1: Unannotated -> Should generally map to nothing (unless defaulting logic exists, + // but current impl leaves unannotated in main graph as unassigned) + auto assign1 = index.GetNodeAssignment(graph, node1.Index()); + EXPECT_FALSE(assign1.has_value()); + + // Node 2: Annotated "RuleB" -> Index 1 -> DeviceB + auto assign2 = index.GetNodeAssignment(graph, node2.Index()); + ASSERT_TRUE(assign2.has_value()); + EXPECT_EQ(*assign2, 1u); +} + +TEST(LayeringIndexTest, AssignNodeWithInvalidEpMapping) { + // Scenario: Node annotated with a rule that maps to an EP that is NOT present/valid + + // 1. 
Setup Graph with one node annotated "RuleX" + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& node = graph.AddNode("node", "Abs", "Node", {input_arg}, {output_arg}); + node.SetLayeringAnnotation("RuleX"); + + ASSERT_STATUS_OK(graph.Resolve()); + + // 2. Setup Rules: RuleX exists at index 0, maps to "PhantomDevice" + LayeringRules rules; + rules.rules.push_back({"PhantomDevice", "RuleX", false}); // Index 0 + + // 3. Setup Mappings: But "PhantomDevice" is NOT in the mappings (simulating EP unavailable) + LayeringIndex::EpNameToLayeringIndices ep_map; + // ep_map["PhantomDevice"] is empty/missing + + LayeringIndex::LayeringIndexToEpName rule_map; + // rule_map[0] is missing + + // 4. Create Index + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + // 5. Verify: Node should NOT be assigned because the mapped EP is missing + auto assign = index.GetNodeAssignment(graph, node.Index()); + EXPECT_FALSE(assign.has_value()); +} + +TEST(LayeringIndexTest, SubgraphInheritance) { + // Scenario: Annotated Node containing a subgraph. + // Nodes inside subgraph (unannotated) should inherit parent's assignment. + + // 1. 
Setup Parent Graph + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + NodeArg* cond_arg = &graph.GetOrCreateNodeArg("cond", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + // Create "If" node + Node& if_node = graph.AddNode("if_node", "If", "If Node", {cond_arg}, {output_arg}); + if_node.SetLayeringAnnotation("RuleA"); // Annotate Parent + + auto build_subgraph = [](ONNX_NAMESPACE::GraphProto& proto, const std::string& graph_name, + const std::string& node_name, const std::string& input_name, const std::string& output_name) { + proto.set_name(graph_name); + // Inputs: Implicit from outer scope for 'cond' + + auto* node = proto.add_node(); + node->set_name(node_name); + node->set_op_type("Identity"); + node->add_input(input_name); + node->add_output(output_name); + + auto* out_vi = proto.add_output(); + out_vi->set_name(output_name); + out_vi->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + }; + + // Create Subgraph (then_branch) + ONNX_NAMESPACE::GraphProto then_graph_proto; + build_subgraph(then_graph_proto, "then_graph", "sub_node", "cond", "sub_out"); + if_node.AddAttribute("then_branch", then_graph_proto); + + // Create 'else_branch' + ONNX_NAMESPACE::GraphProto else_graph_proto; + build_subgraph(else_graph_proto, "else_graph", "else_sub_node", "cond", "else_sub_out"); + if_node.AddAttribute("else_branch", else_graph_proto); + + // First Resolve to create subgraph instances + ASSERT_STATUS_OK(graph.Resolve()); + + // Get subgraph instances (checked to ensure they exist) + Graph* then_graph = 
if_node.GetMutableGraphAttribute("then_branch"); + ASSERT_NE(then_graph, nullptr); + Graph* else_graph = if_node.GetMutableGraphAttribute("else_branch"); + ASSERT_NE(else_graph, nullptr); + + // 2. Setup Rules + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + // 3. Create Index + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // 4. Verify Parent Assignment + auto assign_parent = index.GetNodeAssignment(graph, if_node.Index()); + ASSERT_TRUE(assign_parent.has_value()); + EXPECT_EQ(*assign_parent, 0u); + + // 5. Verify Subgraph Node Assignment (Inheritance) + bool validated_then = false; + for (const auto& node : then_graph->Nodes()) { + if (node.OpType() == "Identity") { + auto assign_sub = index.GetNodeAssignment(*then_graph, node.Index()); + ASSERT_TRUE(assign_sub.has_value()) << "Subgraph node should inherit parent annotation"; + EXPECT_EQ(*assign_sub, 0u); + validated_then = true; + } + } + ASSERT_TRUE(validated_then); +} + +TEST(LayeringIndexTest, SubgraphOverride) { + // Scenario: Annotated Node containing a subgraph. + // Node inside subgraph HAS annotation -> Should override inheritance. 
+ + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + NodeArg* cond_arg = &graph.GetOrCreateNodeArg("cond", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& if_node = graph.AddNode("if_node", "If", "If Node", {cond_arg}, {output_arg}); + if_node.SetLayeringAnnotation("RuleA"); // Annotate Parent = Rule A (Index 0) + + auto build_subgraph = [](ONNX_NAMESPACE::GraphProto& proto, const std::string& graph_name, + const std::string& node_name, const std::string& input_name, const std::string& output_name) { + proto.set_name(graph_name); + + auto* node = proto.add_node(); + node->set_name(node_name); + node->set_op_type("Identity"); + node->add_input(input_name); + node->add_output(output_name); + + auto* out_vi = proto.add_output(); + out_vi->set_name(output_name); + out_vi->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + }; + + ONNX_NAMESPACE::GraphProto then_graph_proto; + build_subgraph(then_graph_proto, "then_graph", "sub_node", "cond", "sub_out"); + if_node.AddAttribute("then_branch", then_graph_proto); + + ONNX_NAMESPACE::GraphProto else_graph_proto; + build_subgraph(else_graph_proto, "else_graph", "else_sub_node", "cond", "else_sub_out"); + if_node.AddAttribute("else_branch", else_graph_proto); + + ASSERT_STATUS_OK(graph.Resolve()); + + Graph* then_graph = if_node.GetMutableGraphAttribute("then_branch"); + ASSERT_NE(then_graph, nullptr); + + // Find sub_node to set annotation + Node* sub_node = nullptr; + for (auto& node : then_graph->Nodes()) { + if (node.Name() == "sub_node") { + 
sub_node = &node; + break; + } + } + ASSERT_NE(sub_node, nullptr); + + // OVERRIDE: Annotate sub_node with Rule B + sub_node->SetLayeringAnnotation("RuleB"); + + // Rules: RuleA(0)->DeviceA, RuleB(1)->DeviceB + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); + rules.rules.push_back({"DeviceB", "RuleB", false}); + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + ep_map["DeviceB"].insert(1); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + rule_map[1] = "DeviceB"; + + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // Verify Parent = 0 + auto assign_parent = index.GetNodeAssignment(graph, if_node.Index()); + ASSERT_TRUE(assign_parent.has_value()); + EXPECT_EQ(*assign_parent, 0u); + + // Verify Sub = 1 (Override) + auto assign_sub = index.GetNodeAssignment(*then_graph, sub_node->Index()); + ASSERT_TRUE(assign_sub.has_value()); + EXPECT_EQ(*assign_sub, 1u); +} + +TEST(LayeringIndexTest, UpdateIndex) { + // 1. Setup Graph with one node + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& node = graph.AddNode("node", "Abs", "Node", {input_arg}, {output_arg}); + ASSERT_STATUS_OK(graph.Resolve()); + + // 2. 
Setup Rules and Index + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + // Creates index (node has no annotation, so not assigned) + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + EXPECT_FALSE(index.GetNodeAssignment(graph, node.Index()).has_value()); + + // 3. Update Node with Annotation + node.SetLayeringAnnotation("RuleA"); + + // 4. Call Update + std::vector nodes_to_update = {node.Index()}; + index.Update(graph, nodes_to_update); + + // 5. Verify Assignment + auto assignment = index.GetNodeAssignment(graph, node.Index()); + ASSERT_TRUE(assignment.has_value()); + EXPECT_EQ(*assignment, 0u); +} + +TEST(LayeringRulesTest, LayeringRulesParsing) { + // Test empty string + { + LayeringRules rules; + ASSERT_STATUS_OK(LayeringRules::FromConfigString("", rules)); + EXPECT_TRUE(rules.rules.empty()); + } + + // Test simple valid string + { + LayeringRules rules; + ASSERT_STATUS_OK(LayeringRules::FromConfigString("EP1(Annotation1)", rules)); + ASSERT_EQ(rules.rules.size(), 1u); + EXPECT_EQ(rules.rules[0].device, "EP1"); + EXPECT_EQ(rules.rules[0].annotation, "Annotation1"); + EXPECT_TRUE(rules.rules[0].prefix_match); + } + + // Test multiple annotations for one device + { + LayeringRules rules; + ASSERT_STATUS_OK(LayeringRules::FromConfigString("EP1(Annotation1, Annotation2)", rules)); + ASSERT_EQ(rules.rules.size(), 2u); + EXPECT_EQ(rules.rules[0].device, "EP1"); + EXPECT_EQ(rules.rules[0].annotation, "Annotation1"); + EXPECT_TRUE(rules.rules[0].prefix_match); + EXPECT_EQ(rules.rules[1].device, "EP1"); + EXPECT_EQ(rules.rules[1].annotation, "Annotation2"); + EXPECT_TRUE(rules.rules[1].prefix_match); + } + + // Test multiple devices + { + LayeringRules rules; + 
ASSERT_STATUS_OK(LayeringRules::FromConfigString("EP1(Annotation1); EP2(Annotation2)", rules)); + ASSERT_EQ(rules.rules.size(), 2u); + EXPECT_EQ(rules.rules[0].device, "EP1"); + EXPECT_EQ(rules.rules[0].annotation, "Annotation1"); + EXPECT_TRUE(rules.rules[0].prefix_match); + EXPECT_EQ(rules.rules[1].device, "EP2"); + EXPECT_EQ(rules.rules[1].annotation, "Annotation2"); + EXPECT_TRUE(rules.rules[1].prefix_match); + } + + // Test exact match + { + LayeringRules rules; + ASSERT_STATUS_OK(LayeringRules::FromConfigString("EP1(=Annotation1)", rules)); + ASSERT_EQ(rules.rules.size(), 1u); + EXPECT_EQ(rules.rules[0].device, "EP1"); + EXPECT_EQ(rules.rules[0].annotation, "Annotation1"); + EXPECT_FALSE(rules.rules[0].prefix_match); + } + + // Test trimming whitespace + { + LayeringRules rules; + ASSERT_STATUS_OK(LayeringRules::FromConfigString(" EP1 ( Annotation1 , =Annotation2 ) ; EP2 ( Annotation3 ) ", rules)); + ASSERT_EQ(rules.rules.size(), 3u); + EXPECT_EQ(rules.rules[0].device, "EP1"); + EXPECT_EQ(rules.rules[0].annotation, "Annotation1"); + EXPECT_TRUE(rules.rules[0].prefix_match); + EXPECT_EQ(rules.rules[1].device, "EP1"); + EXPECT_EQ(rules.rules[1].annotation, "Annotation2"); + EXPECT_FALSE(rules.rules[1].prefix_match); + EXPECT_EQ(rules.rules[2].device, "EP2"); + EXPECT_EQ(rules.rules[2].annotation, "Annotation3"); + EXPECT_TRUE(rules.rules[2].prefix_match); + } +} + +TEST(LayeringRulesTest, FromConfigString_InvalidFormat) { + LayeringRules rules; + + // Error: Missing parentheses structure entirely + EXPECT_FALSE(LayeringRules::FromConfigString("Device1Annotation1", rules).IsOK()); + + // Error: Missing closing parenthesis + EXPECT_FALSE(LayeringRules::FromConfigString("Device1(Annotation1", rules).IsOK()); + + // Error: Missing opening parenthesis (or only closing present) + EXPECT_FALSE(LayeringRules::FromConfigString("Device1Annotation1)", rules).IsOK()); + + // Error: Parentheses reversed + EXPECT_FALSE(LayeringRules::FromConfigString("Device1)Annotation1(", 
rules).IsOK()); + + // Error: Empty device name (starts with parenthesis) + EXPECT_FALSE(LayeringRules::FromConfigString("(Annotation1)", rules).IsOK()); +} + +TEST(LayeringRulesTest, FromConfigString_IgnoresEmptyEntries) { + LayeringRules rules; + // "; ;" should result in 0 rules but Status::OK + ASSERT_STATUS_OK(LayeringRules::FromConfigString("; ;", rules)); + EXPECT_TRUE(rules.rules.empty()); +} + +TEST(LayeringRulesTest, FromConfigString_RejectsDuplicateAnnotations) { + LayeringRules rules; + + // Duplicate prefix annotation within the same device + EXPECT_FALSE(LayeringRules::FromConfigString("EP1(Ann1, Ann1)", rules).IsOK()); + + // Duplicate prefix annotation across different devices + EXPECT_FALSE(LayeringRules::FromConfigString("EP1(Ann1); EP2(Ann1)", rules).IsOK()); + + // Duplicate exact annotation within the same device + EXPECT_FALSE(LayeringRules::FromConfigString("EP1(=Ann1, =Ann1)", rules).IsOK()); + + // Duplicate exact annotation across different devices + EXPECT_FALSE(LayeringRules::FromConfigString("EP1(=Ann1); EP2(=Ann1)", rules).IsOK()); + + // Same annotation but different match types (prefix vs exact) should be OK + ASSERT_STATUS_OK(LayeringRules::FromConfigString("EP1(Ann1, =Ann1)", rules)); + ASSERT_EQ(rules.rules.size(), 2u); + EXPECT_TRUE(rules.rules[0].prefix_match); + EXPECT_FALSE(rules.rules[1].prefix_match); +} + +TEST(LayeringIndexTest, MakeNodeUnassigned_PreservesEpRuleMapping) { + // Scenario: All nodes for a rule are unassigned in one graph. + // ep_name_to_layering_indices_ must still contain the rule so that + // sibling subgraphs (or the same graph on a subsequent pass) can still + // use it for filtering. + + // 1. 
Setup Graph with two nodes, both annotated with the same rule + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + // Create nodes + // Node 0: "AnnotatedNode" -> Annotated with "RuleA" + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* mid_arg = &graph.GetOrCreateNodeArg("mid", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& node0 = graph.AddNode("node0", "Abs", "Node 0", {input_arg}, {mid_arg}); + node0.SetLayeringAnnotation("RuleA"); + Node& node1 = graph.AddNode("node1", "Abs", "Node 1", {mid_arg}, {output_arg}); + node1.SetLayeringAnnotation("RuleA"); + + ASSERT_STATUS_OK(graph.Resolve()); + + // 2. Setup Rules: RuleA -> DeviceA + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + // 3. Create Index + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // Both nodes should be assigned + ASSERT_TRUE(index.GetNodeAssignment(graph, node0.Index()).has_value()); + ASSERT_TRUE(index.GetNodeAssignment(graph, node1.Index()).has_value()); + + // 3. 
Unassign both nodes (simulating EP failing to claim them) + index.MakeNodeUnassigned(graph, node0.Index()); + index.MakeNodeUnassigned(graph, node1.Index()); + + // Nodes should be unassigned + EXPECT_FALSE(index.GetNodeAssignment(graph, node0.Index()).has_value()); + EXPECT_FALSE(index.GetNodeAssignment(graph, node1.Index()).has_value()); + + // 4. CRITICAL: ep_name_to_layering_indices_ must still map DeviceA -> {0} + // so that other graphs/passes can still use this rule for filtering. + auto rules_opt = index.GetLayeringRulesForThisEp("DeviceA"); + ASSERT_TRUE(rules_opt.has_value()) << "EP-to-rule mapping should not be erased when nodes are unassigned"; + EXPECT_EQ(rules_opt->get().count(0), 1u); +} + +TEST(LayeringIndexTest, UpdateAfterFullUnassignment_RestoresVisibility) { + // Scenario: All nodes for a rule are unassigned, then Update() adds + // a new node matching the same rule. The new node must be visible + // to the EP via GetLayeringRulesForThisEp. + + // 1. Setup Graph with one annotated node + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& node0 = graph.AddNode("node0", "Abs", "Node 0", {input_arg}, {output_arg}); + node0.SetLayeringAnnotation("RuleA"); + ASSERT_STATUS_OK(graph.Resolve()); + + // 2. 
Setup Rules: RuleA -> DeviceA + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + ASSERT_TRUE(index.GetNodeAssignment(graph, node0.Index()).has_value()); + + // 3. Unassign the only node + index.MakeNodeUnassigned(graph, node0.Index()); + EXPECT_FALSE(index.GetNodeAssignment(graph, node0.Index()).has_value()); + + // 4. Simulate layout transform adding a new node with inherited annotation + NodeArg* new_output_arg = &graph.GetOrCreateNodeArg("new_output", &type_proto); + Node& new_node = graph.AddNode("new_node", "Abs", "Node with inherited assignment", + {output_arg}, {new_output_arg}); + new_node.SetLayeringAnnotation("RuleA"); // Inherits parent's annotation + ASSERT_STATUS_OK(graph.Resolve()); + + // Record the new node index + NodeIndex new_node_index = new_node.Index(); + + // 5. Update index with the new node + std::vector new_nodes = {new_node_index}; + index.Update(graph, new_nodes); + + // 6. New node should be assigned to rule 0 + auto assign = index.GetNodeAssignment(graph, new_node.Index()); + ASSERT_TRUE(assign.has_value()); + EXPECT_EQ(*assign, 0u); + + // 7. CRITICAL: The rule must still be visible for DeviceA + auto rules_opt = index.GetLayeringRulesForThisEp("DeviceA"); + ASSERT_TRUE(rules_opt.has_value()) << "EP-to-rule mapping must be intact for Update to be effective"; + EXPECT_EQ(rules_opt->get().count(0), 1u); +} + +// ============================================================================ +// Tests for graph_partitioner.cc LayeringIndex integration +// These tests exercise behaviors from GetCapabilityForEP, InlineNodes, and +// the partitioning pipeline when a LayeringIndex is present. 
+// ============================================================================ + +// Helper to create a simple linear graph: input -> node0 -> node1 -> ... -> output +namespace { + +struct SimpleGraphHelper { + std::unique_ptr model; + Graph* graph = nullptr; + std::vector node_indices; + + static SimpleGraphHelper Create(int num_nodes, const std::string& op_type = "Abs") { + SimpleGraphHelper h; + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + h.model = std::make_unique("test_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + h.graph = &h.model->MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + NodeArg* prev_arg = &h.graph->GetOrCreateNodeArg("input", &type_proto); + + for (int i = 0; i < num_nodes; ++i) { + std::string out_name = (i == num_nodes - 1) ? 
"output" : "mid_" + std::to_string(i); + NodeArg* out_arg = &h.graph->GetOrCreateNodeArg(out_name, &type_proto); + Node& node = h.graph->AddNode("node_" + std::to_string(i), op_type, + "Node " + std::to_string(i), {prev_arg}, {out_arg}); + h.node_indices.push_back(node.Index()); + prev_arg = out_arg; + } + return h; + } +}; + +LayeringIndex CreateTwoEpIndex(const Graph& graph, + const std::string& ep_a, const std::string& annotation_a, + const std::string& ep_b, const std::string& annotation_b) { + LayeringRules rules; + rules.rules.push_back({ep_a, annotation_a, false}); // Index 0 + rules.rules.push_back({ep_b, annotation_b, false}); // Index 1 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map[ep_a].insert(0); + ep_map[ep_b].insert(1); + + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = ep_a; + rule_map[1] = ep_b; + + return LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); +} + +} // namespace + +TEST(LayeringIndexPartitionerTest, FilteredGraphViewerExcludesOtherEpNodes) { + // Validates the filtering logic in create_graph_viewer (GetCapabilityForEP): + // When layering_index is present, nodes assigned to other EPs should be excluded + // from the GraphViewer presented to the current EP. 
+ + // Setup: 3-node chain, node0 -> RuleA (DeviceA), node1 -> unannotated, node2 -> RuleB (DeviceB) + auto h = SimpleGraphHelper::Create(3); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + node0->SetLayeringAnnotation("RuleA"); + node2->SetLayeringAnnotation("RuleB"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + // Verify: From DeviceA's perspective, node2 should be excluded + auto rules_a = index.GetLayeringRulesForThisEp("DeviceA"); + ASSERT_TRUE(rules_a.has_value()); + + // node0 should be assigned to rule 0 (DeviceA) + auto assign0 = index.GetNodeAssignment(*h.graph, h.node_indices[0]); + ASSERT_TRUE(assign0.has_value()); + EXPECT_EQ(*assign0, 0u); + + // node1 should be unassigned (available to any EP) + auto assign1 = index.GetNodeAssignment(*h.graph, h.node_indices[1]); + EXPECT_FALSE(assign1.has_value()); + + // node2 should be assigned to rule 1 (DeviceB) + auto assign2 = index.GetNodeAssignment(*h.graph, h.node_indices[2]); + ASSERT_TRUE(assign2.has_value()); + EXPECT_EQ(*assign2, 1u); + + // Simulate the filtering logic from create_graph_viewer: + // For DeviceA: include nodes with no assignment OR assignment in DeviceA's rules + InlinedVector filtered_for_device_a; + for (auto& node : h.graph->Nodes()) { + auto rule_idx_opt = index.GetNodeAssignment(*h.graph, node.Index()); + bool include = true; + if (rule_idx_opt) { + // Node has assignment - include only if it belongs to DeviceA + if (rules_a->get().count(*rule_idx_opt) == 0) { + include = false; + } + } + if (include) { + filtered_for_device_a.push_back(&node); + } + } + + // DeviceA should see node0 (assigned to it) and node1 (unassigned), but NOT node2 + EXPECT_EQ(filtered_for_device_a.size(), 2u); + bool found_node0 = false, found_node1 = false, found_node2 = false; + for (const auto* n : filtered_for_device_a) { + if (n->Index() == h.node_indices[0]) 
found_node0 = true; + if (n->Index() == h.node_indices[1]) found_node1 = true; + if (n->Index() == h.node_indices[2]) found_node2 = true; + } + EXPECT_TRUE(found_node0) << "DeviceA's assigned node should be included"; + EXPECT_TRUE(found_node1) << "Unassigned node should be included for any EP"; + EXPECT_FALSE(found_node2) << "DeviceB's assigned node should be excluded from DeviceA's view"; +} + +TEST(LayeringIndexPartitionerTest, FilteredGraphViewerForDeviceBExcludesDeviceANodes) { + // Mirror of the above test but from DeviceB's perspective. + + auto h = SimpleGraphHelper::Create(3); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + node0->SetLayeringAnnotation("RuleA"); + node2->SetLayeringAnnotation("RuleB"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + auto rules_b = index.GetLayeringRulesForThisEp("DeviceB"); + ASSERT_TRUE(rules_b.has_value()); + + // Simulate filtering for DeviceB + InlinedVector filtered_for_device_b; + for (auto& node : h.graph->Nodes()) { + auto rule_idx_opt = index.GetNodeAssignment(*h.graph, node.Index()); + bool include = true; + if (rule_idx_opt) { + if (rules_b->get().count(*rule_idx_opt) == 0) { + include = false; + } + } + if (include) { + filtered_for_device_b.push_back(&node); + } + } + + // DeviceB should see node1 (unassigned) and node2 (assigned to it), but NOT node0 + EXPECT_EQ(filtered_for_device_b.size(), 2u); + bool found_node0 = false, found_node1 = false, found_node2 = false; + for (const auto* n : filtered_for_device_b) { + if (n->Index() == h.node_indices[0]) found_node0 = true; + if (n->Index() == h.node_indices[1]) found_node1 = true; + if (n->Index() == h.node_indices[2]) found_node2 = true; + } + EXPECT_FALSE(found_node0) << "DeviceA's assigned node should be excluded from DeviceB's view"; + EXPECT_TRUE(found_node1) << "Unassigned node should be included for any EP"; + 
EXPECT_TRUE(found_node2) << "DeviceB's assigned node should be included"; +} + +TEST(LayeringIndexPartitionerTest, ResetUnclaimedNodesRemovesAssignment) { + // Validates the reset_assignment_unclaimed_nodes logic: + // Nodes that were pre-assigned to an EP via layering but NOT claimed in capabilities + // should be unassigned so subsequent EPs can pick them up. + + auto h = SimpleGraphHelper::Create(4); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node1 = h.graph->GetNode(h.node_indices[1]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + + node0->SetLayeringAnnotation("RuleA"); + node1->SetLayeringAnnotation("RuleA"); + node2->SetLayeringAnnotation("RuleA"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + auto index = LayeringIndex::Create(*h.graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // All 3 nodes should be assigned initially + ASSERT_TRUE(index.GetNodeAssignment(*h.graph, h.node_indices[0]).has_value()); + ASSERT_TRUE(index.GetNodeAssignment(*h.graph, h.node_indices[1]).has_value()); + ASSERT_TRUE(index.GetNodeAssignment(*h.graph, h.node_indices[2]).has_value()); + + // Simulate: EP only claims node0 and node2 (not node1) + InlinedHashSet claimed; + claimed.insert(h.node_indices[0]); + claimed.insert(h.node_indices[2]); + + auto ep_rules_opt = index.GetLayeringRulesForThisEp("DeviceA"); + ASSERT_TRUE(ep_rules_opt.has_value()); + const auto& ep_rules = ep_rules_opt->get(); + + // Replicate reset_assignment_unclaimed_nodes logic: + // For each assigned-filtered-in node, if not claimed, unassign it + std::vector assigned_filtered_in = {h.node_indices[0], h.node_indices[1], h.node_indices[2]}; + for (auto node_index : assigned_filtered_in) { + if (claimed.count(node_index) == 
0) { + auto rule_idx_opt = index.GetNodeAssignment(*h.graph, node_index); + if (rule_idx_opt && ep_rules.count(*rule_idx_opt) > 0) { + index.MakeNodeUnassigned(*h.graph, node_index); + } + } + } + + // node0 and node2 should still be assigned + EXPECT_TRUE(index.GetNodeAssignment(*h.graph, h.node_indices[0]).has_value()); + EXPECT_TRUE(index.GetNodeAssignment(*h.graph, h.node_indices[2]).has_value()); + // node1 should be unassigned (not claimed by EP) + EXPECT_FALSE(index.GetNodeAssignment(*h.graph, h.node_indices[1]).has_value()); +} + +TEST(LayeringIndexPartitionerTest, UpdateAfterLayoutTransformAddsNewNodes) { + // Validates the LayeringIndex update after layout transformation creates new nodes. + // In GetCapabilityForEP, after layout transform, new nodes with inherited annotations + // are added and the index is updated. + + auto h = SimpleGraphHelper::Create(1); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + node0->SetLayeringAnnotation("RuleA"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + // Record the max node index before "layout transformation" + const NodeIndex first_new_node = h.graph->MaxNodeIndex(); + + // Simulate layout transformation adding new nodes with inherited annotation + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* extra_out = &h.graph->GetOrCreateNodeArg("extra_output", &type_proto); + NodeArg* output_arg = &h.graph->GetOrCreateNodeArg("output", nullptr); // reuse existing + Node& new_node = h.graph->AddNode("new_node", "Abs", "Node with inherited annotation", + {output_arg}, {extra_out}); + new_node.SetLayeringAnnotation("RuleA"); // Inherits parent's annotation + ASSERT_STATUS_OK(h.graph->Resolve()); + + const NodeIndex end_node = h.graph->MaxNodeIndex(); + + // Collect new node indices (as done in graph_partitioner.cc) + InlinedVector 
new_node_indices; + for (NodeIndex idx = first_new_node; idx < end_node; ++idx) { + if (h.graph->GetNode(idx) != nullptr) { + new_node_indices.push_back(idx); + } + } + + // Update index + ASSERT_FALSE(new_node_indices.empty()); + index.Update(*h.graph, new_node_indices); + + // New node should be assigned to rule 0 (DeviceA) + auto assign = index.GetNodeAssignment(*h.graph, new_node.Index()); + ASSERT_TRUE(assign.has_value()); + EXPECT_EQ(*assign, 0u); + + // And the annotation string should be on the node + EXPECT_EQ(new_node.GetLayeringAnnotation(), "RuleA"); +} + +TEST(LayeringIndexPartitionerTest, UpdateWithUnannotatedNewNodeRemainsUnassigned) { + // New nodes created by layout transform that do NOT have annotations + // should remain unassigned after Update. + + auto h = SimpleGraphHelper::Create(1); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + node0->SetLayeringAnnotation("RuleA"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + // Add a new node WITHOUT annotation + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* extra_out = &h.graph->GetOrCreateNodeArg("extra_output", &type_proto); + NodeArg* output_arg = &h.graph->GetOrCreateNodeArg("output", nullptr); + Node& new_node = h.graph->AddNode("unannotated_node", "Abs", "No annotation", + {output_arg}, {extra_out}); + // Deliberately NOT setting annotation + ASSERT_STATUS_OK(h.graph->Resolve()); + + std::vector new_nodes = {new_node.Index()}; + index.Update(*h.graph, new_nodes); + + // New node should remain unassigned + auto assign = index.GetNodeAssignment(*h.graph, new_node.Index()); + EXPECT_FALSE(assign.has_value()); +} + +TEST(LayeringIndexPartitionerTest, InlineAnnotationMaterialization) { + // Validates the InlineNodes logic where a node has an inherited-only assignment + // (no explicit annotation string) and the 
annotation is materialized before inlining. + // This tests the code path: + // if (layering_index != nullptr && !has_explicit_annotation) { + // auto rule_idx = layering_index->GetNodeAssignment(graph, node->Index()); + // if (rule_idx) { ... node->SetLayeringAnnotation(rules.rules[*rule_idx].annotation); } + // } + + // Setup: A graph where a node is assigned via inheritance (subgraph scenario) + // but has no explicit annotation string on it. + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + // Create a node without explicit annotation + Node& node = graph.AddNode("inherited_node", "Abs", "Node with inherited assignment", + {input_arg}, {output_arg}); + ASSERT_STATUS_OK(graph.Resolve()); + + // Create index where the node is somehow assigned (e.g., through inheritance) + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA", false}); // Index 0 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // The node has no annotation, so it shouldn't be assigned yet + ASSERT_TRUE(node.GetLayeringAnnotation().empty()); + EXPECT_FALSE(index.GetNodeAssignment(graph, node.Index()).has_value()); + + // Now simulate what InlineNodes does: manually annotate and update + // This simulates the case where GetNodeAssignment returns a 
value + // for a node in a subgraph that inherited its parent's assignment. + node.SetLayeringAnnotation("RuleA"); + std::vector updated = {node.Index()}; + index.Update(graph, updated); + + // After materialization + update, the node should be properly assigned + auto assign = index.GetNodeAssignment(graph, node.Index()); + ASSERT_TRUE(assign.has_value()); + EXPECT_EQ(*assign, 0u); + + // And the annotation string should be on the node + EXPECT_EQ(node.GetLayeringAnnotation(), "RuleA"); +} + +TEST(LayeringIndexPartitionerTest, UpdateBatchMultipleNewAnnotatedNodes) { + // Tests that Update correctly handles a batch of multiple new nodes, + // some annotated with different rules. This mirrors the behavior after + // layout transformation creates several new nodes. + + auto h = SimpleGraphHelper::Create(1); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + node0->SetLayeringAnnotation("RuleA"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + // Add 3 new nodes: one for RuleA, one for RuleB, one unannotated + NodeArg* out1 = &h.graph->GetOrCreateNodeArg("new_out1", &type_proto); + NodeArg* out2 = &h.graph->GetOrCreateNodeArg("new_out2", &type_proto); + NodeArg* out3 = &h.graph->GetOrCreateNodeArg("new_out3", &type_proto); + NodeArg* output = &h.graph->GetOrCreateNodeArg("output", nullptr); + + Node& new_a = h.graph->AddNode("new_a", "Abs", "", {output}, {out1}); + new_a.SetLayeringAnnotation("RuleA"); + + Node& new_b = h.graph->AddNode("new_b", "Abs", "", {out1}, {out2}); + new_b.SetLayeringAnnotation("RuleB"); + + Node& new_none = h.graph->AddNode("new_none", "Abs", "", {out2}, {out3}); + // No annotation + + ASSERT_STATUS_OK(h.graph->Resolve()); + + std::vector new_nodes = {new_a.Index(), new_b.Index(), new_none.Index()}; + index.Update(*h.graph, 
new_nodes); + + // new_a -> RuleA -> rule index 0 + auto assign_a = index.GetNodeAssignment(*h.graph, new_a.Index()); + ASSERT_TRUE(assign_a.has_value()); + EXPECT_EQ(*assign_a, 0u); + + // new_b -> RuleB -> rule index 1 + auto assign_b = index.GetNodeAssignment(*h.graph, new_b.Index()); + ASSERT_TRUE(assign_b.has_value()); + EXPECT_EQ(*assign_b, 1u); + + // new_none -> unassigned + auto assign_none = index.GetNodeAssignment(*h.graph, new_none.Index()); + EXPECT_FALSE(assign_none.has_value()); +} + +TEST(LayeringIndexPartitionerTest, MakeUnassignedThenReassignViaPrefixRule) { + // Test that prefix rules work correctly after unassign+update cycle. + // This covers the interaction between MakeNodeUnassigned, prefix matching, + // and Update. + + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + NodeArg* input_arg = &graph.GetOrCreateNodeArg("input", &type_proto); + NodeArg* output_arg = &graph.GetOrCreateNodeArg("output", &type_proto); + + Node& node = graph.AddNode("node", "Abs", "Node", {input_arg}, {output_arg}); + node.SetLayeringAnnotation("Layer_GPU_Compute"); + ASSERT_STATUS_OK(graph.Resolve()); + + // Prefix rule: "Layer_GPU" matches "Layer_GPU_Compute" + LayeringRules rules; + rules.rules.push_back({"GPUDevice", "Layer_GPU", true}); // Index 0, prefix match + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["GPUDevice"].insert(0); + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "GPUDevice"; + + auto index = LayeringIndex::Create(graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + // Node should be assigned via prefix match + auto assign = 
index.GetNodeAssignment(graph, node.Index()); + ASSERT_TRUE(assign.has_value()); + EXPECT_EQ(*assign, 0u); + + // Unassign the node + index.MakeNodeUnassigned(graph, node.Index()); + EXPECT_FALSE(index.GetNodeAssignment(graph, node.Index()).has_value()); + + // Add a new node with a different annotation that also matches the prefix + NodeArg* new_out = &graph.GetOrCreateNodeArg("new_output", &type_proto); + Node& new_node = graph.AddNode("new_node", "Abs", "Node with inherited annotation", + {output_arg}, {new_out}); + new_node.SetLayeringAnnotation("Layer_GPU_Memory"); + ASSERT_STATUS_OK(graph.Resolve()); + + std::vector new_nodes = {new_node.Index()}; + index.Update(graph, new_nodes); + + // New node should also be assigned via prefix match + auto new_assign = index.GetNodeAssignment(graph, new_node.Index()); + ASSERT_TRUE(new_assign.has_value()); + EXPECT_EQ(*new_assign, 0u); +} + +TEST(LayeringIndexPartitionerTest, NoLayeringIndexAllNodesVisible) { + // When layering_index is nullptr (no layering configuration), + // all nodes should be visible to all EPs. This verifies the baseline + // behavior that the filtering code path is only active when layering is enabled. 
+ + auto h = SimpleGraphHelper::Create(3); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + + // Even if nodes have annotations, without a LayeringIndex, everything is visible + node0->SetLayeringAnnotation("RuleA"); + node2->SetLayeringAnnotation("RuleB"); + ASSERT_STATUS_OK(h.graph->Resolve()); + + // Without LayeringIndex, a standard GraphViewer should see all nodes + GraphViewer viewer(*h.graph); + EXPECT_EQ(viewer.NumberOfNodes(), 3); + + // All nodes accessible + EXPECT_NE(viewer.GetNode(h.node_indices[0]), nullptr); + EXPECT_NE(viewer.GetNode(h.node_indices[1]), nullptr); + EXPECT_NE(viewer.GetNode(h.node_indices[2]), nullptr); +} + +TEST(LayeringIndexPartitionerTest, EpWithNoLayeringRulesSeesAllUnassignedNodes) { + // An EP that has no rules in the LayeringIndex (i.e., GetLayeringRulesForThisEp returns nullopt) + // should still see unassigned nodes, but nodes assigned to other EPs are excluded. + // This is the behavior for a CPU fallback EP not mentioned in layering config, + // as implemented in graph_partitioner.cc create_graph_viewer: + // if (!rules_opt || rules_opt->get().count(*rule_idx_opt) == 0) { include = false; } + + auto h = SimpleGraphHelper::Create(4); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + node0->SetLayeringAnnotation("RuleA"); + node2->SetLayeringAnnotation("RuleB"); + // node1 and node3 are unannotated + ASSERT_STATUS_OK(h.graph->Resolve()); + + auto index = CreateTwoEpIndex(*h.graph, "DeviceA", "RuleA", "DeviceB", "RuleB"); + + // "CPUDevice" has no rules in the index + auto rules_cpu = index.GetLayeringRulesForThisEp("CPUDevice"); + EXPECT_FALSE(rules_cpu.has_value()); + + // Replicate create_graph_viewer filtering logic for an EP with no rules. + // When rules_opt is nullopt, any node with an assignment is excluded: + // if (!rules_opt || ...) { include = false; } + // Unassigned nodes remain included. 
+ InlinedVector filtered_for_cpu; + for (auto& node : h.graph->Nodes()) { + auto rule_idx_opt = index.GetNodeAssignment(*h.graph, node.Index()); + bool include = true; + if (rule_idx_opt) { + if (!rules_cpu || rules_cpu->get().count(*rule_idx_opt) == 0) { + include = false; + } + } + if (include) { + filtered_for_cpu.push_back(&node); + } + } + + // CPUDevice should see only the 2 unassigned nodes (node1, node3). + // node0 (RuleA/DeviceA) and node2 (RuleB/DeviceB) are excluded. + EXPECT_EQ(filtered_for_cpu.size(), 2u); + + bool found[4] = {}; + for (const auto* n : filtered_for_cpu) { + for (size_t i = 0; i < std::size(found); ++i) { + if (n->Index() == h.node_indices[i]) found[i] = true; + } + } + EXPECT_FALSE(found[0]) << "node0 assigned to DeviceA should be excluded"; + EXPECT_TRUE(found[1]) << "node1 unassigned should be included"; + EXPECT_FALSE(found[2]) << "node2 assigned to DeviceB should be excluded"; + EXPECT_TRUE(found[3]) << "node3 unassigned should be included"; +} +TEST(LayeringIndexPartitionerTest, MultipleRulesForSameEp) { + // An EP can have multiple rules assigned to it. All nodes matching any of its + // rules should be visible to it, while nodes matching other EP rules should not. 
+ + auto h = SimpleGraphHelper::Create(4); + auto* node0 = h.graph->GetNode(h.node_indices[0]); + auto* node1 = h.graph->GetNode(h.node_indices[1]); + auto* node2 = h.graph->GetNode(h.node_indices[2]); + + node0->SetLayeringAnnotation("RuleA1"); + node1->SetLayeringAnnotation("RuleA2"); + node2->SetLayeringAnnotation("RuleB"); + // node3 unannotated + ASSERT_STATUS_OK(h.graph->Resolve()); + + // DeviceA has two rules: RuleA1 (index 0) and RuleA2 (index 1) + // DeviceB has one rule: RuleB (index 2) + LayeringRules rules; + rules.rules.push_back({"DeviceA", "RuleA1", false}); // Index 0 + rules.rules.push_back({"DeviceA", "RuleA2", false}); // Index 1 + rules.rules.push_back({"DeviceB", "RuleB", false}); // Index 2 + + LayeringIndex::EpNameToLayeringIndices ep_map; + ep_map["DeviceA"].insert(0); + ep_map["DeviceA"].insert(1); + ep_map["DeviceB"].insert(2); + + LayeringIndex::LayeringIndexToEpName rule_map; + rule_map[0] = "DeviceA"; + rule_map[1] = "DeviceA"; + rule_map[2] = "DeviceB"; + + auto index = LayeringIndex::Create(*h.graph, std::move(ep_map), std::move(rule_map), std::move(rules)); + + auto rules_a = index.GetLayeringRulesForThisEp("DeviceA"); + ASSERT_TRUE(rules_a.has_value()); + EXPECT_EQ(rules_a->get().size(), 2u); // Both rule indices 0 and 1 + + // Simulate filtering for DeviceA + InlinedVector filtered_for_a; + for (auto& node : h.graph->Nodes()) { + auto rule_idx_opt = index.GetNodeAssignment(*h.graph, node.Index()); + bool include = true; + if (rule_idx_opt) { + if (rules_a->get().count(*rule_idx_opt) == 0) { + include = false; + } + } + if (include) { + filtered_for_a.push_back(&node); + } + } + + // DeviceA should see node0, node1 (both its rules), and node3 (unassigned) = 3 nodes + // node2 (RuleB/DeviceB) should be excluded + EXPECT_EQ(filtered_for_a.size(), 3u); + + bool found[4] = {}; + for (const auto* n : filtered_for_a) { + for (int i = 0; i < 4; ++i) { + if (n->Index() == h.node_indices[i]) found[i] = true; + } + } + EXPECT_TRUE(found[0]); 
// node0 - RuleA1 + EXPECT_TRUE(found[1]); // node1 - RuleA2 + EXPECT_FALSE(found[2]); // node2 - RuleB (excluded) + EXPECT_TRUE(found[3]); // node3 - unassigned +} + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) \ No newline at end of file diff --git a/onnxruntime/test/framework/resource_accountant_test.cc b/onnxruntime/test/framework/resource_accountant_test.cc new file mode 100644 index 0000000000000..a102fe4e7770b --- /dev/null +++ b/onnxruntime/test/framework/resource_accountant_test.cc @@ -0,0 +1,327 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/resource_accountant.h" +#include "core/graph/indexed_sub_graph.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" + +#include "gtest/gtest.h" + +#include "test/util/include/asserts.h" +#include "test/util/include/test_environment.h" + +namespace onnxruntime { +namespace test { + +// Test accountant mimicking SizeBasedStatsAccountant ad-hoc path: +// Uses pending/committed weight sets so that: +// - Within a GetCapability pass, shared weights are deduped +// - Across passes, only committed weights persist and pending are discarded +class TestDedupAccountant : public IResourceAccountant { + public: + TestDedupAccountant() = default; + + ResourceCount GetConsumedAmount() const override { + return consumed_; + } + + void AddConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_ += std::get(amount); + } + } + + void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_ -= std::get(amount); + } + } + + ResourceCount ComputeResourceCount(const Node& node) override { + const auto* graph = node.GetContainingGraph(); + if (graph == nullptr) { + return static_cast(0); + } + + size_t total = 0; + for (const auto* input_def 
: node.InputDefs()) { + if (!input_def->Exists()) { + continue; + } + const auto& name = input_def->Name(); + constexpr bool check_outer_scope = true; + const auto* init = graph->GetInitializer(name, check_outer_scope); + if (init != nullptr) { + if (committed_weights_.count(name) > 0) { + continue; + } + if (pending_weights_.count(name) > 0) { + continue; + } + auto it = weight_sizes_.find(name); + if (it != weight_sizes_.end()) { + total += it->second; + } + pending_weights_.insert(name); + pending_weights_by_node_[node.Index()].insert(name); + } + } + return total; + } + + void ResetPendingWeights() override { + pending_weights_.clear(); + pending_weights_by_node_.clear(); + } + + void CommitWeightsForNode(NodeIndex node_index) override { + auto it = pending_weights_by_node_.find(node_index); + if (it != pending_weights_by_node_.end()) { + for (const auto& name : it->second) { + pending_weights_.erase(name); + } + committed_weights_.insert(it->second.begin(), it->second.end()); + pending_weights_by_node_.erase(it); + } + } + + void RegisterWeight(const std::string& name, size_t size) { + weight_sizes_[name] = size; + } + + size_t GetConsumedSizeT() const { return consumed_; } + + private: + size_t consumed_ = 0; + InlinedHashSet committed_weights_; + InlinedHashSet pending_weights_; + InlinedHashMap> pending_weights_by_node_; + InlinedHashMap weight_sizes_; +}; + +// Two Add nodes that share a single initializer weight_W. 
+struct SharedWeightGraph { + std::unique_ptr model; + Graph* graph = nullptr; + Node* node_a = nullptr; + Node* node_b = nullptr; + + static SharedWeightGraph Create() { + SharedWeightGraph h; + std::unordered_map dom; + dom[kOnnxDomain] = 12; + h.model = std::make_unique( + "test_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), dom, + std::vector(), + DefaultLoggingManager().DefaultLogger()); + h.graph = &h.model->MainGraph(); + + ONNX_NAMESPACE::TypeProto ft; + ft.mutable_tensor_type()->set_elem_type( + ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + ft.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(250); + + ONNX_NAMESPACE::TensorProto wp; + wp.set_name("weight_W"); + wp.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + wp.add_dims(250); + for (int i = 0; i < 250; ++i) { + wp.add_float_data(0.0f); + } + h.graph->AddInitializedTensor(wp); + + auto* ia = &h.graph->GetOrCreateNodeArg("input_a", &ft); + auto* ib = &h.graph->GetOrCreateNodeArg("input_b", &ft); + auto* wa = &h.graph->GetOrCreateNodeArg("weight_W", &ft); + auto* oa = &h.graph->GetOrCreateNodeArg("out_a", &ft); + auto* ob = &h.graph->GetOrCreateNodeArg("out_b", &ft); + + h.node_a = &h.graph->AddNode("node_A", "Add", "A", {ia, wa}, {oa}); + h.node_b = &h.graph->AddNode("node_B", "Add", "B", {ib, wa}, {ob}); + + auto status = h.graph->Resolve(); + ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); + return h; + } +}; + +// Regression: AccountForAllNodes sums pre-stored per-node costs +// that already have correct within-pass weight deduplication. 
+TEST(ResourceAccountantTest, AccountForAllNodes_CorrectlyUsesPreStoredCosts) { + auto h = SharedWeightGraph::Create(); + TestDedupAccountant accountant; + accountant.RegisterWeight("weight_W", 1000); + + IndexedSubGraph sub_graph; + sub_graph.nodes.push_back(h.node_a->Index()); + sub_graph.nodes.push_back(h.node_b->Index()); + sub_graph.SetAccountant(&accountant); + + auto cost_a = accountant.ComputeResourceCount(*h.node_a); + sub_graph.AppendNodeCost(cost_a); + EXPECT_EQ(std::get(cost_a), size_t{1000}); + + auto cost_b = accountant.ComputeResourceCount(*h.node_b); + sub_graph.AppendNodeCost(cost_b); + EXPECT_EQ(std::get(cost_b), size_t{0}); + + ASSERT_TRUE(sub_graph.IsAccountingEnabled()); + sub_graph.AccountForAllNodes(); + + EXPECT_EQ(accountant.GetConsumedSizeT(), size_t{1000}) + << "AccountForAllNodes should sum pre-stored costs (1000 + 0)"; +} + +// Verifies that ResetPendingWeights + re-probe produces correct results. +// After probing (which only writes to pending), resetting pending and +// re-probing should see the full weight cost again since nothing was committed. 
+TEST(ResourceAccountantTest, ComputeAndAccountForNode_CorrectAfterReset) { + auto h = SharedWeightGraph::Create(); + TestDedupAccountant accountant; + accountant.RegisterWeight("weight_W", 1000); + + // Probing pass populates pending weights + auto cost_a = accountant.ComputeResourceCount(*h.node_a); + EXPECT_EQ(std::get(cost_a), size_t{1000}); + auto cost_b = accountant.ComputeResourceCount(*h.node_b); + EXPECT_EQ(std::get(cost_b), size_t{0}); + + // Discard the pass (simulating capabilities.clear() before second GetCapability) + accountant.ResetPendingWeights(); + + // Re-probe: weight_W was never committed, so it should be counted again + IndexedSubGraph sub_graph; + sub_graph.nodes.push_back(h.node_a->Index()); + sub_graph.SetAccountant(&accountant); + auto recomputed_cost = accountant.ComputeResourceCount(*h.node_a); + sub_graph.AccountForNode(h.node_a->Index(), recomputed_cost); + + EXPECT_EQ(accountant.GetConsumedSizeT(), size_t{1000}) + << "After ResetPendingWeights, re-probe should see full weight cost"; +} + +// Each node has a unique initializer. AccountForAllNodes sums both. 
+TEST(ResourceAccountantTest, AccountForAllNodes_NoSharedWeights) { + std::unordered_map dom; + dom[kOnnxDomain] = 12; + Model model("test_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), dom, + std::vector(), + DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto ft; + ft.mutable_tensor_type()->set_elem_type( + ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + ft.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(100); + + const char* names[] = {"weight_1", "weight_2"}; + for (const char* wn : names) { + ONNX_NAMESPACE::TensorProto tp; + tp.set_name(wn); + tp.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + tp.add_dims(100); + for (int i = 0; i < 100; ++i) { + tp.add_float_data(0.0f); + } + graph.AddInitializedTensor(tp); + } + + auto* input = &graph.GetOrCreateNodeArg("input", &ft); + auto* w1 = &graph.GetOrCreateNodeArg("weight_1", &ft); + auto* w2 = &graph.GetOrCreateNodeArg("weight_2", &ft); + auto* out1 = &graph.GetOrCreateNodeArg("out1", &ft); + auto* out2 = &graph.GetOrCreateNodeArg("out2", &ft); + + auto& node1 = graph.AddNode("n1", "Add", "", {input, w1}, {out1}); + auto& node2 = graph.AddNode("n2", "Add", "", {out1, w2}, {out2}); + ASSERT_STATUS_OK(graph.Resolve()); + + TestDedupAccountant accountant; + accountant.RegisterWeight("weight_1", 400); + accountant.RegisterWeight("weight_2", 600); + + IndexedSubGraph sub_graph; + sub_graph.nodes.push_back(node1.Index()); + sub_graph.nodes.push_back(node2.Index()); + sub_graph.SetAccountant(&accountant); + + sub_graph.AppendNodeCost(accountant.ComputeResourceCount(node1)); + sub_graph.AppendNodeCost(accountant.ComputeResourceCount(node2)); + + ASSERT_TRUE(sub_graph.IsAccountingEnabled()); + sub_graph.AccountForAllNodes(); + + EXPECT_EQ(accountant.GetConsumedSizeT(), size_t{1000}) + << "No shared weights: should sum all costs (400 + 600)"; +} + +// AccountForNode per-node and AccountForAllNodes bulk 
produce same result. +TEST(ResourceAccountantTest, AccountForNode_MatchesAccountForAllNodes) { + auto h = SharedWeightGraph::Create(); + + // Per-node path + TestDedupAccountant acc1; + acc1.RegisterWeight("weight_W", 1000); + IndexedSubGraph sub1; + sub1.nodes.push_back(h.node_a->Index()); + sub1.nodes.push_back(h.node_b->Index()); + sub1.SetAccountant(&acc1); + sub1.AppendNodeCost(acc1.ComputeResourceCount(*h.node_a)); + sub1.AppendNodeCost(acc1.ComputeResourceCount(*h.node_b)); + sub1.AccountForNode(0); + sub1.AccountForNode(1); + size_t per_node = acc1.GetConsumedSizeT(); + + // Bulk path + TestDedupAccountant acc2; + acc2.RegisterWeight("weight_W", 1000); + IndexedSubGraph sub2; + sub2.nodes.push_back(h.node_a->Index()); + sub2.nodes.push_back(h.node_b->Index()); + sub2.SetAccountant(&acc2); + sub2.AppendNodeCost(acc2.ComputeResourceCount(*h.node_a)); + sub2.AppendNodeCost(acc2.ComputeResourceCount(*h.node_b)); + sub2.AccountForAllNodes(); + size_t bulk = acc2.GetConsumedSizeT(); + + EXPECT_EQ(per_node, bulk) + << "Per-node and bulk should produce identical results"; + EXPECT_EQ(per_node, size_t{1000}); +} + +// Cross-subgraph dedup: EP1 commits node_A, EP2 probes node_B and +// correctly sees weight_W as already accounted. 
+TEST(ResourceAccountantTest, CrossSubGraph_DedupWorks) { + auto h = SharedWeightGraph::Create(); + TestDedupAccountant accountant; + accountant.RegisterWeight("weight_W", 1000); + + // EP1 probes and commits node_A + IndexedSubGraph sub1; + sub1.nodes.push_back(h.node_a->Index()); + sub1.SetAccountant(&accountant); + sub1.AppendNodeCost(accountant.ComputeResourceCount(*h.node_a)); + sub1.AccountForNode(0); + EXPECT_EQ(accountant.GetConsumedSizeT(), size_t{1000}); + + // EP2 probes node_B: weight_W already committed + auto cost_b = accountant.ComputeResourceCount(*h.node_b); + EXPECT_EQ(std::get(cost_b), size_t{0}) + << "weight_W was committed by EP1, should be deduped for EP2"; + + // EP2 commits node_B with cost 0 + IndexedSubGraph sub2; + sub2.nodes.push_back(h.node_b->Index()); + sub2.SetAccountant(&accountant); + sub2.AppendNodeCost(cost_b); + sub2.AccountForNode(0); + + EXPECT_EQ(accountant.GetConsumedSizeT(), size_t{1000}) + << "Total should still be 1000 - weight_W counted once across both"; +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index ed2b98e5280b5..656b0ef86289d 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include #include #include @@ -9,9 +10,11 @@ #include "core/framework/execution_providers.h" #include "core/framework/graph_partitioner.h" #include "core/framework/kernel_registry.h" +#include "core/framework/layering_annotations.h" #include "core/framework/op_kernel.h" #include "core/framework/bfc_arena.h" #include "core/framework/ep_context_options.h" +#include "core/framework/resource_accountant.h" #include "core/framework/session_state.h" #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" @@ -280,7 +283,7 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { graph, modified, execution_provider, std::move(cpu_allocator), debug_graph_fn); }, sess_options.config_options, - DefaultLoggingManager().DefaultLogger())); + DefaultLoggingManager().DefaultLogger(), nullptr /*layering_index*/)); ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm)); @@ -367,7 +370,8 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { cpu_allocator, debug_graph_fn); }, sess_options.config_options, - default_logger)); + default_logger, + nullptr /*layering_index*/)); EXPECT_STATUS_OK(session_state.FinalizeSessionState(model.ModelPath(), krm)); @@ -414,9 +418,50 @@ namespace { using ParitionVerifierFn = std::function; +// Collect unique node names from a graph and all its subgraphs +// using the same naming scheme as the resource accountant. +static void CollectNodeNames(const Graph& graph, std::vector& names) { + for (const auto& node : graph.Nodes()) { + names.push_back(IResourceAccountant::MakeUniqueNodeName(node)); + for (const auto& [_, subgraph] : node.GetAttributeNameToSubgraphMap()) { + CollectNodeNames(*subgraph, names); + } + } +} + +// Generates a node stats file dynamically from the current graph, +// assigning each node a fixed cost. Returns the total cost across +// all nodes so callers can choose a threshold relative to the actual total. 
+// This avoids relying on a pre-baked stats file whose node name hashes +// become stale when graph optimizers change node input/output names. +static void GenerateDynamicNodeStatsFile(const ORTCHAR_T* model_path, + const std::filesystem::path& output_path, + size_t& total_cost, + size_t cost_per_node = 1024) { + const auto& default_logger = DefaultLoggingManager().DefaultLogger(); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, default_logger)); + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + std::vector node_names; + CollectNodeNames(graph, node_names); + + std::ofstream ofs(output_path); + ASSERT_TRUE(ofs.is_open()); + ofs << "#name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations\n"; + for (const auto& name : node_names) { + ofs << name << "," << cost_per_node << ",0,0,0\n"; + } + ofs.close(); + + total_cost = node_names.size() * cost_per_node; +} + void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, const SessionOptions& sess_options, - const ParitionVerifierFn& verifier_fn) { + const ParitionVerifierFn& verifier_fn, + const std::string& layering_config = std::string()) { const auto& log_manager = DefaultLoggingManager(); log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); const auto& default_logger = log_manager.DefaultLogger(); @@ -431,9 +476,12 @@ void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); ExecutionProviders execution_providers; - auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); - tmp_cpu_execution_provider->SetLogger(&default_logger); - ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); + auto tmp_execution_provider = DefaultCudaExecutionProvider(); + tmp_execution_provider->SetLogger(&default_logger); + 
ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_execution_provider))); + tmp_execution_provider = DefaultCpuExecutionProvider(); + tmp_execution_provider->SetLogger(&default_logger); + ASSERT_STATUS_OK(execution_providers.Add(kCpuExecutionProvider, std::move(tmp_execution_provider))); KernelRegistryManager krm; ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); @@ -445,6 +493,16 @@ void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, SessionState session_state(model->MainGraph(), execution_providers, tp.get(), nullptr, dtm, edlm, default_logger, profiler, sess_options); + LayeringIndex* layering_index = nullptr; + std::optional layering_index_storage; + if (!layering_config.empty()) { + ASSERT_STATUS_OK(LayeringIndex::Create(graph, layering_config, {}, execution_providers, + default_logger, layering_index_storage)); + if (layering_index_storage.has_value()) { + layering_index = &layering_index_storage.value(); + } + } + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup auto graph_optimizer_registry = std::make_unique(&sess_options, execution_providers.Get(onnxruntime::kCpuExecutionProvider), @@ -455,7 +513,8 @@ void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, layout_transformation::DebugGraphFn debug_graph_fn; ASSERT_STATUS_OK( partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, - sess_options.config_options, default_logger, GraphPartitioner::Mode::kNormal, + sess_options.config_options, default_logger, layering_index, + GraphPartitioner::Mode::kNormal, epctx::ModelGenOptions{}, debug_graph_fn)); @@ -484,16 +543,28 @@ TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); - constexpr const char* limit_setting = 
"10000,tiny_gpt2_beamsearch_node_stats.txt"; + std::error_code ec; + const std::filesystem::path stats_path = + std::filesystem::temp_directory_path(ec) / "tiny_gpt2_beamsearch_dynamic_stats_large.txt"; + ASSERT_FALSE(ec) << "temp_directory_path failed: " << ec.message(); + + // Generate node stats dynamically so names always match the current graph + constexpr size_t cost_per_node = 1024; + size_t total_cost = 0; + GenerateDynamicNodeStatsFile(model_path, stats_path, total_cost, cost_per_node); + ASSERT_GT(total_cost, 0U); + + // Use a limit much larger than total cost so all nodes are assigned CUDA. + size_t large_limit_kb = (total_cost * 2) / 1024 + 1; + std::string limit_setting = std::to_string(large_limit_kb) + "," + stats_path.string(); - // Large limit, all nodes are still assigned SessionOptions sess_options; sess_options.enable_mem_pattern = false; sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; sess_options.use_deterministic_compute = false; sess_options.enable_mem_reuse = false; ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( - kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting.c_str())); LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { const auto& graph_nodes = graph.Nodes(); @@ -501,20 +572,36 @@ TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); } }); + + std::error_code remove_ec; + std::filesystem::remove(stats_path, remove_ec); } TEST(SessionStateTest, TestResourceAwarePartitioning_CPUOffloaded) { constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); - constexpr const char* limit_setting = "5000,tiny_gpt2_beamsearch_node_stats.txt"; + std::error_code ec; + const std::filesystem::path stats_path = + std::filesystem::temp_directory_path(ec) / 
"tiny_gpt2_beamsearch_dynamic_stats_offload.txt"; + ASSERT_FALSE(ec) << "temp_directory_path failed: " << ec.message(); + + // Generate node stats dynamically so names always match the current graph. + constexpr size_t cost_per_node = 1024; + size_t total_cost = 0; + GenerateDynamicNodeStatsFile(model_path, stats_path, total_cost, cost_per_node); + ASSERT_GT(total_cost, 0U); + + // Set threshold to half the total cost so some nodes must be offloaded to CPU. + size_t half_limit_kb = (total_cost / 2) / 1024; + ASSERT_GT(half_limit_kb, 0U); + std::string limit_setting = std::to_string(half_limit_kb) + "," + stats_path.string(); - // Large limit, all nodes are still assigned SessionOptions sess_options; sess_options.enable_mem_pattern = false; sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; sess_options.use_deterministic_compute = false; sess_options.enable_mem_reuse = false; ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( - kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting.c_str())); LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { const auto& graph_nodes = graph.Nodes(); @@ -527,6 +614,38 @@ TEST(SessionStateTest, TestResourceAwarePartitioning_CPUOffloaded) { } EXPECT_TRUE(cpu_node_found); }); + + std::error_code remove_ec; + std::filesystem::remove(stats_path, remove_ec); +} + +TEST(SessionStateTest, TestLayeringPartitioning) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/layering/tiny_gpt2_beamsearch_layering.onnx"); + constexpr const char* layering_setting = + "cpu(Embed,Decode);gpu(GptAttention0,GptAttention1,GptAttention2,GptAttention3,GptAttention4)"; + + // Set the session options for layering + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + 
sess_options.enable_mem_reuse = false; + ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( + kOrtSessionOptionsLayerAssignmentSettings, layering_setting)); + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + const std::string& name = node.Name(); + const bool expected_on_cpu = (name.find("EmbedLayer") == 0) || (name == "LayerNorm_10") || (name == "MatMul_1165"); + + const std::string& ep = node.GetExecutionProviderType(); + if (expected_on_cpu) { + EXPECT_EQ(ep, kCpuExecutionProvider) << "Node " << name << " expected on CPU but found on " << ep; + } else { + EXPECT_EQ(ep, kCudaExecutionProvider) << "Node " << name << " expected on CUDA but found on " << ep; + } + } }, layering_setting); } #endif // USE_CUDA @@ -909,9 +1028,8 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); std::vector float_data(1, 1); auto value = std::make_unique(); - Tensor::InitOrtValue(DataTypeImpl::GetType(), - TensorShape(std::vector{1}), reinterpret_cast(float_data.data()), - mem_info, *value); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + float_data.data(), mem_info, *value); ASSERT_STATUS_OK(sess_options.AddInitializer("node_0_input_1", value.get())); @@ -1379,6 +1497,5 @@ INSTANTIATE_TEST_SUITE_P(SessionStateTests, PrepackingTestParam{true, false}, PrepackingTestParam{true, true})); #endif - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 8c5859823ac16..572fb6992ec76 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -728,6 +728,64 @@ TEST_F(PathValidationTest, ValidateExternalDataPathEmptyModelPathWithSymlinkOuts EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("escapes 
working directory")); } +TEST(TensorProtoUtilsTest, GetNodeProtoLayeringAnnotation) { + // Case 1: Annotation exists + { + ONNX_NAMESPACE::NodeProto node_proto; + node_proto.set_name("test_node"); + auto* prop = node_proto.add_metadata_props(); + prop->set_key(utils::kNodeProtoLayerAnnotation); + prop->set_value("foo"); + + auto result = utils::GetNodeProtoLayeringAnnotation(node_proto); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), "foo"); + } + + // Case 2: Annotation missing (empty metadata_props) + { + ONNX_NAMESPACE::NodeProto node_proto; + node_proto.set_name("test_node"); + + auto result = utils::GetNodeProtoLayeringAnnotation(node_proto); + EXPECT_FALSE(result.has_value()); + } + + // Case 3: Other metadata exists, but not the annotation + { + ONNX_NAMESPACE::NodeProto node_proto; + node_proto.set_name("test_node"); + auto* prop = node_proto.add_metadata_props(); + prop->set_key("some_other_key"); + prop->set_value("some_value"); + + auto result = utils::GetNodeProtoLayeringAnnotation(node_proto); + EXPECT_FALSE(result.has_value()); + } + + // Case 4: Multiple metadata, including the annotation + { + ONNX_NAMESPACE::NodeProto node_proto; + node_proto.set_name("test_node"); + + auto* prop1 = node_proto.add_metadata_props(); + prop1->set_key("some_other_key"); + prop1->set_value("some_value"); + + auto* prop2 = node_proto.add_metadata_props(); + prop2->set_key(utils::kNodeProtoLayerAnnotation); + prop2->set_value("bar"); + + auto* prop3 = node_proto.add_metadata_props(); + prop3->set_key("yet_another_key"); + prop3->set_value("baz"); + + auto result = utils::GetNodeProtoLayeringAnnotation(node_proto); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), "bar"); + } +} + // Tests for ValidateEmbeddedTensorProtoDataSizeAndShape and embedded initializer size limits TEST(TensorProtoDataSizeShapeValidationTest, ValidTensorProtoWithRawData) { diff --git a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc 
b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc index 2fcf86d0447e5..5240c909d2878 100644 --- a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc +++ b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc @@ -263,5 +263,90 @@ TEST(MLOpTest, SVMClassifierLinear) { test.Run(); } +// 3 classes, 2 support vectors (1 each for first two classes), 4 features. +// Correctly sized attributes: +// coefficients: (class_count-1) * vector_count = 2*2 = 4 +// rho: class_count*(class_count-1)/2 = 3 +// prob_a/prob_b (if present): 3 + +TEST(MLOpTest, SVMClassifierUndersizedCoefficients) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {1.f, 1.f}; // needs 4, only 2 provided + std::vector support_vectors = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f}; + std::vector rho = {0.1f, 0.1f, 0.1f}; // correct size + std::vector kernel_params = {0.01f, 0.f, 3.f}; + std::vector classes = {0, 1, 2}; + std::vector vectors_per_class = {1, 1, 0}; + + test.AddAttribute("kernel_type", std::string("RBF")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 4}, {0.f, 0.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {1}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, "coefficients attribute size"); +} + +TEST(MLOpTest, SVMClassifierUndersizedRho) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {1.f, 1.f, 1.f, 1.f}; // correct size + std::vector support_vectors = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f}; + std::vector rho = {0.1f}; // needs 3, only 1 provided + std::vector kernel_params = {0.01f, 0.f, 3.f}; + std::vector classes = {0, 1, 2}; + std::vector 
vectors_per_class = {1, 1, 0}; + + test.AddAttribute("kernel_type", std::string("RBF")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 4}, {0.f, 0.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {1}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, "rho attribute size"); +} + +TEST(MLOpTest, SVMClassifierUndersizedProba) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {1.f, 1.f, 1.f, 1.f}; // correct size + std::vector support_vectors = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f}; + std::vector rho = {0.1f, 0.1f, 0.1f}; // correct size + std::vector proba = {0.5f}; // needs 3, only 1 provided + std::vector probb = {0.5f}; // needs 3, only 1 provided + std::vector kernel_params = {0.01f, 0.f, 3.f}; + std::vector classes = {0, 1, 2}; + std::vector vectors_per_class = {1, 1, 0}; + + test.AddAttribute("kernel_type", std::string("RBF")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("prob_a", proba); + test.AddAttribute("prob_b", probb); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 4}, {0.f, 0.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {1}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, "prob_a attribute size"); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc 
b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 17a6931eef99c..41280746c2a16 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -74,6 +74,34 @@ TEST(DequantizeLinearOpTest, Int4_LargeInitializerInput) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +// Regression test: int8 tensor whose byte size is not a multiple of 4. +// DML graph fusion rounds tensor sizes to a multiple of 4 via AlignToPow2. +// If the original buffer is not padded, the subsequent memcpy reads past the +// allocation boundary (heap-buffer-overflow detectable with ASan). +// Mirrors the WebNN PoC: dequantizeLinear with int8[135] (135 % 4 != 0). +TEST(DequantizeLinearOpTest, Int8_NonAlignedSize_Initializer) { + OpTester test("DequantizeLinear", 10); + constexpr int64_t kNumElements = 135; // 135 bytes, AlignToPow2(135,4)=136 + + std::vector x_data(kNumElements); + std::vector y_expected(kNumElements); + const float scale = 0.5f; + const int8_t zero_point = 0; + for (int64_t i = 0; i < kNumElements; ++i) { + x_data[i] = static_cast(i % 127); + y_expected[i] = (x_data[i] - zero_point) * scale; + } + + // Mark all inputs as initializers so they go through DML's ProcessInputData + // → UnpackInitializer → AlignToPow2 code path during graph fusion. 
+ test.AddInput("x", {kNumElements}, x_data, /*is_initializer=*/true); + test.AddInput("x_scale", {1}, {scale}, /*is_initializer=*/true); + test.AddInput("x_zero_point", {1}, {zero_point}, /*is_initializer=*/true); + test.AddOutput("y", {kNumElements}, y_expected); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + // scalar zero & scale with int4 TEST(DequantizeLinearOpTest, Int4) { OpTester test("DequantizeLinear", 21); @@ -516,6 +544,90 @@ TEST(QuantizeLinearOpTest, Int8) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +// Repro for new-delete-type-mismatch in DML EP during graph fusion. +// QuantizeLinear float32→int8 with 5D input triggers a type-size +// mismatch (192 bytes allocated, 1 byte deallocated) visible under ASan. +TEST(QuantizeLinearOpTest, Int8_5D_DML_TypeMismatch) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 13); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddInput("y_zero_point", {}, {0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Same as above but with per-axis quantization along axis 0 to exercise +// the DML graph fusion path with per-channel int8 quantization. 
+TEST(QuantizeLinearOpTest, Int8_5D_PerAxis_DML_TypeMismatch) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 13); + std::vector dims{6, 1, 1, 1, 1}; + test.AddAttribute("axis", 0); + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + test.AddInput("y_zero_point", {6}, {0, 0, 0, 0, 0, 0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Opset 21 QuantizeLinear float32→uint8 WITHOUT zero_point. +// Without zero_point, the output type defaults to uint8. +TEST(QuantizeLinearOpTest, Uint8_5D_NoZeroPoint_Opset21_DML) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 21); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {0.0f, 51.0f, 102.0f, 153.0f, 204.0f, 255.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddOutput("y", dims, {0, 51, 102, 153, 204, 255}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Opset 21 QuantizeLinear float32→int8 with zero_point (the customer's exact scenario). 
+TEST(QuantizeLinearOpTest, Int8_5D_WithZeroPoint_Opset21_DML) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 21); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddInput("y_zero_point", {}, {0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + // Test uint16 QuantizeLinear (per tensor) TEST(QuantizeLinearOpTest, Uint16) { OpTester test("QuantizeLinear", 21); @@ -1275,6 +1387,11 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size, SessionOptions so; std::vector log_msgs; // redirect error messages std::vector> eps; + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } + eps.push_back(DefaultCpuExecutionProvider()); so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { @@ -1320,6 +1437,12 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, SessionOptions so; std::vector log_msgs; // redirect error messages std::vector> eps; + if (!ep) { + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } + } eps.push_back(ep ? 
std::move(ep) : DefaultCpuExecutionProvider()); so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { @@ -1365,6 +1488,10 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(int64_t block_size, SessionOptions so; std::vector log_msgs; // redirect error messages std::vector> eps; + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } eps.push_back(DefaultCpuExecutionProvider()); so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { @@ -1553,7 +1680,14 @@ void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector&& dims, std::vector x_scale, y; std::vector x, x_zero_point; std::vector> eps; + if (!ep) { + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } + } eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider()); + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; bool use_zero_point = !x_zero_point_.empty(); @@ -1597,6 +1731,10 @@ void DequantizeLinearOp21BlockedTest_Int_Succeed(std::vector&& dims, std::vector x_scale, y; std::vector x, x_zero_point; std::vector> eps; + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } eps.push_back(DefaultCpuExecutionProvider()); int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; @@ -1633,6 +1771,10 @@ void DequantizeLinearOp21BlockedTest_Float8_Succeed(std::vector&& dims, std::vector x_scale, y; std::vector x, x_zero_point; std::vector> eps; + auto webgpu_ep = DefaultWebGpuExecutionProvider(); + if (webgpu_ep) { + eps.push_back(std::move(webgpu_ep)); + } eps.push_back(DefaultCpuExecutionProvider()); int64_t non_neg_axis = axis < 0 ? 
axis + dims.size() : axis; diff --git a/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc new file mode 100644 index 0000000000000..8041f0dae8c28 --- /dev/null +++ b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifdef USE_DML + +#include "gtest/gtest.h" + +#include +#include +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" + +#include + +namespace onnxruntime { +namespace test { + +// A trivial COM interface for testing. +MIDL_INTERFACE("A1B2C3D4-E5F6-7890-ABCD-EF1234567890") +ITestInterface : public IUnknown { + virtual int STDMETHODCALLTYPE GetValue() = 0; +}; + +// A RuntimeClass whose constructor succeeds and stores a value. +class SucceedingClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + int value; + + SucceedingClass(int v) : value(v) {} + + int STDMETHODCALLTYPE GetValue() override { return value; } +}; + +// A RuntimeClass that tracks whether its destructor ran. +class TrackedClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + bool& destroyed; + + TrackedClass(bool& flag) : destroyed(flag) { destroyed = false; } + ~TrackedClass() { destroyed = true; } + + int STDMETHODCALLTYPE GetValue() override { return 42; } +}; + +// A RuntimeClass whose constructor always throws. +// Uses a ref-counted witness to verify cleanup: the witness is destroyed +// (via Release) during stack unwinding if memory is freed correctly. 
+class ThrowingClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + Microsoft::WRL::ComPtr witness; + + ThrowingClass(bool& witness_destroyed) { + // Create a witness that will be destroyed when this object's members + // are cleaned up during stack unwinding. + witness = Dml::SafeMakeOrThrow(witness_destroyed); + throw std::runtime_error("intentional throw"); + } + + int STDMETHODCALLTYPE GetValue() override { return -1; } +}; + +// Verify that SafeMakeOrThrow creates an object with ref count 1, +// and that the object is properly released when the ComPtr goes out of scope. +TEST(SafeMakeOrThrowTest, SuccessPath_RefCountIsOne) { + Microsoft::WRL::ComPtr obj = Dml::SafeMakeOrThrow(123); + + ASSERT_NE(obj.Get(), nullptr); + EXPECT_EQ(obj->GetValue(), 123); + + // AddRef/Release to observe ref count: AddRef returns new count. + unsigned long refAfterAdd = obj->AddRef(); + EXPECT_EQ(refAfterAdd, 2u); + + unsigned long refAfterRelease = obj->Release(); + EXPECT_EQ(refAfterRelease, 1u); +} + +// Verify that the object is destroyed when the last ComPtr releases it. +TEST(SafeMakeOrThrowTest, SuccessPath_DestructorRunsOnRelease) { + bool destroyed = false; + { + auto obj = Dml::SafeMakeOrThrow(destroyed); + EXPECT_FALSE(destroyed); + } + // ComPtr went out of scope — destructor should have run. + EXPECT_TRUE(destroyed); +} + +// Verify that copying the ComPtr increments the ref count and +// the object survives until the last reference is released. +TEST(SafeMakeOrThrowTest, SuccessPath_MultipleReferences) { + bool destroyed = false; + Microsoft::WRL::ComPtr copy; + { + auto obj = Dml::SafeMakeOrThrow(destroyed); + copy = obj; + EXPECT_FALSE(destroyed); + } + // Original ComPtr gone, but copy still holds a reference. + EXPECT_FALSE(destroyed); + + copy.Reset(); + EXPECT_TRUE(destroyed); +} + +// Verify that when the constructor throws, the exception propagates +// and sub-objects are properly cleaned up (no leak). 
+TEST(SafeMakeOrThrowTest, FailurePath_ConstructorThrows) { + bool witness_destroyed = false; + EXPECT_THROW( + Dml::SafeMakeOrThrow(witness_destroyed), + std::runtime_error); + // The witness ComPtr member was constructed before the throw. + // If cleanup worked correctly, the witness should have been destroyed + // when the ThrowingClass sub-objects were unwound. + EXPECT_TRUE(witness_destroyed); +} + +// Verify that QI works correctly on a SafeMakeOrThrow-created object. +TEST(SafeMakeOrThrowTest, SuccessPath_QueryInterface) { + auto obj = Dml::SafeMakeOrThrow(42); + + Microsoft::WRL::ComPtr unk; + HRESULT hr = obj.As(&unk); + EXPECT_EQ(hr, S_OK); + EXPECT_NE(unk.Get(), nullptr); + + Microsoft::WRL::ComPtr iface; + hr = unk.As(&iface); + EXPECT_EQ(hr, S_OK); + EXPECT_EQ(iface->GetValue(), 42); +} + +} // namespace test +} // namespace onnxruntime + +#endif // USE_DML diff --git a/onnxruntime/test/python/transformers/conformer_model_generator.py b/onnxruntime/test/python/transformers/conformer_model_generator.py index 4e76478bfb649..d067c484b2edd 100644 --- a/onnxruntime/test/python/transformers/conformer_model_generator.py +++ b/onnxruntime/test/python/transformers/conformer_model_generator.py @@ -10,6 +10,11 @@ from bert_model_generator import float_tensor from onnx import TensorProto, helper, numpy_helper +# Minimum non-zero value used for the QK attention bias initializer in test models. +# A zero bias would be eliminated by ORT's basic constant folding (it removes Add(x, 0) +# as a no-op), breaking the fusion patterns that expect an Add node before Softmax. 
+_NON_ZERO_QK_BIAS = 1e-4 + # Adapted from bert_model_generator.py def get_tensor_and_weight(name: str, shape: list[int], random=False, zeros=False): @@ -530,6 +535,422 @@ def create_conformer_attention( return helper.make_model(graph, opset_imports=(opsetid,)) +def create_conformer_attention_simple_bias( + hidden_size=64, + num_heads=4, + epsilon=0.000009999999747378752, +): + """ + Standard conformer attention where the QK add_bias is a plain initializer (no positional + embedding computation). The extra_q_nodes match_parent_path will return None for both the + conformer-transducer and Nemotron patterns, so fusion proceeds with extra_q_nodes=None. + + This is a regression test to verify that the fix restoring optional extra_q_nodes semantics + works correctly: graphs that never had an auxiliary Q branch must still fuse. + + Q path: MatMul -> Add(bias, matmul_out) -> Reshape -> Transpose([0,2,1,3]) -> Div -> matmul_qk + K path: MatMul -> Add(matmul_out, bias) -> Reshape -> Transpose([0,2,3,1]) -> matmul_qk + V path: MatMul -> Add(matmul_out, bias) -> Reshape -> Transpose([0,2,1,3]) -> matmul_qkv + QK: MatMul -> Add(qk_out, qk_bias_init) -> Softmax -> MatMul + Output: Transpose -> Reshape -> MatMul -> Add(bias, matmul) -> SkipLayerNorm + """ + assert hidden_size % num_heads == 0 + head_size = hidden_size // num_heads + + inputs = [ + helper.make_tensor_value_info("input_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("input_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + outputs = [ + helper.make_tensor_value_info("output_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("output_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + nodes = [] + + # SkipLayerNorm + nodes.append( + helper.make_node( + "SkipLayerNormalization", + ["input_0", "input_1", "ln_weight", "ln_bias"], + ["ln_out", "", "", "ln_skip_out"], + "skiplayernorm", + 
domain="com.microsoft", + epsilon=epsilon, + ) + ) + + # Q path: MatMul -> Add(bias[0], matmul[1]) -> Reshape -> Transpose -> Div + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "q_weight"], ["q_matmul_out"], "q_matmul"), + helper.make_node("Add", ["q_bias", "q_matmul_out"], ["q_add_out"], "q_add"), + helper.make_node("Reshape", ["q_add_out", "qkv_reshape_shape"], ["q_4d_bsnh"], "q_reshape"), + helper.make_node("Transpose", ["q_4d_bsnh"], ["q_4d_bnsh"], "q_transpose", perm=[0, 2, 1, 3]), + helper.make_node("Div", ["q_4d_bnsh", "q_scale"], ["q_scaled"], "q_div"), + ] + ) + + # K path: MatMul -> Add(matmul[0], bias[1]) -> Reshape -> Transpose (single, for K^T) + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "k_weight"], ["k_matmul_out"], "k_matmul"), + helper.make_node("Add", ["k_matmul_out", "k_bias"], ["k_add_out"], "k_add"), + helper.make_node("Reshape", ["k_add_out", "qkv_reshape_shape"], ["k_4d_bsnh"], "k_reshape"), + # perm=[0,2,3,1]: [B,S,H,D] -> [B,H,D,S] giving K^T for attention dot product + helper.make_node("Transpose", ["k_4d_bsnh"], ["k_transposed"], "k_transpose", perm=[0, 2, 3, 1]), + ] + ) + + # V path: MatMul -> Add(matmul[0], bias[1]) -> Reshape -> Transpose (BNSH) + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "v_weight"], ["v_matmul_out"], "v_matmul"), + helper.make_node("Add", ["v_matmul_out", "v_bias"], ["v_add_out"], "v_add"), + helper.make_node("Reshape", ["v_add_out", "qkv_reshape_shape"], ["v_4d_bsnh"], "v_reshape"), + helper.make_node("Transpose", ["v_4d_bsnh"], ["v_4d_bnsh"], "v_transpose", perm=[0, 2, 1, 3]), + ] + ) + + # QK: MatMul -> Add(qk_out, simple_bias_init) -> Softmax -> MatMul + # qk_bias is a plain initializer, so extra_q_nodes will be None. 
+ nodes.extend( + [ + helper.make_node("MatMul", ["q_scaled", "k_transposed"], ["qk_out"], "matmul_qk"), + helper.make_node("Add", ["qk_out", "qk_bias"], ["qk_add_out"], "add_qk"), + helper.make_node("Softmax", ["qk_add_out"], ["softmax_out"], "softmax_qk", axis=3), + helper.make_node("MatMul", ["softmax_out", "v_4d_bnsh"], ["qkv_bnsh"], "matmul_qkv"), + ] + ) + + # Output: Transpose -> Reshape -> MatMul -> Add -> SkipLayerNorm + nodes.extend( + [ + helper.make_node("Transpose", ["qkv_bnsh"], ["qkv_bsnh"], "qkv_transpose", perm=[0, 2, 1, 3]), + helper.make_node("Reshape", ["qkv_bsnh", "out_reshape_shape"], ["attn_out"], "out_reshape"), + helper.make_node("MatMul", ["attn_out", "out_weight"], ["out_matmul"], "out_matmul"), + helper.make_node("Add", ["out_bias", "out_matmul"], ["out_add"], "out_add"), + helper.make_node( + "SkipLayerNormalization", + ["ln_skip_out", "out_add", "ln_weight", "ln_bias"], + ["output_0", "", "", "output_1"], + "next_skiplayernorm", + domain="com.microsoft", + epsilon=epsilon, + ), + ] + ) + + q_weight, _ = get_tensor_and_weight("q_weight", [hidden_size, hidden_size]) + k_weight, _ = get_tensor_and_weight("k_weight", [hidden_size, hidden_size]) + v_weight, _ = get_tensor_and_weight("v_weight", [hidden_size, hidden_size]) + + initializers = [ + float_tensor("ln_weight", [hidden_size]), + float_tensor("ln_bias", [hidden_size]), + float_tensor("out_weight", [hidden_size, hidden_size]), + float_tensor("out_bias", [hidden_size]), + q_weight, + k_weight, + v_weight, + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="q_bias"), + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="k_bias"), + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="v_bias"), + # QK bias: a simple non-zero initializer so extra_q_nodes won't match any positional-embed pattern. + # Non-zero so ORT's constant folding (which removes Add(x, 0)) doesn't eliminate this node. 
+ numpy_helper.from_array(np.array([_NON_ZERO_QK_BIAS], dtype="float32"), name="qk_bias"), + numpy_helper.from_array(np.array(1.0 / np.sqrt(head_size), dtype="float32"), name="q_scale"), + # Reshape shape [0, 0, num_heads, head_size] for Q/K/V + numpy_helper.from_array(np.array([0, 0, num_heads, head_size], dtype="int64"), name="qkv_reshape_shape"), + # Reshape shape [0, 0, hidden_size] for output + numpy_helper.from_array(np.array([0, 0, hidden_size], dtype="int64"), name="out_reshape_shape"), + ] + + graph = helper.make_graph( + nodes, "conformer_simple_bias_graph", inputs, outputs, initializers, doc_string="conformer" + ) + opsetid = helper.make_opsetid("ai.onnx", min(onnx.defs.onnx_opset_version(), 16)) + return helper.make_model(graph, opset_imports=(opsetid,)) + + +def create_conformer_attention_no_add_kv( + hidden_size=64, + num_heads=4, + epsilon=0.000009999999747378752, +): + """ + Nemotron-like conformer attention model with no Add-bias nodes in the K and V paths, + and a Q path that begins with Transpose→Add→Reshape→MatMul (no leading Div/Mul). + The QKV output path also omits the trailing Add before the SkipLayerNorm. 
+ + This exercises the following new fallback patterns: + - QKV output: ["MatMul", "Reshape", "Transpose", "MatMul"] with [1, 0, 0, 0] + - Q path: ["Transpose", "Add", "Reshape", "MatMul"] with [0, 0, 0, 0] + - K path: ["Transpose", "Reshape", "MatMul"] with [1, 0, 0] + - V path: ["Transpose", "Reshape", "MatMul"] with [1, 0, 0] + """ + assert hidden_size % num_heads == 0 + head_size = hidden_size // num_heads + + inputs = [ + helper.make_tensor_value_info("input_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("input_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + outputs = [ + helper.make_tensor_value_info("output_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("output_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + nodes = [] + + # SkipLayerNorm + nodes.append( + helper.make_node( + "SkipLayerNormalization", + ["input_0", "input_1", "ln_weight", "ln_bias"], + ["ln_out", "", "", "ln_skip_out"], + "skiplayernorm", + domain="com.microsoft", + epsilon=epsilon, + ) + ) + + # Q path: MatMul -> Reshape -> Add(reshape[0], bias[1]) -> Transpose -> matmul_qk + # Matches: ["Transpose", "Add", "Reshape", "MatMul"] with [0, 0, 0, 0] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "q_weight"], ["q_matmul_out"], "q_matmul"), + helper.make_node("Reshape", ["q_matmul_out", "qkv_reshape_shape"], ["q_4d_bsnh"], "q_reshape"), + helper.make_node("Add", ["q_4d_bsnh", "q_bias_4d"], ["q_4d_biased"], "q_add"), + helper.make_node("Transpose", ["q_4d_biased"], ["q_4d_bnsh"], "q_transpose", perm=[0, 2, 1, 3]), + ] + ) + + # K path: MatMul -> Reshape -> Transpose (no Add) + # Matches: ["Transpose", "Reshape", "MatMul"] with [1, 0, 0] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "k_weight"], ["k_matmul_out"], "k_matmul"), + helper.make_node("Reshape", ["k_matmul_out", "qkv_reshape_shape"], ["k_4d_bsnh"], "k_reshape"), + # 
perm=[0,2,3,1]: [B,S,H,D] -> [B,H,D,S] for K^T + helper.make_node("Transpose", ["k_4d_bsnh"], ["k_transposed"], "k_transpose", perm=[0, 2, 3, 1]), + ] + ) + + # V path: MatMul -> Reshape -> Transpose (no Add) + # Matches: ["Transpose", "Reshape", "MatMul"] with [1, 0, 0] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "v_weight"], ["v_matmul_out"], "v_matmul"), + helper.make_node("Reshape", ["v_matmul_out", "qkv_reshape_shape"], ["v_4d_bsnh"], "v_reshape"), + helper.make_node("Transpose", ["v_4d_bsnh"], ["v_4d_bnsh"], "v_transpose", perm=[0, 2, 1, 3]), + ] + ) + + # QK: MatMul -> Add(qk_out, bias) -> Softmax -> MatMul + nodes.extend( + [ + helper.make_node("MatMul", ["q_4d_bnsh", "k_transposed"], ["qk_out"], "matmul_qk"), + helper.make_node("Add", ["qk_out", "qk_bias"], ["qk_add_out"], "add_qk"), + helper.make_node("Softmax", ["qk_add_out"], ["softmax_out"], "softmax_qk", axis=3), + helper.make_node("MatMul", ["softmax_out", "v_4d_bnsh"], ["qkv_bnsh"], "matmul_qkv"), + ] + ) + + # Output: Transpose -> Reshape -> MatMul (no trailing Add before SkipLayerNorm) + # Matches QKV path: ["MatMul", "Reshape", "Transpose", "MatMul"] with [1, 0, 0, 0] + nodes.extend( + [ + helper.make_node("Transpose", ["qkv_bnsh"], ["qkv_bsnh"], "qkv_transpose", perm=[0, 2, 1, 3]), + helper.make_node("Reshape", ["qkv_bsnh", "out_reshape_shape"], ["attn_out"], "out_reshape"), + helper.make_node("MatMul", ["attn_out", "out_weight"], ["out_matmul"], "out_matmul"), + helper.make_node( + "SkipLayerNormalization", + ["ln_skip_out", "out_matmul", "ln_weight", "ln_bias"], + ["output_0", "", "", "output_1"], + "next_skiplayernorm", + domain="com.microsoft", + epsilon=epsilon, + ), + ] + ) + + q_weight, _ = get_tensor_and_weight("q_weight", [hidden_size, hidden_size]) + k_weight, _ = get_tensor_and_weight("k_weight", [hidden_size, hidden_size]) + v_weight, _ = get_tensor_and_weight("v_weight", [hidden_size, hidden_size]) + + initializers = [ + float_tensor("ln_weight", [hidden_size]), + 
float_tensor("ln_bias", [hidden_size]), + float_tensor("out_weight", [hidden_size, hidden_size]), + q_weight, + k_weight, + v_weight, + # Q bias in 4D shape [1, 1, num_heads, head_size] for broadcasting after Reshape + numpy_helper.from_array(np.ones([1, 1, num_heads, head_size], dtype="float32"), name="q_bias_4d"), + # Non-zero qk_bias so ORT's constant folding (which removes Add(x, 0)) doesn't eliminate this node. + numpy_helper.from_array(np.array([_NON_ZERO_QK_BIAS], dtype="float32"), name="qk_bias"), + numpy_helper.from_array(np.array([0, 0, num_heads, head_size], dtype="int64"), name="qkv_reshape_shape"), + numpy_helper.from_array(np.array([0, 0, hidden_size], dtype="int64"), name="out_reshape_shape"), + ] + + graph = helper.make_graph(nodes, "conformer_no_add_kv_graph", inputs, outputs, initializers, doc_string="conformer") + opsetid = helper.make_opsetid("ai.onnx", min(onnx.defs.onnx_opset_version(), 16)) + return helper.make_model(graph, opset_imports=(opsetid,)) + + +def create_conformer_attention_qk_div_masking( + hidden_size=64, + num_heads=4, + epsilon=0.000009999999747378752, +): + """ + Conformer attention with QK masking using Where→Softmax→Where→Div→Add→MatMul. 
+ + This exercises the new QK path: + ["Where", "Softmax", "Where", "Div", "Add", "MatMul"] with [0, 2, 0, 2, 0, 0] + + The graph structure for the masked QK computation is: + MatMul(Q,K^T) → Add(qk_bias) → Div(scale) → inner_Where → Softmax → outer_Where → MatMul(V) + """ + assert hidden_size % num_heads == 0 + head_size = hidden_size // num_heads + + inputs = [ + helper.make_tensor_value_info("input_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("input_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + outputs = [ + helper.make_tensor_value_info("output_0", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + helper.make_tensor_value_info("output_1", TensorProto.FLOAT, ["batch_size", "seq_len", hidden_size]), + ] + nodes = [] + + # SkipLayerNorm + nodes.append( + helper.make_node( + "SkipLayerNormalization", + ["input_0", "input_1", "ln_weight", "ln_bias"], + ["ln_out", "", "", "ln_skip_out"], + "skiplayernorm", + domain="com.microsoft", + epsilon=epsilon, + ) + ) + + # Q path: MatMul -> Add(bias, matmul_out) -> Reshape -> Transpose -> Div + # Matches: ["Div", "Transpose", "Reshape", "Add", "MatMul"] with [0, 0, 0, 0, 1] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "q_weight"], ["q_matmul_out"], "q_matmul"), + helper.make_node("Add", ["q_bias", "q_matmul_out"], ["q_add_out"], "q_add"), + helper.make_node("Reshape", ["q_add_out", "qkv_reshape_shape"], ["q_4d_bsnh"], "q_reshape"), + helper.make_node("Transpose", ["q_4d_bsnh"], ["q_4d_bnsh"], "q_transpose", perm=[0, 2, 1, 3]), + helper.make_node("Div", ["q_4d_bnsh", "q_scale"], ["q_scaled"], "q_div"), + ] + ) + + # K path: MatMul -> Add(matmul_out, bias) -> Reshape -> Transpose + # Matches: ["Transpose", "Reshape", "Add", "MatMul"] with [1, 0, 0, 0] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "k_weight"], ["k_matmul_out"], "k_matmul"), + helper.make_node("Add", ["k_matmul_out", "k_bias"], ["k_add_out"], 
"k_add"), + helper.make_node("Reshape", ["k_add_out", "qkv_reshape_shape"], ["k_4d_bsnh"], "k_reshape"), + helper.make_node("Transpose", ["k_4d_bsnh"], ["k_transposed"], "k_transpose", perm=[0, 2, 3, 1]), + ] + ) + + # V path: MatMul -> Add(matmul_out, bias) -> Reshape -> Transpose + # Matches: ["Transpose", "Reshape", "Add", "MatMul"] with [1, 0, 0, 0] + nodes.extend( + [ + helper.make_node("MatMul", ["ln_out", "v_weight"], ["v_matmul_out"], "v_matmul"), + helper.make_node("Add", ["v_matmul_out", "v_bias"], ["v_add_out"], "v_add"), + helper.make_node("Reshape", ["v_add_out", "qkv_reshape_shape"], ["v_4d_bsnh"], "v_reshape"), + helper.make_node("Transpose", ["v_4d_bsnh"], ["v_4d_bnsh"], "v_transpose", perm=[0, 2, 1, 3]), + ] + ) + + # QK computation with Div masking: + # MatMul(QK) -> Add(qk_bias) -> Div(scale) -> inner_Where -> Softmax -> outer_Where -> MatMul(V) + # + # Matches: ["Where", "Softmax", "Where", "Div", "Add", "MatMul"] with [0, 2, 0, 2, 0, 0] + # where_qk = inner_Where + nodes.extend( + [ + helper.make_node("MatMul", ["q_scaled", "k_transposed"], ["qk_out"], "matmul_qk"), + helper.make_node("Add", ["qk_out", "qk_bias"], ["qk_add_out"], "add_qk"), + helper.make_node("Div", ["qk_add_out", "qk_div_scale"], ["qk_div_out"], "div_qk"), + # inner_Where: condition ? qk_div_out : mask_value → input[0]=cond, [1]=mask, [2]=qk_div_out + helper.make_node( + "Where", + ["mask_condition", "mask_value", "qk_div_out"], + ["inner_where_out"], + "inner_where", + ), + helper.make_node("Softmax", ["inner_where_out"], ["softmax_out"], "softmax_qk", axis=3), + # outer_Where: condition ? 
zeros : softmax_out → input[0]=cond, [1]=zeros, [2]=softmax_out + helper.make_node( + "Where", + ["mask_condition", "zeros_val", "softmax_out"], + ["outer_where_out"], + "outer_where", + ), + helper.make_node("MatMul", ["outer_where_out", "v_4d_bnsh"], ["qkv_bnsh"], "matmul_qkv"), + ] + ) + + # Output: Transpose -> Reshape -> MatMul -> Add -> SkipLayerNorm + nodes.extend( + [ + helper.make_node("Transpose", ["qkv_bnsh"], ["qkv_bsnh"], "qkv_transpose", perm=[0, 2, 1, 3]), + helper.make_node("Reshape", ["qkv_bsnh", "out_reshape_shape"], ["attn_out"], "out_reshape"), + helper.make_node("MatMul", ["attn_out", "out_weight"], ["out_matmul"], "out_matmul"), + helper.make_node("Add", ["out_bias", "out_matmul"], ["out_add"], "out_add"), + helper.make_node( + "SkipLayerNormalization", + ["ln_skip_out", "out_add", "ln_weight", "ln_bias"], + ["output_0", "", "", "output_1"], + "next_skiplayernorm", + domain="com.microsoft", + epsilon=epsilon, + ), + ] + ) + + q_weight, _ = get_tensor_and_weight("q_weight", [hidden_size, hidden_size]) + k_weight, _ = get_tensor_and_weight("k_weight", [hidden_size, hidden_size]) + v_weight, _ = get_tensor_and_weight("v_weight", [hidden_size, hidden_size]) + + initializers = [ + float_tensor("ln_weight", [hidden_size]), + float_tensor("ln_bias", [hidden_size]), + float_tensor("out_weight", [hidden_size, hidden_size]), + float_tensor("out_bias", [hidden_size]), + q_weight, + k_weight, + v_weight, + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="q_bias"), + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="k_bias"), + numpy_helper.from_array(np.array([1.0] * hidden_size, dtype="float32"), name="v_bias"), + # Non-zero qk_bias so ORT's constant folding (which removes Add(x, 0)) doesn't eliminate this node. 
+ numpy_helper.from_array(np.array([_NON_ZERO_QK_BIAS], dtype="float32"), name="qk_bias"), + numpy_helper.from_array(np.array(1.0 / np.sqrt(head_size), dtype="float32"), name="q_scale"), + numpy_helper.from_array(np.array(float(head_size), dtype="float32"), name="qk_div_scale"), + # Boolean mask condition (all True = no masking, for test purposes) + helper.make_tensor("mask_condition", TensorProto.BOOL, [1, 1, 1, 1], [True]), + numpy_helper.from_array(np.array([-1e9], dtype="float32"), name="mask_value"), + numpy_helper.from_array(np.array([0.0], dtype="float32"), name="zeros_val"), + numpy_helper.from_array(np.array([0, 0, num_heads, head_size], dtype="int64"), name="qkv_reshape_shape"), + numpy_helper.from_array(np.array([0, 0, hidden_size], dtype="int64"), name="out_reshape_shape"), + ] + + graph = helper.make_graph( + nodes, "conformer_qk_div_masking_graph", inputs, outputs, initializers, doc_string="conformer" + ) + opsetid = helper.make_opsetid("ai.onnx", min(onnx.defs.onnx_opset_version(), 16)) + return helper.make_model(graph, opset_imports=(opsetid,)) + + if __name__ == "__main__": np.random.seed(2) num_heads = 8 diff --git a/onnxruntime/test/python/transformers/test_conformer.py b/onnxruntime/test/python/transformers/test_conformer.py index 471ba9756bcf8..e3e52cc456d42 100644 --- a/onnxruntime/test/python/transformers/test_conformer.py +++ b/onnxruntime/test/python/transformers/test_conformer.py @@ -5,10 +5,16 @@ # -------------------------------------------------------------------------- import os +import tempfile import unittest import onnx -from conformer_model_generator import create_conformer_attention +from conformer_model_generator import ( + create_conformer_attention, + create_conformer_attention_no_add_kv, + create_conformer_attention_qk_div_masking, + create_conformer_attention_simple_bias, +) from parity_utilities import find_transformers_source if find_transformers_source(): @@ -46,6 +52,32 @@ def verify_fusion(self, optimized_model, 
expected_model_filename): ) ) + def count_fused_attention_nodes(self, optimized_model): + """Return the number of Attention and MultiHeadAttention nodes in the optimized graph.""" + return sum( + 1 + for node in optimized_model.model.graph.node + if node.op_type in ("Attention", "MultiHeadAttention") and node.domain == "com.microsoft" + ) + + def _run_conformer_optimization(self, model, num_heads, hidden_size): + """Save the model to a temp file, run the conformer optimizer, and return the result.""" + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + model_path = f.name + try: + onnx.save(model, model_path) + options = FusionOptions("conformer") + optimized = optimize_model( + model_path, + model_type="conformer", + num_heads=num_heads, + hidden_size=hidden_size, + optimization_options=options, + ) + finally: + os.remove(model_path) + return optimized + def test_ct_mha_fusion(self): num_heads = 8 hidden_size = 512 @@ -64,6 +96,60 @@ def test_ct_mha_fusion(self): os.remove(model_path) self.verify_fusion(optimized_model, "conformer_self_mha_fused.onnx") + def test_conformer_no_extra_q_nodes(self): + """Regression test: standard conformer without positional embedding extra-Q path. + + Before the fix, the extra_q_nodes block required one of the two branch patterns to match. + When neither matched (simple QK bias, no CT or Nemotron positional embed), fusion would + incorrectly return early. This test verifies that fusion still produces a fused attention + node when extra_q_nodes is None throughout. 
+ """ + num_heads = 4 + hidden_size = 64 + model = create_conformer_attention_simple_bias(num_heads=num_heads, hidden_size=hidden_size) + optimized = self._run_conformer_optimization(model, num_heads, hidden_size) + fused_count = self.count_fused_attention_nodes(optimized) + self.assertEqual(fused_count, 1, f"Expected 1 fused attention node, got {fused_count}") + + def test_nemotron_conformer_no_bias_kv(self): + """Nemotron-like model with no Add-bias in K/V paths and no leading Add in QKV output. + + Exercises the new fallback matchers introduced for Nemotron graph shapes: + - QKV output path without leading Add: ["MatMul", "Reshape", "Transpose", "MatMul"] + - Q path (Transpose→Add→Reshape→MatMul, no leading Div/Mul): + ["Transpose", "Add", "Reshape", "MatMul"] with [0, 0, 0, 0] + - K/V paths without bias Add: + ["Transpose", "Reshape", "MatMul"] with [1, 0, 0] + Because add_k and add_v are None, the fused node must be MultiHeadAttention. + """ + num_heads = 4 + hidden_size = 64 + model = create_conformer_attention_no_add_kv(num_heads=num_heads, hidden_size=hidden_size) + optimized = self._run_conformer_optimization(model, num_heads, hidden_size) + fused_count = self.count_fused_attention_nodes(optimized) + self.assertEqual(fused_count, 1, f"Expected 1 fused attention node, got {fused_count}") + # add_k / add_v are None → use_packed_attention_op is False → MultiHeadAttention + mha_count = sum( + 1 + for node in optimized.model.graph.node + if node.op_type == "MultiHeadAttention" and node.domain == "com.microsoft" + ) + self.assertEqual(mha_count, 1, f"Expected MultiHeadAttention node, got {mha_count}") + + def test_conformer_qk_div_masking(self): + """Conformer with a Where→Softmax→Where→Div→Add→MatMul QK masking path. + + Exercises the new QK fallback: + ["Where", "Softmax", "Where", "Div", "Add", "MatMul"] with [0, 2, 0, 2, 0, 0] + which handles graphs where the QK logits are scaled by Div before the Where mask is applied. 
+ """ + num_heads = 4 + hidden_size = 64 + model = create_conformer_attention_qk_div_masking(num_heads=num_heads, hidden_size=hidden_size) + optimized = self._run_conformer_optimization(model, num_heads, hidden_size) + fused_count = self.count_fused_attention_nodes(optimized) + self.assertEqual(fused_count, 1, f"Expected 1 fused attention node, got {fused_count}") + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.onnx b/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.onnx new file mode 100644 index 0000000000000..57efb4ebe11a3 Binary files /dev/null and b/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.onnx differ diff --git a/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.txt b/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.txt new file mode 100644 index 0000000000000..5affbde73e5b3 --- /dev/null +++ b/onnxruntime/test/testdata/layering/tiny_gpt2_beamsearch_layering.txt @@ -0,0 +1,55 @@ +Embed:EmbedLayer +GptAttention0:GptAttention_0 +GptAttention0:Add_295 +GptAttention0:LayerNorm_1 +GptAttention0:FullyConnect_MatMul_0 +GptAttention0:FastGelu_AddBias_0 +GptAttention0:FullyConnect_MatMul_1 +GptAttention0:FullyConnect_Add_1 +GptAttention0:Add_360 +GptAttention1:LayerNorm_2 +GptAttention1:GptAttention_1 +GptAttention1:Add_492 +GptAttention1:FullyConnect_MatMul_2 +GptAttention1:FastGelu_AddBias_1 +GptAttention1:FullyConnect_MatMul_3 +GptAttention1:FullyConnect_Add_3 +GptAttention1:Add_557 +GptAttention2:LayerNorm_4 +GptAttention2:GptAttention_2 +GptAttention2:Add_689 +GptAttention2:LayerNorm_5 +GptAttention2:FullyConnect_MatMul_4 +GptAttention2:FastGelu_AddBias_2 +GptAttention2:FullyConnect_MatMul_5 +GptAttention2:FullyConnect_Add_5 +GptAttention2:Add_754 +GptAttention3:LayerNorm_6 +GptAttention3:GptAttention_3 +GptAttention3:Add_886 +GptAttention3:LayerNorm_7 +GptAttention3:FullyConnect_MatMul_6 
+GptAttention3:FastGelu_AddBias_3 +GptAttention3:FullyConnect_MatMul_7 +GptAttention3:FullyConnect_Add_7 +GptAttention3:Add_951 +GptAttention4:LayerNorm_8 +GptAttention4:GptAttention_4 +GptAttention4:Add_1083 +GptAttention4:LayerNorm_9 +GptAttention4:FullyConnect_MatMul_8 +GptAttention4:FastGelu_AddBias_4 +GptAttention4:FullyConnect_MatMul_9 +GptAttention4:FullyConnect_Add_9 +GptAttention4:Add_1148 +Decode:LayerNorm_10 +Decode:MatMul_1165 + + + + + + + + + diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index df59dd1049dc6..7eeb9cb59d0b2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1723,7 +1723,18 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml" run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path) else: - ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.ctest_timeout] + num_parallel_jobs = number_of_parallel_jobs(args) + ctest_cmd = [ + ctest_path, + "--build-config", + config, + "--verbose", + "--timeout", + args.ctest_timeout, + "--parallel", + str(num_parallel_jobs), + "--output-on-failure", + ] run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path) if args.enable_pybind: @@ -2327,9 +2338,6 @@ def main(): if args.nnapi_min_api < 27: raise BuildError("--nnapi_min_api should be 27+") - if args.build_wasm_static_lib: - args.build_wasm = True - if args.build_wasm: if not args.disable_wasm_exception_catching and args.disable_exceptions: # When '--disable_exceptions' is set, we set '--disable_wasm_exception_catching' as well diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index 61b91f37eac19..c7d66362da51b 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -938,6 +938,10 @@ def convert_arg_line_to_args(self, arg_line: str) -> list[str]: # Use list[str] if args.android_ndk_path: args.android_ndk_path = 
os.path.normpath(args.android_ndk_path) + # Treat --build_wasm_static_lib as implying --build_wasm + if args.build_wasm_static_lib: + args.build_wasm = True + # Handle WASM exception logic if args.enable_wasm_api_exception_catching: args.disable_wasm_exception_catching = True # Catching at API level implies disabling broader catching diff --git a/tools/ci_build/github/azure-pipelines/main-release-pipeline.yml b/tools/ci_build/github/azure-pipelines/main-release-pipeline.yml index dd9321212a140..f955667eaf50e 100644 --- a/tools/ci_build/github/azure-pipelines/main-release-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/main-release-pipeline.yml @@ -1,3 +1,7 @@ +schedules: +- cron: '0 12 * * *' + displayName: "Nightly RC Build" + trigger: none parameters: diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_centos.sh index 1ced7cd2f90c8..0517fa7be9daa 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_centos.sh @@ -1,7 +1,11 @@ #!/bin/bash set -e -os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' /dev/null; then + dnf install -y ccache # FIXME: base image should already have ccache installed +fi diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index 2ffe21159fd1f..7a29fd7fc728c 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -9,6 +9,9 @@ ARG BUILD_USER=onnxruntimedev USER root WORKDIR / +# FIXME: base image should already have ccache installed +RUN if ! 
command -v ccache; then dnf install -y ccache; fi + RUN dnf install -y --nodocs \ wget \ tar \ diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh index a487bf7f91507..093da075be13c 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' /dev/null; then + "$PACKAGE_MANAGER" install -y ccache # FIXME: base image should already have ccache installed +fi diff --git a/tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh b/tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh index 946820299c6a7..1a4e0cf77c57b 100755 --- a/tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh +++ b/tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh @@ -21,6 +21,9 @@ python3 "$ORT_ROOT/tools/ci_build/build.py" \ --build_dir "$MIN_BUILD_DIR" \ --config Debug \ --skip_submodule_sync \ + --use_cache \ + --use_vcpkg \ + --use_vcpkg_ms_internal_asset_cache \ --parallel \ --cmake_generator=Ninja \ --use_nnapi \ @@ -35,7 +38,7 @@ python3 "$ORT_ROOT/tools/ci_build/build.py" \ --disable_generation_ops \ --disable_exceptions \ --include_ops_by_config "$ORT_ROOT/onnxruntime/test/testdata/required_ops_and_types.config" \ - --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --skip_tests # Push onnxruntime_test_all and testdata to emulator adb push "$MIN_BUILD_DIR/Debug/onnxruntime_test_all" /data/local/tmp/ diff --git a/tools/nuget/validate_package.py b/tools/nuget/validate_package.py index 0ad1fc07eafd7..59e88ea15e7c6 100644 --- a/tools/nuget/validate_package.py +++ b/tools/nuget/validate_package.py @@ 
-5,6 +5,8 @@ import glob import os import re +import shutil +import subprocess import sys import zipfile # Available Python 3.2 or higher @@ -238,7 +240,7 @@ def validate_tarball(args): package_folder = re.search("(.*)[.].*", package_name).group(1) print("tar zxvf " + package_name) - os.system("tar zxvf " + package_name) + subprocess.run(["tar", "zxvf", package_name], check=True) is_windows_ai_package = False zip_file = None @@ -336,7 +338,7 @@ def validate_nuget(args): # Make a copy of the Nuget package print("Copying [" + full_nuget_path + "] -> [" + nupkg_copy_name + "], and extracting its contents") - os.system("copy " + full_nuget_path + " " + nupkg_copy_name) + shutil.copy2(full_nuget_path, nupkg_copy_name) # Convert nupkg to zip os.rename(nupkg_copy_name, zip_copy_name)