Reimplement llama with fused MLIR operators + simplify operators #76
Open: andrej wants to merge 121 commits into devel from alt-simplifying-refactor
Changes from all commits: 121 commits
cf6485d andrej: rework profiling
af46d81 andrej: vibe-coded flame graph visualization
1f184eb andrej: plot updates
eedc527 andrej: simplified implementation (no KV cache yet)
5372fca andrej: add KV cache
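The "add KV cache" commit refers to the standard decode-time optimization: key/value projections of past tokens are cached so each decode step only computes attention inputs for the newest token. A minimal pure-Python sketch with illustrative names (not the PR's actual API):

```python
# Hypothetical KV cache sketch: store per-token key/value vectors so a
# decode step appends one entry instead of recomputing the whole history.
class KVCache:
    def __init__(self):
        self.keys = []    # one key vector per past token
        self.values = []  # one value vector per past token

    def append(self, k, v):
        self.keys.append(k)
        self.values.append(v)
        # return the full history for this step's attention
        return self.keys, self.values

cache = KVCache()
cache.append([0.1, 0.2], [1.0, 0.0])
ks, vs = cache.append([0.3, 0.4], [0.0, 1.0])  # two cached entries now
```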
f0d3289 andrej: start with simplified llama for NPU
fd6f759 andrej: offload last layer GEMM/GEMV
ec0d372 andrej: add profile path analyzer
280917a andrej: change profiling to allow annotation using contexts; remove nn.Module
7b56ef8 andrej: refactoring started; RMSNorm offloaded, last layer GEMM started
1daaa39 andrej: last layer GEMM offloaded
83380c2 andrej: fixes:
3aa1e02 andrej: cleanup
002d487 andrej: RMSNorm offloaded everywhere, cleanup
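For reference, the RMSNorm operator these commits offload computes y_i = w_i * x_i / sqrt(mean(x^2) + eps). A pure-Python sketch of the math, not the repo's MLIR kernel:

```python
import math

# Illustrative RMSNorm: scale each element by the reciprocal root-mean-square
# of the vector, then by a learned per-channel weight w.
def rmsnorm(x, w, eps=1e-6):
    rms = math.sqrt(sum(v * v for v in x) / len(x) + eps)
    return [wi * xi / rms for wi, xi in zip(w, x)]

out = rmsnorm([1.0, 2.0, 3.0], [1.0, 1.0, 1.0])
```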
db9c4b8 andrej: less buffer copying
b0942e8 andrej: offload first residual
02e5307 andrej: simplify
e815c50 andrej: offload second residual
ebe8f32 andrej: SwiGLU offloaded
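The SwiGLU feed-forward block gates one linear projection with the SiLU of another: silu(x @ W_gate) * (x @ W_up), where silu(z) = z * sigmoid(z). A sketch of the elementwise gating step only, with illustrative inputs:

```python
import math

def silu(z):
    # silu(z) = z * sigmoid(z)
    return z / (1.0 + math.exp(-z))

def swiglu(gate, up):
    # elementwise gating: silu(gate) * up
    return [silu(g) * u for g, u in zip(gate, up)]

y = swiglu([1.0, -1.0], [2.0, 2.0])
```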
7515199 andrej: offload RoPE
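Rotary position embedding (RoPE) rotates each consecutive pair of query/key elements by an angle that grows with token position. An illustrative single-pair sketch of the math the offloaded operator would implement:

```python
import math

# Rotate one (x0, x1) pair by theta = pos * base^(-2*pair_idx/dim).
# Illustrative only; argument names are not the PR's actual API.
def rope_pair(x0, x1, pos, pair_idx, dim, base=10000.0):
    theta = pos * base ** (-2.0 * pair_idx / dim)
    c, s = math.cos(theta), math.sin(theta)
    return x0 * c - x1 * s, x0 * s + x1 * c

r0, r1 = rope_pair(1.0, 0.0, pos=2, pair_idx=0, dim=4)
```

Note the rotation preserves the pair's norm, which is why RoPE composes cleanly with the dot-product attention that follows.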
7ef6b03 andrej: offload attention query projection linear layer
d43eeea andrej: offload attention key projection linear layer -- slight decrease in o…
de2995c andrej: add batching to GEMV, fix issue when K<vector_size, offload attention…
74300bd andrej: add strided_copy operator
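A strided copy gathers every stride-th element from a source buffer into a contiguous destination, which is how non-contiguous rows or columns get moved without a full transpose. A hypothetical sketch of the semantics (the actual NPU operator's interface is not shown in this log):

```python
# Gather `count` elements from `src`, starting at `offset`, stepping by
# `stride`, into a contiguous list.
def strided_copy(src, offset, stride, count):
    return [src[offset + i * stride] for i in range(count)]

out = strided_copy(list(range(10)), offset=1, stride=3, count=3)  # [1, 4, 7]
```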
b1eab7c andrej: add patchable callable
119bb7e andrej: simplify llama_npu.py, make GEMV operator input shapes simpler
7fee60d andrej: fix strided copy; offload KV cache concat + transpose to NPU
b72432b andrej: offload repeat_interleave
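In grouped-query attention (GQA), repeat_interleave expands the smaller set of KV heads so each group of query heads can share one KV head. An illustrative pure-Python version of the op:

```python
# Repeat each KV head `repeats` times in place, mirroring
# torch.repeat_interleave along the head dimension.
def repeat_interleave(heads, repeats):
    out = []
    for h in heads:
        out.extend([h] * repeats)
    return out

kv_heads = ["kv0", "kv1"]
expanded = repeat_interleave(kv_heads, repeats=4)  # 2 KV heads serve 8 query heads
```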
8b4b2b3 andrej: offload normalization/scaling + softmax (with -inf masking on CPU for…
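The scale-plus-softmax step divides attention scores by sqrt(head_dim) and sets masked positions to -inf before normalizing (per the commit, the -inf masking stays on the CPU). A sketch of the math with illustrative inputs:

```python
import math

# Numerically stable masked softmax: masked entries become -inf, which
# exponentiates to exactly 0 probability.
def masked_softmax(scores, mask, head_dim):
    scaled = [s / math.sqrt(head_dim) if m else float("-inf")
              for s, m in zip(scores, mask)]
    mx = max(scaled)
    exps = [math.exp(s - mx) for s in scaled]
    total = sum(exps)
    return [e / total for e in exps]

probs = masked_softmax([1.0, 2.0, 3.0], [True, True, False], head_dim=64)
```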
5575d4f andrej: make softmax run-time parametrizable
7ed760a andrej: Fix device manager singleton
e2cdc1d andrej: fix GEMV for large Bs
855bec3 andrej: rework and offload transpose operator in llama, attention weight * va…
0086168 andrej: commit forgotten repeat operator
ffca541 andrej: offload last GEMV in GQA, reorganize/simplify decode code
c7926e8 andrej: initial steps for automatically fusing operators
f166957 andrej: [WIP] compilation refactor
11d5802 andrej: autofuse update
f1a2ab9 andrej: refactor compilation
d80b726 andrej: towards full fused ELF + some more refactoring
f0cda24 andrej: fixes; requires XRT PR 9560 to be merged
d715e48 andrej: fixes
684a725 andrej: finally working
447983c andrej: fixes
e025ac7 andrej: optimize out reconfiguration
e3c0e64 andrej: fix some compilation issues
34fc4c5 andrej: make all llama operators take a kernel archive and func prefix arg
a197f2d andrej: txn-fused swiglu
e35ed7b andrej: bring up to speed after host runtime refactor
2994ba2 andrej: refactor symbol renaming to not clash with externally defined library…
25605be andrej: fuse first part of attention
e01a6f0 andrej: make it possible to slice buffers in fused txn specification; fused-t…
e9cbc00 andrej: discover patching locations automatically by use of magic values
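The magic-value technique in "discover patching locations automatically" is a common trick: emit a recognizable sentinel wherever a run-time buffer address belongs, then scan the compiled stream for the sentinel to learn which words to patch. A hypothetical sketch with illustrative constants:

```python
# Sentinel planted at compile time wherever a real address must go later.
MAGIC = 0xDEADBEEF

def find_patch_offsets(words, magic=MAGIC):
    # Scan the instruction stream for sentinel words.
    return [i for i, w in enumerate(words) if w == magic]

def patch(words, offsets, address):
    # Overwrite each discovered slot with the run-time buffer address.
    for i in offsets:
        words[i] = address
    return words

stream = [0x100, MAGIC, 0x200, MAGIC]
patched = patch(stream, find_patch_offsets(stream), 0x4000)
```

This avoids hand-maintaining patch-offset tables, at the cost of choosing a sentinel that cannot occur as a legitimate word.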
b7d2834 andrej: make ELFs patchable; offload strided-copy for KV cache
687cb2a andrej: fuse repeat_interleave and post attention residual onto other operato…
8b0aaeb andrej: fused attn score and scaling onto end - 2.5 TPS
361f10e andrej: fuse on softmax as well
834f33b andrej: transpose fused onto the end; 2.6 TPS
f345e8d andrej: fuse attention context gemv - 2.7 TPS
b46838f andrej: fuse attn output onto end - 2.7 TPS
99ef9fa andrej: fuse GQA + post attention - 2.5 TPS
8eaa9bc andrej: fuse rms norm onto beginning of transformer block -- full transformer…
165a93b andrej: [WRONG RESULTS] 16x-fused transformer block
86d7de8 andrej: remove unnecessary syncs, remove unused ops -- 4.4 TPS
77bac5a andrej: [decode end-to-end fused] offload last rms norm and last linear layer…
6211124 andrej: cleanup
2da438d andrej: remove old llama implementation
38e1b4c hunhoffe: naive attempt at porting operations
ba15525 hunhoffe: most of the operators running in local tests
d483d4f hunhoffe: the great reformatting
9f29a92 hunhoffe: a few more formatting fixes
404211e hunhoffe: some refactoring
12b1dc6 hunhoffe: some work with swiglu
1aa050e hunhoffe: add licenses
b5898f0 hunhoffe: Merge branch 'devel' into simplifying-refactor: Reconcile operator ab…
a8e07fc hunhoffe: Move remaining operators to iron/operators/
b8fb175 hunhoffe: try to minimize changes that are not central to the refactor
84bd103 hunhoffe: remove extra test file
b3aa4d4 hunhoffe: fixup imports
2b0f897 hunhoffe: fix another import path
b97f32e hunhoffe: Fix import issue
d06e8e0 hunhoffe: GEMM fixup
626f731 hunhoffe: fix rms norm w/ weights
f589df8 hunhoffe: fixup swiglu decode
6722a05 hunhoffe: small steps
b5fb899 hunhoffe: Fixup paths a bit.
bc74a22 hunhoffe: swiglu decode working locally
83b0986 hunhoffe: try to integrate tensor more
f9a2539 hunhoffe: try to fix llama keywords
bb10c51 hunhoffe: fix another arg
520934d hunhoffe: Increment IRON
1ce9fbb hunhoffe: cleanup pytest config a bit
1c8a007 andrej: fixes to llama
142b3ad andrej: remove unused operator
bdf3987 andrej: remove old llama impl
71cdfca andrej: add command line args to llama
ce13fe8 andrej: Revert "try to integrate tensor more"
1761d64 andrej: reenable KV cache
7a4b207 andrej: format
ff41674 andrej: fixes
ba12295 andrej: Merge branch 'devel' into alt-simplifying-refactor
0335b30 andrej: merge devel
fca9cc4 andrej: allow some outliers to fail SwigluPrefill output verification -- test…
e23c8f9 andrej: remove unused, untested code
efe8c61 andrej: remove dead code
938b093 andrej: fixes
06d13b7 andrej: add more output to help debug CI failure
d908891 andrej: more CI troubleshooting
0a4c23e andrej: more output for debugging
811ceb9 andrej: unbuffered output
a28e86d andrej: more debug output
bd70cf7 andrej: PYTHONUNBUFFERED
d321c87 andrej: no parallelism in aiecc.py
9731173 github-actions[bot]: Update mlir-aie to version v1.3.0
a23f14f andrej: Merge remote-tracking branch 'origin/update-mlir-aie-to-v1.3.0' into …
34d2590 andrej: format
a077d8e andrej: update compilation arguments after aiecc update
34136d9 andrej: remove debug output
b5e2adb andrej: add more debug output
c509661 andrej: pick up aiecc UUID fix from latest wheels
d8dd6ea andrej: re-enable llama tests for different sequence lengths; clean up debug …
Review comment: Should we make an issue for this TODO?