From d1c3ed8c8b76124148b6ab0d6556a6cc65ce58dd Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Sun, 22 Mar 2026 16:15:47 +0100
Subject: [PATCH 01/12] Fix evaluation metric, draft a testing framework

---
 .github/workflows/tests.yml                   | 117 ++++++
 README.md                                     |   8 +-
 test/README.md                                | 209 ++++++++++
 test/conftest.py                              |  13 +
 test/integration/__init__.py                  |   0
 .../test_feature_set_consistency.py           | 393 ++++++++++++++++++
 test/integration/test_metamorphic.py          | 264 ++++++++++++
 test/unit/__init__.py                         |   0
 .../unit/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 145 bytes
 ...uild_rmsk_gtf.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 14887 bytes
 ...test_evaluate.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 55247 bytes
 ...parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 14535 bytes
 ...imulate_reads.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 45973 bytes
 test/unit/test_build_rmsk_gtf.py              | 110 +++++
 test/unit/test_evaluate.py                    | 368 ++++++++++++++++
 test/unit/test_parse_gtf_t2g.py               | 122 ++++++
 test/unit/test_simulate_reads.py              | 299 +++++++++++++
 test/workflow/Snakefile_test                  | 191 +++++++++
 .../configs/test_negative_control.yaml        |  51 +++
 test/workflow/envs/test_evaluation.yaml       |   9 +
 test/workflow/test_snakemake_dryrun.py        |  92 ++++
 workflow/scripts/evaluate.py                  |  44 +-
 22 files changed, 2271 insertions(+), 19 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 test/README.md
 create mode 100644 test/conftest.py
 create mode 100644 test/integration/__init__.py
 create mode 100644 test/integration/test_feature_set_consistency.py
 create mode 100644 test/integration/test_metamorphic.py
 create mode 100644 test/unit/__init__.py
 create mode 100644 test/unit/__pycache__/__init__.cpython-312.pyc
 create mode 100644 test/unit/__pycache__/test_build_rmsk_gtf.cpython-312-pytest-7.4.4.pyc
 create mode 100644 test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc
 create mode 100644 test/unit/__pycache__/test_parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc
 create mode 100644 test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc
 create mode 100644 test/unit/test_build_rmsk_gtf.py
 create mode 100644 test/unit/test_evaluate.py
 create mode 100644 test/unit/test_parse_gtf_t2g.py
 create mode 100644 test/unit/test_simulate_reads.py
 create mode 100644 test/workflow/Snakefile_test
 create mode 100644 test/workflow/configs/test_negative_control.yaml
 create mode 100644 test/workflow/envs/test_evaluation.yaml
 create mode 100644 test/workflow/test_snakemake_dryrun.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..ea12b79
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,117 @@
+name: Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [master, dev]
+  pull_request:
+    branches: [master, dev]
+
+jobs:
+  # Unit and integration tests. Needs Python and scipy
+  unit-and-integration:
+    name: Unit and integration tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Python dependencies
+        run: pip install pytest pytest-cov scipy
+
+      - name: Run unit and integration tests
+        run: |
+          pytest test/unit/ test/integration/ \
+            -v \
+            --tb=short \
+            --cov=workflow/scripts \
+            --cov-report=term-missing \
+            --cov-fail-under=70
+
+  # Snakemake dry-run. Installs snakemake via micromamba but does not run any rules.
+  snakemake-dryrun:
+    name: Snakemake dry-run
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: latest
+          environment-name: snakemake
+          create-args: >-
+            python=3.11
+            snakemake>=8
+          init-shell: bash
+
+      - name: Dry-run main Snakefile with SmartSeq2 config
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            --configfile configs/simulation_smartseq2.yaml \
+            --dry-run \
+            --quiet
+
+      - name: Dry-run main Snakefile with Chromium config
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            --configfile configs/simulation_chromium.yaml \
+            --dry-run \
+            --quiet
+
+      - name: Dry-run test Snakefile with negative control config
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            -s ../test/workflow/Snakefile_test \
+            --configfile ../test/workflow/configs/test_negative_control.yaml \
+            --dry-run \
+            --quiet
+
+      - name: Run workflow dry-run pytest tests
+        shell: bash -el {0}
+        run: |
+          pytest test/workflow/test_snakemake_dryrun.py \
+            -v --tb=short -m workflow
+
+  # Full negative control run. Requires reference data on the runner and an env.
+  # triggered manually
+  negative-control-run:
+    name: Negative control full run
+    runs-on: ubuntu-latest
+    if: |
+      github.event_name == 'workflow_dispatch'
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: latest
+          environment-name: snakemake
+          create-args: >-
+            python=3.11
+            snakemake>=8
+          init-shell: bash
+
+      - name: Run negative control workflow
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            -s ../test/workflow/Snakefile_test \
+            --configfile ../test/workflow/configs/test_negative_control.yaml \
+            --use-conda \
+            --cores 4 \
+            --conda-frontend mamba
diff --git a/README.md b/README.md
index f8532c2..6bc5ef0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Repeat element quantification in bulk and single cells
+# Repetitive element quantification in bulk and single-cell RNA-seq
 
 ## Status
 
@@ -86,6 +86,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files.
 
 See workflow/methods.md for full details.
 
+## Testing
+
+Unit tests, integration tests, and Snakemake dry-run tests are in the
+`test/` directory. See [test/README.md](test/README.md) for details on
+design, coverage, and how to run the tests.
+
 ## Contact
 
 izaskun dot mallona dot work at gmail.com
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..d33baf9
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,209 @@
+# Tests
+
+This directory contains the test suite for the repeats pipeline. Tests are
+run with pytest and cover the Python scripts in workflow/scripts/ as well as
+the correctness of the evaluation logic end-to-end.
+
+See also: the main [README.md](../README.md) for pipeline usage.
+
+## Running the tests
+
+All commands below assume the repository root as the working directory unless
+stated otherwise.
+
+### Unit and integration tests
+
+Install dependencies:
+
+```
+pip install pytest pytest-cov scipy
+```
+
+Run unit and integration tests:
+
+```
+cd /path/to/repeats
+pytest test/unit/ test/integration/ -v
+```
+
+Run with coverage:
+
+```
+cd /path/to/repeats
+pytest test/unit/ test/integration/ --cov=workflow/scripts --cov-report=term-missing
+```
+
+### Snakemake dry-run tests
+
+These require snakemake in PATH and are tagged with the `workflow` marker.
+Run via pytest (from the repo root):
+
+```
+cd /path/to/repeats
+pytest test/workflow/test_snakemake_dryrun.py -v -m workflow
+```
+
+Or run the dry-runs directly with snakemake. The working directory must be
+`workflow/` for the production Snakefile, and paths to the test Snakefile and
+configs are given relative to that directory:
+
+```
+cd /path/to/repeats/workflow
+
+snakemake --configfile configs/simulation_smartseq2.yaml --dry-run --quiet
+
+snakemake --configfile configs/simulation_chromium.yaml --dry-run --quiet
+
+snakemake -s ../test/workflow/Snakefile_test \
+  --configfile ../test/workflow/configs/test_negative_control.yaml \
+  --dry-run --quiet
+```
+
+### Negative control workflow (full run)
+
+Requires reference data accessible from the runner and snakemake with conda
+support. Run from the `workflow/` directory:
+
+```
+cd /path/to/repeats/workflow
+
+snakemake -s ../test/workflow/Snakefile_test \
+  --configfile ../test/workflow/configs/test_negative_control.yaml \
+  --use-conda \
+  --cores 4 \
+  --conda-frontend mamba
+```
+
+
+## Directory layout
+
+```
+test/
+  unit/                    unit tests for individual script functions
+  integration/             end-to-end tests using synthetic input files
+  workflow/                snakemake dry-run tests and the negative control workflow
+    Snakefile_test         test workflow that reuses the production snmk modules
+    configs/               config for the negative control run
+    envs/                  conda environment for test workflow rules
+```
+
+
+## Unit tests (test/unit/)
+
+Each file targets one script in workflow/scripts/.
+
+test_evaluate.py covers:
+- pearson_r, spearman_r: known values, edge cases (too few points, constant vector)
+- log1p_rmse: perfect recovery gives 0.0, symmetry, empty input
+- detection_metrics: all-correct, no-overlap, partial overlap, specificity
+- build_aligned_vectors: zero-filling for missing features, cell ordering
+- load_ground_truth: all four granularities (locus, gene_id, family_id, class_id)
+  and the valid_locus_ids filter that was added to fix the genic/intergenic
+  partitioning bug
+- load_count_matrix: feature tracking including zero-count rows
+- compute_metrics_for_subset: perfect observer and null observer
+
+test_simulate_reads.py covers:
+- reverse_complement: known sequences, N passthrough, involution property
+- parse_gtf_attribute: quoted and unquoted values, missing key
+- extract_repeat_sequence: N content filter, strand handling, length filter
+- sample_subseq: output length, N padding for short sequences
+- sample_count_geometric: always >= 1, respects max_count, approximate mean
+- build_cell_plan and build_locus_to_cells: output structure
+- write_ground_truth: TSV format and row count
+- parse_gtf_repeats_by_chrom: minimal GTF parsing, length filter, chrom filter
+
+test_build_rmsk_gtf.py covers:
+- make_gtf_attributes: format and dup index incrementing
+- convert_rmsk_to_gtf: basic output, dup index, length filter, chrom filter,
+  1-based GTF coordinates, gzip output
+
+test_parse_gtf_t2g.py covers:
+- parse_attrs: quoted values, missing keys, empty string
+- main() with sys.argv patching: 2-col and 4-col output, correct mapping,
+  deduplication, feature-type filtering
+
+
+## Integration tests (test/integration/)
+
+test_feature_set_consistency.py documents and tests a bug that caused lower
+evaluation metrics for genic_repeats and intergenic_repeats at gene_id
+granularity compared to the full repeats set, even for a hypothetically
+perfect observer.
+
+Root cause: evaluate.py was aggregating ground truth counts across all loci
+for a given gene_id (including both genic and intergenic copies), then
+filtering at the gene_id level. For a gene_id like AluSz6 that has copies
+in both contexts, the genic truth count included intergenic loci, inflating
+truth relative to observed and degrading Pearson correlation and recall.
+
+The fix adds locus-level filtering before aggregation (see valid_locus_ids
+in load_ground_truth). The tests in this file verify:
+
+1. gene_ids overlap between genic and intergenic locus maps (structural
+   precondition for the bug, expected in real data)
+2. full repeats at gene_id with a perfect observer gives metrics of 1.0
+   (baseline sanity check)
+3. genic_repeats at locus granularity with a perfect observer gives 1.0
+   (locus level is unaffected by the partition issue)
+4. genic_repeats at gene_id with a perfect genic observer now gives 1.0
+   (regression test for the fix; would fail if the fix were reverted)
+5. intergenic_repeats at gene_id with a perfect observer gives 1.0
+6. subset metrics are not lower than full-set metrics for a perfect observer
+   (monotonicity property)
+
+test_metamorphic.py tests properties that must hold across input transformations
+without needing exact expected values:
+
+- scale invariance: multiplying observed counts by a constant does not change
+  Pearson or Spearman
+- scaling does change log1p_rmse
+- monotone recall: adding true positives never decreases recall
+- noise degrades metrics: progressively noisier observers give lower Pearson
+- perfect beats null: a perfect observer strictly outperforms a zero observer
+  on all metrics
+- granularity aggregation consistency: a perfect locus-level count matrix
+  summed to family_id also gives metrics of 1.0
+- random counts give low correlation (below 0.7 for 50-feature random data)
+
+
+## Workflow tests (test/workflow/)
+
+test_snakemake_dryrun.py verifies that snakemake --dry-run succeeds for both
+production configs (simulation_smartseq2.yaml and simulation_chromium.yaml)
+and for the test negative control config. These tests are skipped if snakemake
+is not in the PATH and are marked with the workflow marker.
+
+Snakefile_test is a separate Snakemake workflow that includes all production
+snmk modules and adds the negative control rules:
+
+simulate_from_genes runs simulate_reads.py with the Ensembl gene GTF instead
+of the repeat GTF. Reads come from gene body regions rather than repeat
+elements. This gives a ground truth over gene features.
+
+check_negative_control_recall runs evaluate.py comparing that gene-body
+ground truth against the repeat quantification output and asserts that recall
+is below the threshold in testing.negative_control_max_recall (default 0.10).
+A low recall means the pipeline correctly does not attribute gene-body reads
+to repeat elements.
+
+The conda environment for these rules is defined in
+test/workflow/envs/test_evaluation.yaml (python, scipy, pandas, numpy).
+
+
+## GitHub Actions
+
+.github/workflows/tests.yml defines three jobs:
+
+unit-and-integration runs on every push and pull request. It uses plain
+Python and does not need any bioinformatics tools installed.
+
+snakemake-dryrun runs on every push and pull request. It installs snakemake
+via micromamba and runs dry-run checks on both production configs and the test
+Snakefile. Snakemake installs conda dependencies itself when --use-conda is
+passed, but dry-runs do not trigger conda installs.
+
+negative-control-run is only triggered manually (workflow_dispatch) or when
+the repository variable ENABLE_FULL_INTEGRATION is set to true. It runs the
+full negative control workflow with --use-conda and requires reference data
+to be accessible from the runner.
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..ad80b2e
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,13 @@
+"""
+Shared pytest configuration and path setup.
+
+Adds workflow/scripts to sys.path so unit tests can import scripts directly.
+"""
+import sys
+import os
+
+SCRIPTS_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), '..', 'workflow', 'scripts')
+)
+if SCRIPTS_DIR not in sys.path:
+    sys.path.insert(0, SCRIPTS_DIR)
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/integration/test_feature_set_consistency.py b/test/integration/test_feature_set_consistency.py
new file mode 100644
index 0000000..3559831
--- /dev/null
+++ b/test/integration/test_feature_set_consistency.py
@@ -0,0 +1,393 @@
+"""
+Integration tests for evaluation consistency across feature sets.
+
+Background
+----------
+The pipeline evaluates each (feature_set × granularity × aligner) combination.
+An initially puzzling result was that genic_repeats and intergenic_repeats at
+gene_id granularity showed lower metrics than the full repeats set, even for a
+hypothetically perfect observer.
+
+Root cause (now fixed in evaluate.py)
+--------------------------------------
+At gene_id granularity, load_ground_truth() previously aggregated ALL loci for
+a given gene_id (summing genic + intergenic copies), then filtered the already-
+aggregated dict by checking whether the gene_id appeared in the genic locus_map.
+
+For a gene_id like AluSz6 that has copies in BOTH genic and intergenic regions:
+
+  - genic locus_map contains AluSz6 (it has genic copies)
+  - OLD truth['c1']['AluSz6'] = 5 (genic) + 3 (intergenic) = 8   ← inflated
+  - observed from genic normalization = 5 (only genic copy)
+  - -> truth > observed -> degraded Pearson / RMSE, false-negative detections
+
+The fix moves locus_map loading to BEFORE load_ground_truth() and passes a
+valid_locus_ids set so filtering happens AT LOCUS LEVEL before any aggregation.
+
+Scenario used by these tests
+-----------------------------
+We construct a minimal ground truth with:
+  - genic_alu_1      -> AluSz6, Alu, SINE, count=5   (GENIC)
+  - intergenic_alu_1 -> AluSz6, Alu, SINE, count=3   (INTERGENIC - same gene_id!)
+  - genic_alu_2      -> AluSx,  Alu, SINE, count=7   (GENIC)
+  - intergenic_alu_2 -> AluSx,  Alu, SINE, count=6   (INTERGENIC - same gene_id!)
+  - genic_L1_1       -> L1PA2,  L1,  LINE, count=2   (GENIC)
+  - genic_mir_1      -> MIR,    MIR, SINE, count=4   (GENIC)
+  - genic_L2_1       -> L2,     L2,  LINE, count=3   (GENIC)
+  - intergenic_dna_1 -> TcMar,  TcMar, DNA, count=1  (INTERGENIC)
+  - [TcMar has a genic reference locus genic_tcmar_ref not expressed in
+     simulation, but present in the genic locus_map - this triggers the
+     false-negative detection failure in the old code.]
+
+Genic locus_map contains:
+  genic_alu_1, genic_alu_2, genic_L1_1, genic_mir_1, genic_L2_1, genic_tcmar_ref
+
+Intergenic locus_map contains:
+  intergenic_alu_1, intergenic_alu_2, intergenic_dna_1
+
+AluSz6 and AluSx appear in BOTH locus_maps (gene_id level).
+
+With the fix:
+  genic truth at gene_id: AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3, TcMar=0
+  genic observed at gene_id (from genic normalization): AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3
+  -> perfect match -> pearson=1.0, recall=1.0, precision=1.0
+"""
+import csv
+import math
+import os
+import subprocess
+import sys
+
+import pytest
+
+SCRIPTS_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), '..', '..', 'workflow', 'scripts')
+)
+EVALUATE = os.path.join(SCRIPTS_DIR, 'evaluate.py')
+
+# ---------------------------------------------------------------------------
+# Shared fixture: build the scenario files
+# ---------------------------------------------------------------------------
+
+GROUND_TRUTH_ROWS = [
+    # cell_id, locus_id, repeat_id (=gene_id), family_id, class_id, true_count
+    ('cell_001', 'genic_alu_1',       'AluSz6', 'Alu',   'SINE', '5'),
+    ('cell_001', 'intergenic_alu_1',  'AluSz6', 'Alu',   'SINE', '3'),
+    ('cell_001', 'genic_alu_2',       'AluSx',  'Alu',   'SINE', '7'),
+    ('cell_001', 'intergenic_alu_2',  'AluSx',  'Alu',   'SINE', '6'),
+    ('cell_001', 'genic_L1_1',        'L1PA2',  'L1',    'LINE', '2'),
+    ('cell_001', 'genic_mir_1',       'MIR',    'MIR',   'SINE', '4'),
+    ('cell_001', 'genic_L2_1',        'L2',     'L2',    'LINE', '3'),
+    ('cell_001', 'intergenic_dna_1',  'TcMar',  'TcMar', 'DNA',  '1'),
+]
+
+# Genic locus_map includes a reference locus for TcMar that was NOT expressed
+# in the simulation.  In the old code, TcMar's intergenic count (1) would
+# bleed into the genic truth because TcMar is in the genic locus_map.
+GENIC_LOCUS_MAP = [
+    ('genic_alu_1',      'AluSz6', 'Alu',   'SINE'),
+    ('genic_alu_2',      'AluSx',  'Alu',   'SINE'),
+    ('genic_L1_1',       'L1PA2',  'L1',    'LINE'),
+    ('genic_mir_1',      'MIR',    'MIR',   'SINE'),
+    ('genic_L2_1',       'L2',     'L2',    'LINE'),
+    # reference locus for TcMar - exists in genome but NOT expressed
+    ('genic_tcmar_ref',  'TcMar',  'TcMar', 'DNA'),
+]
+
+INTERGENIC_LOCUS_MAP = [
+    ('intergenic_alu_1',  'AluSz6', 'Alu',   'SINE'),
+    ('intergenic_alu_2',  'AluSx',  'Alu',   'SINE'),
+    ('intergenic_dna_1',  'TcMar',  'TcMar', 'DNA'),
+]
+
+FULL_LOCUS_MAP = GENIC_LOCUS_MAP + INTERGENIC_LOCUS_MAP
+
+# "Perfect genic observer" at gene_id level - exactly what genic normalization produces:
+# only counts from genic loci, using GENIC_LOCUS_MAP.
+GENIC_OBSERVED_GENE_ID = {
+    'AluSz6': 5,
+    'AluSx':  7,
+    'L1PA2':  2,
+    'MIR':    4,
+    'L2':     3,
+    # TcMar: 0 (its only genic reference locus was not expressed)
+}
+
+# "Perfect intergenic observer" at gene_id level
+INTERGENIC_OBSERVED_GENE_ID = {
+    'AluSz6': 3,
+    'AluSx':  6,
+    'TcMar':  1,
+}
+
+# "Perfect full observer" at gene_id level
+FULL_OBSERVED_GENE_ID = {
+    'AluSz6': 8,
+    'AluSx':  13,
+    'L1PA2':  2,
+    'MIR':    4,
+    'L2':     3,
+    'TcMar':  1,
+}
+
+# "Perfect genic observer" at locus level
+GENIC_OBSERVED_LOCUS = {
+    'genic_alu_1':  5,
+    'genic_alu_2':  7,
+    'genic_L1_1':   2,
+    'genic_mir_1':  4,
+    'genic_L2_1':   3,
+}
+
+
+# ---------------------------------------------------------------------------
+# File-writing helpers
+# ---------------------------------------------------------------------------
+
+def write_ground_truth(tmp_path):
+    p = tmp_path / 'ground_truth.tsv'
+    with open(p, 'w') as fh:
+        fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n')
+        for row in GROUND_TRUTH_ROWS:
+            fh.write('\t'.join(row) + '\n')
+    return p
+
+
+def write_locus_map(tmp_path, name, rows):
+    p = tmp_path / name
+    with open(p, 'w') as fh:
+        for row in rows:
+            fh.write('\t'.join(row) + '\n')
+    return p
+
+
+def write_count_matrix(tmp_path, name, feature_counts, cells=('cell_001',)):
+    p = tmp_path / name
+    features = sorted(feature_counts)
+    with open(p, 'w') as fh:
+        fh.write('feature_id\t' + '\t'.join(cells) + '\n')
+        for feat in features:
+            vals = [str(feature_counts[feat]) for _ in cells]
+            fh.write(feat + '\t' + '\t'.join(vals) + '\n')
+    return p
+
+
+def run_evaluate(tmp_path, gt, counts, locus_map, granularity, feature_set):
+    prefix = str(tmp_path / f'{feature_set}_{granularity}')
+    cmd = [
+        sys.executable, EVALUATE,
+        '--ground-truth', str(gt),
+        '--observed-counts', str(counts),
+        '--aligner', 'test',
+        '--multimapper-mode', 'unique',
+        '--granularity', granularity,
+        '--feature-set', feature_set,
+        '--locus-map', str(locus_map),
+        '--output-prefix', prefix,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f'evaluate.py failed:\n{result.stderr}')
+    return read_global_metrics(prefix + '_global_metrics.tsv')
+
+
+def read_global_metrics(path):
+    with open(path) as fh:
+        reader = csv.DictReader(fh, delimiter='\t')
+        return next(reader)
+
+
+def parse_float(v):
+    try:
+        return float(v)
+    except (ValueError, TypeError):
+        return float('nan')
+
+
+# ---------------------------------------------------------------------------
+# Test 1: Structural - gene_id overlap between genic and intergenic locus_maps
+# ---------------------------------------------------------------------------
+
+def test_gene_ids_overlap_between_genic_and_intergenic_locus_maps():
+    """
+    Documents the structural precondition for the bug:
+    AluSz6 and AluSx appear as gene_ids in BOTH genic and intergenic locus_maps.
+    This is expected in real data (repeat families span both contexts) and is
+    the reason why locus-level filtering before aggregation is necessary.
+    """
+    genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP}
+    intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP}
+    shared = genic_gene_ids & intergenic_gene_ids
+    assert len(shared) > 0, (
+        'Expected at least one gene_id in both genic and intergenic maps '
+        f'but got genic={genic_gene_ids}, intergenic={intergenic_gene_ids}'
+    )
+    assert 'AluSz6' in shared
+    assert 'AluSx' in shared
+
+
+def test_full_locus_map_partitions_at_locus_level_not_gene_id():
+    """
+    At locus level, genic and intergenic locus_maps are disjoint.
+    At gene_id level they overlap.  This is why the old post-aggregation
+    filter could not correctly separate the two subsets.
+    """
+    genic_loci = {row[0] for row in GENIC_LOCUS_MAP}
+    intergenic_loci = {row[0] for row in INTERGENIC_LOCUS_MAP}
+    # Locus level: disjoint
+    assert genic_loci & intergenic_loci == set()
+    # Gene_id level: overlapping
+    genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP}
+    intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP}
+    assert genic_gene_ids & intergenic_gene_ids != set()
+
+
+# ---------------------------------------------------------------------------
+# Test 2: Full repeats, gene_id - perfect observer -> perfect metrics (baseline)
+# ---------------------------------------------------------------------------
+
+def test_full_repeats_gene_id_perfect_observer_gives_perfect_metrics(tmp_path):
+    """
+    Sanity baseline: if observed = truth for all repeats at gene_id level,
+    all accuracy metrics should be 1.0 / 0.0.
+    """
+    gt = write_ground_truth(tmp_path)
+    lm = write_locus_map(tmp_path, 'full_locus_map.tsv', FULL_LOCUS_MAP)
+    counts = write_count_matrix(tmp_path, 'full_counts.tsv', FULL_OBSERVED_GENE_ID)
+    m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'repeats')
+    assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['log1p_rmse']) == pytest.approx(0.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# Test 3: Genic repeats, locus granularity - perfect observer -> perfect metrics
+# ---------------------------------------------------------------------------
+
+def test_genic_repeats_locus_granularity_perfect_observer(tmp_path):
+    """
+    At locus level, a perfect genic observer should give perfect metrics.
+    No cross-partition contamination is possible at locus level because
+    locus IDs are unique per repeat instance.
+    """
+    gt = write_ground_truth(tmp_path)
+    lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP)
+    counts = write_count_matrix(tmp_path, 'genic_locus_counts.tsv', GENIC_OBSERVED_LOCUS)
+    m = run_evaluate(tmp_path, gt, counts, lm, 'locus', 'genic_repeats')
+    assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# Test 4 (THE KEY TEST): Genic repeats, gene_id - perfect genic observer
+#          should now give perfect metrics after the fix
+# ---------------------------------------------------------------------------
+
+def test_genic_repeats_gene_id_perfect_genic_observer_gives_perfect_metrics(tmp_path):
+    """
+    Core regression test for the locus-level filtering fix.
+
+    With the OLD code (post-aggregation filter):
+      - truth['cell_001']['AluSz6'] = 5 + 3 = 8  (sums ALL AluSz6 loci)
+      - observed['cell_001']['AluSz6'] = 5  (only genic loci)
+      - -> truth > observed -> degraded pearson, false-negative for TcMar
+
+    With the FIXED code (locus-level filter BEFORE aggregation):
+      - only genic loci contribute to truth
+      - truth['cell_001']['AluSz6'] = 5, truth['cell_001']['AluSx'] = 7
+      - TcMar: its intergenic count is excluded; genic_tcmar_ref is not expressed
+        -> TcMar absent from both truth and observed -> no false negative
+      - observed matches truth exactly -> all metrics = 1.0
+    """
+    gt = write_ground_truth(tmp_path)
+    lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP)
+    counts = write_count_matrix(tmp_path, 'genic_gene_id_counts.tsv', GENIC_OBSERVED_GENE_ID)
+    m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'genic_repeats')
+
+    pearson = parse_float(m['pearson_r'])
+    recall = parse_float(m['recall'])
+    precision = parse_float(m['precision'])
+    rmse = parse_float(m['log1p_rmse'])
+
+    assert pearson == pytest.approx(1.0, abs=1e-4), (
+        f'pearson={pearson} - expected 1.0; '
+        'if this fails, the locus-level filtering fix may have been reverted'
+    )
+    assert recall == pytest.approx(1.0, abs=1e-4), f'recall={recall}'
+    assert precision == pytest.approx(1.0, abs=1e-4), f'precision={precision}'
+    assert rmse == pytest.approx(0.0, abs=1e-4), f'log1p_rmse={rmse}'
+
+
+# ---------------------------------------------------------------------------
+# Test 5: Intergenic repeats, gene_id - same fix applies
+# ---------------------------------------------------------------------------
+
+def test_intergenic_repeats_gene_id_perfect_intergenic_observer_gives_perfect_metrics(tmp_path):
+    """
+    Mirror of test 4 for the intergenic subset.
+
+    After the fix, intergenic truth for AluSz6 = 3 (only intergenic_alu_1),
+    not 8 (which would include genic_alu_1).
+    """
+    gt = write_ground_truth(tmp_path)
+    lm = write_locus_map(tmp_path, 'intergenic_locus_map.tsv', INTERGENIC_LOCUS_MAP)
+    counts = write_count_matrix(
+        tmp_path, 'intergenic_counts.tsv', INTERGENIC_OBSERVED_GENE_ID)
+    m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'intergenic_repeats')
+
+    assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4), (
+        f"pearson={m['pearson_r']} - expected 1.0 for perfect intergenic observer"
+    )
+    assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4)
+    assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# Test 6: Genic + intergenic metrics should not be lower than full-set metrics
+#          for a perfect full observer (monotonicity property)
+# ---------------------------------------------------------------------------
+
+def test_subset_metrics_not_lower_than_full_for_perfect_observer(tmp_path):
+    """
+    Metamorphic property: if an aligner perfectly recovers ALL repeats,
+    evaluating a clean subset should not give LOWER metrics than the full set.
+
+    We use a perfect full observer and check that both genic and intergenic
+    sub-evaluations also give perfect metrics (1.0).
+
+    This test would have failed before the fix because the inflated truth
+    counts caused pearson < 1.0 even for a perfect observer.
+    """
+    gt = write_ground_truth(tmp_path)
+
+    # Full repeats evaluation
+    lm_full = write_locus_map(tmp_path, 'full_lm.tsv', FULL_LOCUS_MAP)
+    cnt_full = write_count_matrix(tmp_path, 'full_cnt.tsv', FULL_OBSERVED_GENE_ID)
+    m_full = run_evaluate(tmp_path, gt, cnt_full, lm_full, 'gene_id', 'repeats')
+
+    # Genic repeats evaluation (using genic counts only)
+    lm_genic = write_locus_map(tmp_path, 'genic_lm.tsv', GENIC_LOCUS_MAP)
+    cnt_genic = write_count_matrix(tmp_path, 'genic_cnt.tsv', GENIC_OBSERVED_GENE_ID)
+    m_genic = run_evaluate(tmp_path, gt, cnt_genic, lm_genic, 'gene_id', 'genic_repeats')
+
+    # Intergenic repeats evaluation
+    lm_inter = write_locus_map(tmp_path, 'inter_lm.tsv', INTERGENIC_LOCUS_MAP)
+    cnt_inter = write_count_matrix(
+        tmp_path, 'inter_cnt.tsv', INTERGENIC_OBSERVED_GENE_ID)
+    m_inter = run_evaluate(tmp_path, gt, cnt_inter, lm_inter, 'gene_id', 'intergenic_repeats')
+
+    full_pearson = parse_float(m_full['pearson_r'])
+    genic_pearson = parse_float(m_genic['pearson_r'])
+    inter_pearson = parse_float(m_inter['pearson_r'])
+
+    # All should be 1.0 (perfect observer) - none should be lower than full
+    assert full_pearson == pytest.approx(1.0, abs=1e-4)
+    assert genic_pearson >= full_pearson - 0.01, (
+        f'genic pearson={genic_pearson} is lower than full={full_pearson}; '
+        'subset metrics should not be worse than full-set metrics for a perfect observer'
+    )
+    assert inter_pearson >= full_pearson - 0.01, (
+        f'intergenic pearson={inter_pearson} is lower than full={full_pearson}'
+    )
diff --git a/test/integration/test_metamorphic.py b/test/integration/test_metamorphic.py
new file mode 100644
index 0000000..e080e64
--- /dev/null
+++ b/test/integration/test_metamorphic.py
@@ -0,0 +1,264 @@
+"""
+Metamorphic tests for the evaluation pipeline.
+
+Metamorphic testing verifies properties that must hold across transformations
+of the input, without needing exact expected values.  This is especially
+useful for a bioinformatics benchmarking pipeline where true correct outputs
+are not always known in advance.
+
+Properties tested
+-----------------
+1. Scale invariance:     multiplying observed counts by a constant should not
+                         change Pearson or Spearman (but will change RMSE).
+2. Monotone recall:      adding true positives to observed should increase
+                         or maintain recall - never decrease it.
+3. Noise degrades metrics:  adding random noise to a perfect observer should
+                         monotonically worsen Pearson and RMSE.
+4. Null observer is worst:  a perfect observer beats a zero observer on all
+                         quantification and detection metrics.
+5. Permuting cell labels degrades per-cell metrics but not global metrics
+                         computed on pooled counts.
+6. Family_id granularity:  summing a perfect locus-level count matrix to
+                         family_id should also give perfect metrics.
+"""
+import math
+import random
+
+import pytest
+
+import evaluate as ev
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def make_truth_obs(n_cells=4, n_features=8, seed=42):
+    """Return a (truth, observed) pair where observed = truth (perfect)."""
+    rng = random.Random(seed)
+    cells = [f'c{i}' for i in range(n_cells)]
+    feats = [f'feat_{i}' for i in range(n_features)]
+    truth = {c: {f: rng.randint(1, 20) for f in feats} for c in cells}
+    obs = {c: dict(d) for c, d in truth.items()}
+    return truth, obs, cells, feats
+
+
+# ---------------------------------------------------------------------------
+# 1. Scale invariance of rank-based metrics
+# ---------------------------------------------------------------------------
+
+def test_scale_invariance_pearson():
+    """Multiplying observed by a scalar should not change Pearson."""
+    truth, obs, cells, feats = make_truth_obs()
+    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)
+
+    obs_scaled = {c: {f: v * 10 for f, v in d.items()} for c, d in obs.items()}
+    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)
+
+    assert m1['pearson_r'] == pytest.approx(m2['pearson_r'], abs=1e-4)
+
+
+def test_scale_invariance_spearman():
+    """Multiplying observed by a scalar should not change Spearman."""
+    truth, obs, cells, feats = make_truth_obs()
+    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)
+
+    obs_scaled = {c: {f: v * 3 for f, v in d.items()} for c, d in obs.items()}
+    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)
+
+    assert m1['spearman_r'] == pytest.approx(m2['spearman_r'], abs=1e-4)
+
+
+def test_scale_changes_rmse():
+    """Scaling observed DOES change log1p_rmse."""
+    truth, obs, cells, feats = make_truth_obs()
+    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)
+
+    obs_scaled = {c: {f: v * 100 for f, v in d.items()} for c, d in obs.items()}
+    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)
+
+    # RMSE for perfect observer is 0; scaled observer should have higher RMSE
+    assert m2['log1p_rmse'] > m1['log1p_rmse']
+
+
+# ---------------------------------------------------------------------------
+# 2. Monotone recall: adding true positives never decreases recall
+# ---------------------------------------------------------------------------
+
+def test_monotone_recall_adding_true_positives():
+    """
+    Start with a partially correct observer.  Add more true positives.
+    Recall must not decrease.
+    """
+    truth = {'c1': {'A': 5, 'B': 3, 'C': 2, 'D': 7}}
+    feats = ['A', 'B', 'C', 'D']
+
+    obs_partial = {'c1': {'A': 5}}   # only A correct
+    obs_more    = {'c1': {'A': 5, 'B': 3}}   # A and B correct
+
+    m1 = ev.compute_metrics_for_subset(truth, obs_partial, ['c1'], feats)
+    m2 = ev.compute_metrics_for_subset(truth, obs_more, ['c1'], feats)
+
+    assert m2['recall'] >= m1['recall'] - 1e-9
+
+
+def test_monotone_recall_full_recovery_is_maximum():
+    truth, obs, cells, feats = make_truth_obs()
+    # Partial: only first half of features observed
+    feats_half = feats[:len(feats)//2]
+    obs_partial = {c: {f: d[f] for f in feats_half if f in d} for c, d in obs.items()}
+
+    m_partial = ev.compute_metrics_for_subset(truth, obs_partial, cells, feats)
+    m_full = ev.compute_metrics_for_subset(truth, obs, cells, feats)
+
+    assert m_full['recall'] >= m_partial['recall'] - 1e-9
+    assert m_full['f1'] >= m_partial['f1'] - 1e-9
+
+
+# ---------------------------------------------------------------------------
+# 3. Noise degrades metrics monotonically
+# ---------------------------------------------------------------------------
+
+def test_increasing_noise_degrades_pearson():
+    """
+    Progressively noisier observers should give monotonically worse Pearson.
+    """
+    truth, _, cells, feats = make_truth_obs(seed=7)
+    prev_r = 1.0
+
+    rng = random.Random(7)
+    obs = {c: dict(d) for c, d in truth.items()}
+
+    noise_levels = [0, 1, 3, 10, 50]
+    for level in noise_levels:
+        noisy_obs = {
+            c: {f: max(0, v + rng.randint(-level, level)) for f, v in d.items()}
+            for c, d in truth.items()
+        }
+        m = ev.compute_metrics_for_subset(truth, noisy_obs, cells, feats)
+        r = m['pearson_r']
+        if not isinstance(r, str) and not math.isnan(float(r)):
+            assert float(r) <= prev_r + 0.05, (
+                f'Pearson increased from {prev_r} to {r} as noise level went to {level}'
+            )
+            prev_r = float(r)
+
+
+# ---------------------------------------------------------------------------
+# 4. Perfect observer strictly beats null observer
+# ---------------------------------------------------------------------------
+
+def test_perfect_beats_null_on_all_metrics():
+    truth, perfect_obs, cells, feats = make_truth_obs(seed=3)
+    null_obs = {}
+
+    m_perfect = ev.compute_metrics_for_subset(truth, perfect_obs, cells, feats)
+    m_null = ev.compute_metrics_for_subset(truth, null_obs, cells, feats)
+
+    null_pearson = m_null.get('pearson_r', 'NA')
+    null_pearson_val = 0.0 if null_pearson in ('NA', '') else float(null_pearson)
+    assert float(m_perfect['pearson_r']) > null_pearson_val
+    assert float(m_perfect['recall']) > float(m_null['recall'])
+    assert float(m_perfect['f1']) > float(m_null['f1'])
+    assert (float(m_perfect['log1p_rmse'])
+            < float(m_null['log1p_rmse']))
+
+
+# ---------------------------------------------------------------------------
+# 5. Granularity aggregation consistency
+# ---------------------------------------------------------------------------
+
+def test_locus_counts_summed_to_family_give_correct_metrics(tmp_path):
+    """
+    A locus-level count matrix summed to family_id should give the same
+    result as loading ground truth at family_id granularity directly.
+    Validates that the granularity aggregation pipeline is self-consistent.
+    """
+    import csv
+    import os
+    import sys
+    import subprocess
+
+    EVALUATE = os.path.join(
+        os.path.dirname(__file__), '..', '..', 'workflow', 'scripts', 'evaluate.py'
+    )
+
+    gt_rows = [
+        ('cell_001', 'AluSz6_dup1', 'AluSz6', 'Alu',  'SINE', '5'),
+        ('cell_001', 'AluSz6_dup2', 'AluSz6', 'Alu',  'SINE', '3'),
+        ('cell_001', 'L1PA2_dup1',  'L1PA2',  'L1',   'LINE', '7'),
+        ('cell_001', 'MIR3_dup1',   'MIR3',   'MIR',  'SINE', '2'),
+    ]
+    locus_map_rows = [
+        ('AluSz6_dup1', 'AluSz6', 'Alu',  'SINE'),
+        ('AluSz6_dup2', 'AluSz6', 'Alu',  'SINE'),
+        ('L1PA2_dup1',  'L1PA2',  'L1',   'LINE'),
+        ('MIR3_dup1',   'MIR3',   'MIR',  'SINE'),
+    ]
+
+    gt_path = tmp_path / 'gt.tsv'
+    with open(gt_path, 'w') as fh:
+        fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n')
+        for row in gt_rows:
+            fh.write('\t'.join(row) + '\n')
+
+    lm_path = tmp_path / 'lm.tsv'
+    with open(lm_path, 'w') as fh:
+        for row in locus_map_rows:
+            fh.write('\t'.join(row) + '\n')
+
+    # Perfect observer at family_id level (manually aggregated)
+    family_counts = {'Alu': 8, 'L1': 7, 'MIR': 2}
+    cnt_path = tmp_path / 'counts.tsv'
+    with open(cnt_path, 'w') as fh:
+        fh.write('feature_id\tcell_001\n')
+        for fam, cnt in family_counts.items():
+            fh.write(f'{fam}\t{cnt}\n')
+
+    prefix = str(tmp_path / 'family_eval')
+    cmd = [
+        sys.executable, EVALUATE,
+        '--ground-truth', str(gt_path),
+        '--observed-counts', str(cnt_path),
+        '--aligner', 'test',
+        '--multimapper-mode', 'unique',
+        '--granularity', 'family_id',
+        '--feature-set', 'repeats',
+        '--locus-map', str(lm_path),
+        '--output-prefix', prefix,
+    ]
+    r = subprocess.run(cmd, capture_output=True, text=True)
+    assert r.returncode == 0, r.stderr
+
+    with open(prefix + '_global_metrics.tsv') as fh:
+        m = next(csv.DictReader(fh, delimiter='\t'))
+
+    pearson = float(m['pearson_r'])
+    recall = float(m['recall'])
+    assert pearson == pytest.approx(1.0, abs=1e-4), (
+        f'family_id aggregation: pearson={pearson}, expected 1.0'
+    )
+    assert recall == pytest.approx(1.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# 6. Random counts give low (not artificially inflated) metrics
+# ---------------------------------------------------------------------------
+
+def test_random_observer_gives_low_correlation():
+    """
+    A random observer uncorrelated with truth should give Pearson near 0.
+    Uses enough data points to make this reliable.
+    """
+    rng = random.Random(99)
+    n_feats = 50
+    feats = [f'f{i}' for i in range(n_feats)]
+    truth = {'c1': {f: rng.randint(1, 100) for f in feats}}
+    random_obs = {'c1': {f: rng.randint(1, 100) for f in feats}}
+
+    m = ev.compute_metrics_for_subset(truth, random_obs, ['c1'], feats)
+    # With 50 random pairs, Pearson should rarely be above 0.5
+    r = float(m['pearson_r'])
+    assert abs(r) < 0.7, (
+        f'Random observer gave high Pearson={r}; this suggests the metric is inflated'
+    )
diff --git a/test/unit/__init__.py b/test/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/unit/__pycache__/__init__.cpython-312.pyc b/test/unit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35e9f2c76be87eee40bf03e688d1bc8fc64e26ee
GIT binary patch
literal 145
zcmX@j%ge<81VMlIXM*U*AOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdWuTvtpPQ<mnVXoC
zlb@HUUtE-|UzA#qnpje-Uy@o}qF<VqS)v~wp9!Sm<Mj$EZ*kb<=BJeAq}mm+0u5jU
R;$jfvBQql-V-Yiu1pp-MB7*<`

literal 0
HcmV?d00001

diff --git a/test/unit/__pycache__/test_build_rmsk_gtf.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_build_rmsk_gtf.cpython-312-pytest-7.4.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc71beaae3eb275f86d42c970b04ec914627ddf4
GIT binary patch
literal 14887
zcmeHOU2GfIm7W=14ml);`myEEj(;Timrg8;`myY!u9Mbsk~VPDy4_9GG%U?aSyU)e
zxHF1u?r>Mz^#aWzKwP9iT(k@LX(7P|*4U>!wm}~X^u>gfQkV^(0D0KeTWS}ZT|MnN
zcYeiCCh6EIg5+{|=FXjS@7#N5=A7@GbLWq$8WA9UB>XmCjSIrZc;F<-&fNQZ$h<31
zVM3rHl?vj7#P@+2l9fbZCO9FB)|t>mh@S~hg!x{XQ20JF5v75v!bFS`psFG$0*%q&
zRdFItWhm*OA)uW!477_XK)Y!K=r$S!y8VijRLjGW9~JX@ub$I%t#`7-dgn{*^~pkM
z{<xN9`B_~%esL~eplLRvy`H|LPY%ppiLB!Y&S&j$@LdRK0RH!04dS45(K$Ko4A&`u
zJ$9Z3D7i_j!iqqH710n4;SHf88ItxZ9a^!{wCv=(J(Ue<QlufQF)2B%`09i`UMFXW
zH|;*rupV_vr(;ge+uyXaZwOkCuT8j(^~n&NHkqHZ(bZkfqDaG3`HfVDaa95|LZe0?
zFY+UzcR5$^UANmQ^Y#XDuIuc7CQ{+DSP?4{jeR|nm4`Ev>en;*kY+Oe^-MmTnY<z-
zJI<T(vxT_}<rC{DaLoW5nB>9>=f^iXFXf84be{J1TMYe6qABHzOJceE;E~xYsifAM
zFZN<~IH{Q=qw8!*G6Qf1$?!%zqiH#&bB<BqfWp!8<2uU}EnVPiI!)(hhx^MNlbM-(
z;fiwt3bF-ggdby<`j;XmLDgYyo5{SMLj{t?%jPf6fyOZ-&&*yy&6CbBvn!p=6bfmU
zn`PP3%xs3`OeL*lCUaO~#(*=8nn=$VHB&B>fLSvWX>F=BhuR5v(w7RQi<v^Yn3>6$
z@n`u^@}=T9V<l#8OM`Bk$>{0a<=H}}n9<=(GRRQ;n^A1r8v3CBV8}YKDvtOMPH3o~
zdNFN0eSE4klRKWDf!>#jnd2JE9*6GaGN3wfY{%z7zw#XK|0Ydp?&G6xFg12}SwCE-
z6aLzE&|V9|htZB&bbmFv|Hku+ncF?@$;;6*3uo8J!P<jvE9BrB8LU0%wn7Hi$bs5}
zZY$)#=c)byIdBgqM`#R1{uiJ}`%t7QbO~;PE+Ul?vT*1UFak}XOUMWSbO~dPmZ6K}
zsClaVm7$A5BQ|s)G)iLzv7t+_W#|%g#^UU4=pq3aQu$1Z00P^aN$~+@GXBtJ(gl`H
zL7>YCATa(YXVOv=!5336Jv+3Hu-BAcOqt|GkjU#OxyrlhggWO$nV#xr3NSE4$flG@
zF%^$c0x<-qTUQ+I2li2hXf-LbqgdeyBzutTMY0b`FAxnOJr|z6tPis0fi0yV$oV<Y
zCPkXN%$@|+1XjNV1Vo{t)|9@g(s$$VVrr4zKDVqqz3{EZrEBC^?LoH{a_sX||9~80
z`=P^Gw2++C0U2&B4M7G>K(J$eEl|l2IhBCI3&8V)(;%lZQ1Y)G7ldT+e7R##NevEi
zq71^%6qT~99D_2Y9EIay$iUgil8_9U!R!<(&6q(=&oJE#<cidkHGBl9Nw~UHKx3u3
z>NaIdrEAz<>n7QEDa=hkR+?vHaOwd3wbMWrgpcC=H%8$ubT5uCj@|CRec|@_dx_PK
zo}W)$yLf&3b@}?m8#`}A7l#%n7hk?JI{ryORG$++5yas0qBS+PVFrx}f@?^ggxtG=
z0h3dPhY<nfw4aYN3xbZ*$dhBXIdVnBx)QG|`OK*X=A@2KJWrmH=qq!j?@im3g_fmh
zd_n4x(}QVzXX=uB4sU&3q5-GRm4GKl4X>0?QVANw2)=#<W<oY(?SJ%bP9xLXot(F)
zK|=;DBV!#;Hqwy(n8P!TZ&F=y`Ur;aPG9@G>e%2=M}feT({tvpvD-Q8?LS)sWMBwP
z@m71(bN?QBvjtMvo0|qb@i@^GZ^G*u+D-Uetp=t&;l_N3oAx1FKZbRbemwbF>a~k}
zJ0tAOsI#~ABLVYXhNmDvV>^L=y3uo_I&~_Fp;&yC2#r^wM#PBnXNMR>EBJcQ_}L-w
z>~vrq@7eLyX*8!1=gLj^q7EPJv~fg?cF}GlX3tZo<#`G@EjfF8o+2<$1V*gTZ8%Rb
z{_S7I@zb741m<7|p96mX0adkf_a8Ix?kB$7cc1S3Dvut}->&l;TwM?7m-UTMX-*#i
z%i9zq>=3*YOX7xf`ZRb225nbB<nQcE?`dxP4<@xEw!<J9sq~(<N>89ippYw?!9u>6
z)0PPO2L>7XMHxyXJUp?<hj7yJA)I8WIN1R55I`a+xU}n#up1vqYD*Gt3!53))C^ph
zeH$vTV>2enT`m<F`dHVIaJcGk(DraZgBx4BE>itVU1nsS<@H=z&t29zq+zHx%x&3H
z@eS~TSRQY^ggy&10BFeskd&dE9g!3`$6+^H$m{G3_M->M4kV~r8NPT7RVPCY#~w$5
zk~68AVSQ#cJ)6;|OgUei%olPd-XYAeXOIoivfmFe>Oa`zhZyraB~Af!%jXJI<CxoD
zXV}j+sv$hFk)5@(Q21Vs!W)b}eKDiuvn&JE5Z7zh0f#LJt7^~nNLB4$IJZh7HPTZh
zJ-?iKXL^P7t;Tm;-&u|KFFf}l3D?N(D%t&;3paNB`p38SFJ8L&<O&&Ejcu#N_Eckg
zmSg*Bv4hpv!R6TDTI^Ugc5IO@$A%Zi*T}Bgqu*B5?rW(xUw-?wH($Gcc3JIPIJZXj
z*4o+H=RaB_t!{rJN7u+`t)1<&fEr&CZfl(2k(t&3wH2ee6})LfY!7O~8JUK>GiA7h
zqlP)`4u2_Nx8n49OV}N06Lv>A>;|{UCfKdE408P-`q9B|mBw4{ksnMq=#dAe+k)Nk
zCNQym*xlj6?oQfeJapJiy8%)M97t&>Mx;eJwVlJM9<0$an0BAbCbQ&%Zo0z;(^2{u
zK(}b#iYI=4Fs*Pf9RWCnqu*r#>aW`90Zu)Bo}B_2#|V&hisTyTz+`}pU}*{odUEoA
zfztd{keBQ!EPEQscaeMp$v2U3tnduxo<(vN$vBelA>lxYJK)aa;qyqog9P!<eL^uq
z{T$h`mw+u9V}FZPUPkf?l4p=`AKV1yevIT*BtJpIJ#=Zz{S=6XArU`1u)x8wEx|#)
zm}R+{Tv6B1e`x#aa?r?0Z0*}yVTBLFo!6#rWEZuS@UTDs=YXh;{3Q?w%AbUVj>o<v
zKw^!I)LP#DL`K@;hn7F9*6|_XYW=_AnQ6yQhHCv5aeP^h3gl`#_{h{YUkw92_N7Wd
z5<q>}C1{&AT8vQ%0$0%Gk|Ej>GGrsfy%}L6tgY*=CuG|7N_m%oMIvFaXTw;>6TlEh
z@q9!Ld?3#C_!1DWeMI+l$#A`cpa4D22HrJ~6Z}p}6JFQ%*M!egl=r{3>z``Up6omi
zP54~z3q&zw?(b4~JV>!sPeez9hbLc4_w^h(SGN5)0aQKk55ixDf4C9?L9E(>I15d$
z{|pEijjs6RLf5o}ZBi_AqZUk1%esY(ZQX)7X%#`nE&^lGDuRq%G~^UP#xsitToW)-
zIRduv0Jj6JykjS)AkNO^F2q5;1zkjgc8I+IIWEPyv^NO^-BaZbU%)-ZWAacuoa|uV
zNA58s_?)sIAi03#he&>eq*Z$}TBi;ChGB|vxsO`GUp!`o+pW%}HpRqGS*NU+cxq_V
z(TEm>hJ3oiQXPl3R2^XQ>N%!i5TEHxgUaXNul)xQzkT_UXJ6j9u$W!hF|<O4n;V&H
z<jL9>wXKjRKP#f$P{uC-H?$|BZ+QX$g*;D`c%@}1abB2*FS@BTCEk)9yYHcgJ6z#$
zR(Ly*l`aWc=_fE(Jm@RFB^!ZXNN*_w+?AW`GFQ}9f;0?45D0N_B@i+~O~WAy2!D}A
zu#Qg&L|e9kz8Jd<0_rXIsF_{n&Q?>q%;f~N6!WRYM#9=tBcA$f#OJCJs@is07y{lM
zM%Y#lO3QA3*A{Y~c3VB5S_r{YKxn6Jl|-id?H9$jgJ(smgb+d)?o$WhfT*U^t|nu3
zt*a({t<H&i{U+^sA(C!;?qV&^R?L|tXYUJ&STh#A0&klL|4Lw9NN%&$DT>{s$Od7@
zJ+wnyl_o_qj*FV+b2QBZ$eLx*pbxnfiw1zHP=3*;;>H}E3K?6cg3_z^K7;xMwF-f1
zZ9cD0fo?@tG%QC0b(<ufyP2;?K;1f)>~2M0{tfoJ9Z#(p@p_R|>$Rv@Xu+)VE7cQz
zC+x|soUli?RF?b>lr&gtmbUaJGhjsDgTFR%KO^gnmGE9PuyBvY)xOq@t3SxUz5D@r
zX*IF0mPl3;$>qcowM43#NG&IZYl&0U#HsfVtR&7X$HuFPGuKk@r~c*pcM@k7#_z<w
zAm8m8IaGU~tx694-`${~o)fvP`@R}_AxPfuAwaV*k#2I;_Xa%3?Wym-f`x_9t1dZK
zOxx=bT=9)6nTEK4C7^USuR$nyYZa!u>n;t4RN>Ch$Yy7#uTBHI%UjsqgwGYaG-|uP
zLh$8->T~?L>k51=HL}W@$J^uf_&dIe;|H6o(`R!rbbhfZB;3P=F+GMW%FIA&Fl8x)
z=df(E=V~1QFtCga4uT+U!M5SD+B0zKB4|pTEM~9a6+Jf=_jLUf%U%Vdp{K+(N-UGa
zT8`T&)P)O_x!6zVwR8zqQ$y@4N7+9@8@#bUKn7MkHHv+;W1Fn1+iU9Hs=9Ys?X9VY
zs_LO-^+-(}sHy{t`m#ELfv7#TcD8o;?~pzBu~;VB6nT!Qac~E>NgmPStNSn9G{C*(
zIJZW*P0?tnjMLu=q3{6~6`UII!4)pG3jPBD|2{6X`jt<7G!k;oLMsrBo@L>y(@4m9
z!mITKSv|fscfm7Qmps7<c)!az*=T8)$7wCVa>XYs?e&RIe4BXkwTACiUD|TFQm`Ul
z5%voyK@;ZzZ@wak@O$guF9%)`=EY>x5-H_~y)t3oQrVQi1%P70xBfH)Z>SEQ<+??v
z$<8fhh)FJ$Audl!v#?HqptHv3C*fiQb$~B8@5SZorHUy{PO+=V%_R#LD*N!T7f2(a
zav$gh-!rfre2_rXFyPraoxKTdT*7w#4T#^@(I}R_yBa^Zbu1lLyuQ8j&7IfAuAg0j
z#jrnE?cV<G;a?nnC;9WxrvV|-|6lEdv+H8tACSGD2BD2RO5X+tOTX*yI@=fceP8hG
z(NHqP&~9Mpi{pC16rZr}AR;Fw14;Inp>&#-vgx!LcD~-=>m$saY3tLE0n0vs&oRKa
z#usFmid$rc>@w>v{xdXbN+tOI!>Vh>H-DRGIm^%tWVl$^a-d}}hw%vRVB(;ei*tSt
z*l|3uA4wk&GyIL25}hmLzRBK!Lqvlb`rcs4swn<hl0@>UEQsP&;h&=aDJUNc>c_&K
zyYZlS@NOp&2R=y%V^6J)K6Nh;5T6(C#-iehySt;}=-u72c<gR>L_B<VS6JM8Z%0Vn
H$FKSS#K8oR

literal 0
HcmV?d00001

diff --git a/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69d220d59aa6fedcf637b8ab9027dabd6e933a0f
GIT binary patch
literal 55247
zcmeHwdvF}bncvRt?0Xk)kRU}-E55|#10(=GL|T*#k<`PMU`v$sprzYd?kvEC7Fe)n
z!3Vs1fqc##70VIryNjSZxek_{GI3HlOj4<0xe|wRa*ivGD}+D^%$NzCvr{ykJC{Vd
zPIt;5m*3a(>Rk+X7Z3?j76#aFXJ)&5dS<$R-?O`a5{-sj*naLhlL>aXT)&_LbK}3@
z;OeO4a=qeGT>D*$q`0%vemDR249F?B<Qnkq_euIM{{4RbOJIM1e+Tym`FCi4h<}Io
zhxvD8e}sQW_ec46-Tr!IyW;tZw7)@-zv9~8sCYq}6d!1_;s<R}0-&u*5Ok3e0$r?x
zLEDrF=n^Fgx>TtHy-leHU8XdE-mWx)E?1gB?@*dSV@eC?ok}a{3S|-KN@X$VDy0o{
zwXy_sjj|N<F6B1RwaPN(c4fJ8hZ0lnR93v^!|_Vvc$IOyT3MsqrL0xr{CJ&}emx!c
z$@}BV29&f;*$BE`X$Re)Yy#b=YzA#tIzTrmouHeQF3=8T3uvdZ6|_s)2D;^#JAQW(
z&)OCG!eAyJ%cs@68tcum*pVE2zBijYvPn&`%urt4ls=ry4kz>J_Mu~;(C*yfG*e^0
zfA{9qv0QH~e=r>z8fNK@y~Bg4d?q&-%jWtrshzQbbe?5Wu~cqgXgHswU(^jTC7Dkm
zK~l*K_H76y2Nj<Ga5|OGv5m=Wrf+Z{J(!OjOlOC1k@nc0Tw09{=JGL?J~W(R=~!~`
zSS)=spJs!}Z0tZLml;Hj2T*)U4drvWtQy-G%ME6a#ne=0=vZuRGRx9Q<yb5;$ggDF
z2~{(d8&s09^x)xmd+0JX&z_WQwo-ii@bTbt73~lWwcsj9BVm`TC>K1V%kns<Zv`~l
zCD`q!v}M~H#_=0Q>Y^v_HIo!%#jSYWl;1G&ZFLpBdB6E<f56<czYE@iNAbStGjkMu
z1z$gERKA&4m0b8eTK=ocRdAd2K?mc1Z@BeT$RY1@t#oy`ROtvB#xpJne{}As=NZ=#
zDemuKlvI<BXs)=V`3KWSvYElOCI#beE!3MyXO+R^Kw8y;N;;bv$lx`_{hFN3sCi9(
zK7CBp<lIntQ1hhJ!&+!hCY67RQ6epTguV~rP2nKO4_NMqip+uhz))f+nLnt>RJ113
zc}9;%lZG^R??FZ{O{FIn)3*<|ZaSD7NN>svB(vGvU~-enQkz(MD4j%0rbfPL7|osU
zC4`1;TsDzc53|K6jb5s%f{eP(2O3Und9i-9`@FyLtbfUc)+J-B#tyxD*I#USb;F;x
zPqoI6hb{!8$CD?T#}2#`ymS2V3z7PldtU4r>pZ#cjC?M#_qcRE5PUiMV)UhY+6liH
zKC$OqV95u5S7^n10aqkC`p^eKd;dd~UhBWv*}cv4);90`E`Q2+K{zWk%r!W8#nlIt
zd|fKKg(m4z+yx1**=WxDTyf9HM*gH@$yguD4G$`bJR8m*Y{$)_M=9uC`C&E~%K!lA
zS+`I4G=D0c%_cI67RcsO!z%3s^{2>pdy@m1>@og(Dw|aK?;#YLPNYzNex#xNI&q0V
z*0_f?qTZ~T2)#y?Iso1rTZC_|aUl78Iw9s|F`Qq8k4kT3)b-bm&yIfPLb&C`(`WDK
zIJM>U(y8#Cv+|y+_|cUTJ0U$oI=*_;=)j8Jfjz>LlFY79l=D_zbThZvC8%2#y!}?U
zrK{+h;!)&&tHYXS#qKqNlZuz9c@^qRdtA>w2^jGe+<ROv<IVQDQtszsIOER;%~J68
zd>m99x_7~^_{`dw<$m4$(hAvC3>5v&buIW5nb*d-#sRY&ry9TXwCoDH6hHl$*TsE!
z_MzSqK1KhLnUH~B4af+Ld@)4*(rkXc=!W1Hv7<3+U$H0lJ=xBGH_j(`t5bjsJ7Zrm
z&t()G7|n4fa2xHbSme1zLi|XjhRlIrBy492GI~9l6yl)O6ZdI8HOKO4g+=JRm&1%E
z_k)LM9yQN2PhUD85Ayb@HJasE719Dm6;#$j`2$7@m9^5LOoy73(qu48Ri$@H_suu4
zZSx8zz_D27DBFZo>+n&(4>Ia{FXRd~p15Z+aK~u(h5FXBi|(4LUpxAc0WL2s8*4fr
z2)!J5F>qq}_^R*5zZF0E(A0{plZ{(X%ae`woD1ywS<ABHd;Z6sN2Zz|ee0pqhyG;u
z*|zw}Rg-O-rrWwE+qzD5pE@+vw&R_aoo608zUOT7qvs=aFF*9+LnodX+w=7=j&J$y
zj&JQa`NX&Hn{4eo^~7ZBj&~wE-}ky&Z~wsS3N1qA2&n$yir3ZBBN29)r#@6~NA<tC
z`Tk9wx1y`>Ul(|*b0hYDxXw#-llT4|r3gl^h4;+A#164mqi2d3<}T@jo`Qsavs!1r
zPOMh(IQ7wjY_@E@{hLRU>!rJRtM)4R6I!*~f&Jl}(UyH9&+=Ahwq+d{2a-e7jALsD
zbC6qzgT;69WO<ex)FmjuWGvQ~9!&Ec3p!&l6FPbNk*HOm)t-41W-IZWdVrkNZW*3}
zixKw6y#ka{lhhlchd3kwmeBO-IILR5vrhcH10VGZAZEL}Z!)j~?M~-p)YyL3y!iNT
ztBJhw%-5e8Q~&t)#+Av|b?-#hzwdFiw0+>AX5iCV7;h5#zRtqFvA#Rtc`M-UZt|ya
zA#0<~!5e@E#`cQK2IPK`(HY5g2B%Z=RXvMzMORVE%VwGVB(IfiMKq$KNAVP8H>93|
zryxV#GOhqvcNM(PujNqeJ0YpdZTRgsl@y^o#ar<8N{Wwi^;)b|@#ifGtG`Z2ZtR+}
z*kPC2DwN+aGiAYI5=>0lR4L`41W*ptT!ST*YhXW1vzst6heJ(f=0J@C!c`pd*wSZ}
zbRM%7Rnl2qyh=K+S)LMn%{PM9QuHe!C0y`lBz_dg6O8jsD2arUvaMjTOS8l;ON#5L
zRP^EF#>az?T=X7u#Univ-peHXU#3=fnVS4%YO$B8dtSl^+k5w2`qOvU{3_huc<+Ry
zd6Eaz3F$-ku3Zz-$QM_y9Xhr-uEuuliqUQd_CsGX_hS0VE#^7w7^gbpYI|~Mh~<uk
zifbwPwm4KTl)Q@$?vAVRP*IbZ<{LUj@-T!w53WO`lgJh#+lbt)$#m5Z>mlH!SzfOM
zO6R9LCnB0VeOL=ZvSw;-Fu~aUxF5}@Us?-&S}#74WLje)1z|h^`H_)6m}F@!f}axy
zhBMiGW>D39z$UOwtu3J*%nfH1o;1;y%^gT)6I}2QCe&nano7{>x_Nb=#O-A)$F$~z
zt^+30M~AY>L9T0pk9VUsI;#%Twe)-F(u9Mp;=y+DV260np;uPFNJr^WmlLPwx^%*N
zE<k-KwHe%CLf3RvmO$Me#z*}VkWtr#aP;MSUcBee>i_bNsql^$_KbFq9=a%Rn7&E>
z@5&q4ov4^IX#Wz8xn|IIzb=`OTQg{Tik^Ze3{_h}H{#GO7rbv;d>XPdR9g<+{(`>-
z==K%-fNnqK5}><w;1mJ*`ayR<3EpDR9Tb3XKA_uz<SN(onLw+A4CoFhVI@)s7|<Q8
z8M=dJN#?f!-F^|evF|PV0N>FbwgtTaau&%->~134iEJaXmB>yapCWP(k$XV^+|Lru
zioo4rf^-M=Y6R|$Gmv`+M{XV02x=#MY!6*^FOdg`JP1O-y@MlkM=5Z38Nl7e9zn8+
z5HCoR66|Tp`V5i%lwSbIyLIH#p|fK%dzOCZ$ZddOXUWkPl_V!FcM8JiT4FK+8p5*I
z3xgl52T7nJ>;;Uav#>5IYyi1?@KIj@xfUvJoR)9e|CGFOCNNs+%$gevj2sFrIgvn8
zeV5D-LicfyTm>0)NoBy9DbGOywB=g|=$n}X5};f;k1YWz-kNLPDwOFUXO~FN6>Ds#
zL`}UysSbBnuw+oh_nOS5L^)4}E4~SNN%<<<Y}K<2YM_w-TKqi|9vwkAmJq5yzvD1t
zkq;jf#ICI-f?&rM^K@wt5}9!dnA4t72a|(_;D;_xvYrfc!$IENCuAEk2q?OUY@G<R
zM^OVteUBXi(Y(AA_Bfr`N96NFo*+UhBZCRc#hwD0@alMUkWO%T5a45rjwL!)Y%5nv
z?<m8Cku=L$XaM1kkvWPkZkS<zX$VQd*LO^XH=06H+q8T`{?Ey6AF->G7pjkYwfZc3
zVy!aJLRZIocR8ulb_a%{{PF1OLPsCMUGW~K_Zmir!yu3`EOmu<BX*R2ItEhSZEi2?
zHt>2SdecV|L$G7!RaQV2;^pd>Kr9|Q);QKZ<{NJu?;iI}h3~p4oo8-VpE)WJP#tWm
zkF7e)*6ITvo1r?0R-Xmc0d326Yk-r^x9Tu^tB<KV__^wEBdtEsp190i4nn`@4Bj})
zIX)>9m&s=7q(ZPy(+?aFNX}=wiF}60XNhp}-LP+PzFR85voBI2;>GMaBEJXXWTjYI
z1|-i9=8g;|$km@_qsYVa>Glbe^+sOLPKCQn^teUs6AMOVyga^@@JfPTzV$ol_Lf?K
z&b(k$o;|$Gr%^fXxlDTAWl}pXF9(@DhD=1b#IJ=^vQiEtP3tZD92Lif{l{p}RsbAi
zfA&w1#0gpM<j7(d?O*`?BV;A)5)7cbUflI(%U&-`g}3}J0nCE@g13yY2oYcwOrNs{
zn0c08xOLOAO>o;Mq#Ayfi!Q`XE;oS89(&k7rEJ&2j=RdS177s!6PdxoNtS`VFp<wC
zRCrR-?5ikc03QQENJP;Qq&OAcdW$2-f(r#NPS|X8n3eZ?0Y^RtE|eVJGi;lr0abRK
zup}k1wVnfXm?qLogri0u_8grIx-n4@Im&$tU!$D#c?Cqbix6BadHvz3aJvZ?H_fpC
zpGIZh!ipMv3yRby(Py>}S4U$bqLXX3&Wm0G2kZPyFTtGr3Z5$FJ}a(y79J7Ngy>N7
z5?rd?*jUX2V*3@m*V(V&>(P0%gPk#M82SGY?O!)R9;Q88!sk-IlbC;mx{Vh^aOy2D
zXVYb7$d{3vOqeQVwz}Kcony^ohsIJ<;T2|=yCKGV_yip7_Y^?q!RP8AC}J3}t8Y$$
zXGJr%S~6p-(NQoA01hm4M%ob2FY9}-Rg*ENY(oTtn@2Dpe!+ADA$q}65Th3kRPJVq
zhet19*g;A3g6O((Xt*U#KU?!Bfvs$_pMhD!s$pMN31$xNO0%z0(Y_$uty4NHBD+dD
ztFu>0XLmK903RRiiCOR5y#K0CY(@?|F<+IM&ss&OQl1?};eRcthrB5epsrICMF>$7
z6a*&;krWhFA>%{Q6rp%{WckBp98}DRgNkjuH}?3W`}V?pxw+l6nsNu|m(j>BD<GH5
z*2aS?;?l^gMi86s`$RNbWp^k>BXtZV*npZge4>bx+rT5)5ETdKs1;m?LnF8jdq$ZZ
zaIG3AZ6_;*;g!bMb)ZZXr8`mYM}-l6wg+Ejks0hEI`=RUvg7g?jV+vR8z~8~!%gwi
zclTI!8v8bQl-;cCH1Uvb4K6hkmxOj0L2X8$M_0*jTe(5GB+z4f$<Yo+iEJ_F5O%#0
z0m9=x*f(+SFW{qo7i83RN&fuwt^NOw{P~%#;4muBS;?<-$qa}3=FCc}Fbc4kB8Gw#
zJzz0lFo--bSqyoxFl@wx8XAs*khgYnE(;z0qs4ugRkv(3Rpql8IRI#G0bm^%jV()k
z>?BK@DOHvMZKhN?oyC-@JTdF7S4n3H7mEB^fJ3{l<~S9LEtoCL2#G5Pb3LNj+cWYF
zAu6Q<0VW1MVy`j~i*YFw250261L7P)Tu^AcjUWbqZ#My4kB1}RrI;0tzVUznyrk|i
zl5Z%;WjjmJA+Z#$PeL5#xwsJgD4iw6um*8B?zbKEVyxUYmZ1XrK}z7WlVx#A9HFQW
z2Wl<H$MnEZ{up~5iAh9PNrE1A-NY#1vgqTtoq~cbf%@0FKrNxtd~-rUuAjPX{j}(+
zhKl)sRh|RFVSEQdg}L@>^(ZT7SSnNw8h%;mQx4)FMtV304L4dfau8nyv0KzETg*K6
zw~dA+VJwU66y~0t5*SMsuUcy$YUK_!u%Erz^Gi_Qp(ZnPphjX4vqPSOg`G<1y4Boj
z6*bp<Mh?_GB(wmhnmeJnQ#z~dRVmL2&7I1#(7Z}{7IG_LOQ4S^Q9+<zVzwcnq^0IL
z`@2vnELs%kO<~>@uj|nbdL*TA=fI_}{q4US|KLZb_#hU<^P1B7pO%M0TPWXQM>5tj
zhQ*O(fEDO$1Um-zL8-(t3|^tLhz%IvoaKXGh+$Dgq>rW5;cQ-)rHK#Bl-P-Ju)iQu
z$~d@WzO__f$L0Cy?eRKY3E;fse#*=ps&oa-gKE|4T7}3?(y<<zJx+TTb79}2!gvt2
zA+B%X45>sU-&S&Tx1)frdueU<n$QD4R7}l~O=HY`hOzJAQvVDe^}m9Qx_%k0fBCr=
zpF=2F7edKAQ_)qU4_uUYOn<EYr{o<orFp_sF3sNyGmbf^9%dhSZ;McDsV(T66JIO3
zIS+#5Zb|J`4MSB*q+u^!XaX*)L?3vaA7|wPIFm4rF#y9*fDeWV7J?_->c6V26<{1=
zkdI?TE@3cC?c*55F@X+JzI|T;Hdiza&CCI876!xjJMdW{hWz6ci&t$Rk;N;kdcmxe
zoC8;UzCskc$nhLx9zlAk_LgO{Kh*q|XXM}<xN^-cjya2Qw$*BGR$SC|E7oF8?bG7n
zCGt__XMTfP$1Fw&SF3^j%*2#V{nJW`w_I&RuqDR`$mLX~-)w7D(plqkAs5wgZ8)Tm
zTof{fi-l_*E*3VgXnq^Iu{p{&Xc{>2O{0JP#Q?^3KsGXbo)jse59;`R+T-KI{w3|*
z3j$N-Rbz;kF2mSnN6YGH8y(5!XtQd2c!zGv)a{R^DYMhku4&ISWugQ_<;I?I5{AYy
zOQt1Jb=V@6qn1zd*vuM{B~-5xiFc5+1kuYFzDByXCM7lL0GGZXG3}v8vX=<g*dL(1
z2Z`ABQiTqCiTp8<lSC#!xYWf*yLEPyh-94*^ytUVF7_{wE*@d0D9^tp!b4*zMqZDU
zogq&d!jmCgm5hPw+^mn06UM-GmW+Yx+@g|B%spiW(bfGW<?6nAY(SqPi+vv%PT-?%
zf%-g(0eCIn$c!Eu?Y=1YOfURbdjjvsJu~Gw()YML_ZwjQD~L2yq;Erx!?zpXQso&Z
zU~eT^j`CxB7M{-{Ms3lU{-V#;fgK%wen@EnNNIk1mVjaq_S_(s&_aa}Y_}l+zZ|R@
zid+amLi18CK|-tDeI;VEgM?<wPF1B0O9KXq`>rDoo*@g&$$?f)Mw~Uj5Z;Ru|FN`W
zCGeV;^Odj?R6>QY!B-+R^OcCnPt0$FuaFasoMwSya8B*aGpBZjTtYhw3$}ufCa+j_
zk*e*?N{MzBUT9}#tGH#`nLMX<hFn5Bi(I#MW@!mk+nJRT?JTm;&Tjd323y4(_$FC6
z>=~M(*R7pdI$hOvW~D?si!QXY`D|xg|M~%mZkp7sN%w2g?jMl2!Ttl09}xME{Kq|d
z;5uny6Vk{dWoBw9M!K?O$CuP%q}D@9_@qNzxxwi8J#<N~+?47yrE_yYCDQ{#x~X#R
z-VRObH)B5<m2@8SWne^nLRT2nIfQ^jwIF6kg4J71E1H)kW9rqbNf%^)MdU|xX&E83
zsj?YCbn`MF*RK!d-@ykKs^sJr6&+&^&n~X%%&4>(|ADEU_+(I+cQk<tr7|jw_5Uf#
zdKDk_A9Wq5;f~RVwMcAq?}fId(`_3j+cunh=5+AP)~U9KC*@X}AL;lC%(&WeQC=}U
zzyDM6iko!NmDBRP{!hs(=l@R6$tx<TbAQLxISPI`=q|cw@)LYZQ0u%<>xB6uoy_z`
z?53#J8A=`IUbR#bN3|}X_=-WObphK|Rt&)uAL8sOTnN{o*7*xzsC5C#C9o&aefrVF
zZA-YR%ATy0U{6qBYG#ntOsm#2Gy#83YMrPaJE>uod9PM=txCF(SzDM3b+s<S&4m#|
zt&7&I)<w;d%x^=j3zJHRPp}x8Q#<p_shuI0(9Y^=e#_O&2bQu{wVhch(a!1?+L_rZ
zZrOI`ol`qQE}@;(U$=H<3DH&CnUxaltbU=L-SX|sH>Y-nTtYi*xNhytaz9mVXI4tI
zvxbFsHlOVbYF%SbnGVO^g1*4c5cviXQsvkWiEy2UN8kJ>I{eQ>9wyR4#8P(Hf5G8I
zo#y3>A!wf50hLjpi<4r+enLN#M$qeyr2j@g&_pXL$!{h4kf9Li6Clo1hz4`C3<E-K
za%v`@Ih<D6+bH4NR8$0kRakf;+Hhj!@z0FzJs+++;W@r;ba$Cb^o3KqPq$9B-M63;
z&68qu3#vq&@K1@;@tRQ_+}lN7FH(=_+ifp#(Qk?BXl%w-Q9bgRsrs!*O7jf7V*w0v
ztF9h}%^dyYAuZc5%#G9|xlBFs=IhOb{SLk?^IrM<>UQ}~YJ;Wp0OswK1PEoDJ=Ov+
z@5L~!T5GUGDA%C|dFvUPzzOxJVohe|K#hEr^VojJD(Nh5R+V(NdgQm%BN{elt4E8?
z2Pl-(W}dUZ3$>_6)-#$#J1bX@>}Oi3ot3Ibp_<>a*m@k^vK?(zwVjFTQD~u^nY}`p
zhjs>WBHY9NJsRh%M2C133&mhrGd^78$CTR+@%=m<K1SqmBKwFuL8NrPRQBI-?wXRq
zwriM9xxH3jIY&)EOi8oDX(ch3!-^s#L9lmF==Z4L<=lFT^{2kNcXY27T0#=TZK4s?
zQDXSO=kkJq&Z{LAX4Fgbdqv!!qj3=?MLkyJA@H`Du_EA}o*Qgjl#4;d3xi^?5aMgh
zg}JHGiaskw5aS!+rp9O?TGNF0QSeef<q}k;>lZ0&Ay-wE$w~>%_3@jn<%XFWtC%FM
z_z-Kt`#L^rWu0M?tgqQ5S#P4M`E8gaqozp`-xPHcE=H=FCabqnc}}eqxrA1F{rt6}
z#7BLcZ;sLAR!X$e=qw_3=f@ykZEn?tR%*Un(nP40)>Xt(-<(=0atW>U`k6T`qf%8Y
zwNj#$)-SYD6VYx(EQNg8(8Enr_mZZ~Lk_sPN|W|#(gXMU>11g{f^N;4rXS9OXwrlF
zY*{>p+=v(7Ulyg{I3})IoPxO`RBRKU1p@&NaT-F+O=Bd^#vNA`(POa{s$QB%FA;8M
z`+eH`5fL({#T(dJ+MgscD`Og;wYZ<M@%ZfLX)g<+lHOlxRx?)%U2_d_84FD1Ggxhj
zRw~XWv0@ZTpdf4Y-*WMo8_~4TauSX=RTYje$*ZPs-v8V3s*5-8>bUWnUV$ljRfT|n
z@54}vWyGa}i0<Z;tX&l2eItVbJPy__|E!!EGPibl5hpG>HLhQbw-x(bm8)1O$+q;d
zi&eW}vX2#e8{skDwzaDjIKnr9?>LR(uEE-+TOl|bID;bXfHd8U@e?}74Gd*o48qyU
zc7zqz07eS#%JgPZ8B8>B0i}>DM4bTPzJ}wl8NkzUe6`_c_@xzk<!CjyD6gKL-~V&+
z>X}eLyp&>ku6}4h0WGYKiBQ}b4hdw!%}Cg<KInr_o{02*tk+@~$oYCLw1_Cm>~mxB
zcXvVNia4g>QD3!}b9agwLadJ$YB&x5kOk3EjBjxm{_zHGg*I-|hG}2loJ{5=13B!J
z4pBsQN_6s&P-~~OmNJAm1-Cw#43CGwQ{+c}dm0C!Bqvj-1udqiW+<OtOw!>~>iF}-
zqO7xip2#CS&w8OFezfOl87lyqm7k}Tl0@+;Y3Jp6a)DEGr#kfEhTG9<CftIA$w7vT
zl0U#t+*OLG*QPRglE%z=^R!4MGVh^;G$8N8I$_2ijOyc4>hJ|FG;fY44sna9)=JBH
zDv4w^(>Dn7q`B5*jh4}gfFL>kZ@Ty|LBtt$I%fo;ZqZlBA)k~fA?l;;HgEuRtWSck
zGu4|x2<$*crP&VHuaW<4eAF`_1ci-bjpL1z%`4AFRuLrDpICX~(22IQp{1j{FEqDK
zH?N;;UVn1$sjky4r+sId&!o<TrkX$d!Xu-5k8iywFP;7f{~s~?S$Sy%gp`4kBrGYw
zMo<qZaFPV4vNN?{C2*Bix59Kj*8?lDay^fZEX7LE^zqU9wl$QkYTvd}qHhOl?%N>?
zD`{oQ>x-2p@0kr&n$NQoE2B3VE2Fo<^8{96h&D&Wt?2Ju%?956FYE(BJNXX#Tl$66
zZuWB`9FP8*_I^PmPlVu!{U0KiK-8Haw6X#S<ro%<C+ztR@{)e4l2ixmX<0nhc;fEy
z-6tEz@1Bfo0Q@wy68Ma-96vPPHW@+?NI3-MPe;zEXVO#6kKFnY^lL##<6@aBL+iLh
zFW0wg1#Vk&hn0hkrw@~i&Ppu9ykLMD8ec1l5#n^LHv?*XHFpia1vTV{zrIit;!krn
zKx015QlJUnWS|M(3QrS26Y0_6grg0g*tRmJpm)>&EczN5&eDw`(<D;qY?i%;;s~`=
zn)iSUHYauyyqt|J1F(c5GqHrk8Shl{116Fzo|fnM|E|3F;#~9U7v++Sj?3Zh?C`#k
zZ3$&~s6+GV2bu@pn!NAf9@td3GMem66Vi2RQkNEbq~r75oq9Ge-`CuabZGJ;Jkxd#
z^P1F|vhW9mSPJErfi+@eRl${~_ro`9UM5LsX;$=#nabs|`++bMp7f?Qg}A9;7Qp1`
z4iuRv6eKTZdPU#j$`Mu}RI@^KjZ<8Uxl$qZM@xMb@>Epys-&|5tYH<=RYE_virA~@
zHJW!Ul;ormTK1YUm5^qZ^Wl!orw}%tLgW+u6r#pcsQZLIh5C{n(eO!p3g97)J(@S0
zOAV_Nk{0g6a!A8jtc3&%9!ae1eIh%E5N9!!dOq*ZF7DugKWM^DGvIdcHR4OOd5l-p
zXWIEYDuAp8?C(J?69noa@HXsWsrVSaH2;H7Bp(0VQ~PvnzcHIjDv3Up8^%CR9zo6S
zqHMOh{{=dHlE_ma6G28Op*xz+Vh_twr}Ifs!QZS0SpSNC6o#8_6%V#MEjXt~SkI_X
zf<HFY(SBz2K>80+zWQ4btV<DCGPd?R4^9O(jCNmWYME|YGugD}JL<`fZy!Fr{an-T
z7d{J}e#<3!-Slk!-<H>1oK1QZ&a=d)<yri{Eyrhjl_`0hZXR&tcLi=9ux0TDez(By
z&`VtN@Dw<wvt*x7B)<dF*Y{}tzBGb5GYWH|i|Pn}1+U;TSr>GBgqwcw5b+Xukbd|F
zB0f6gb_)V+=BL9Dh!bG0<EDw3fN5O41TvdZhD_zWg<>m1=8khsdriokopb1IdG&1l
z+}txgyQ@r@4kX-;wOR81VQ!mPz(0GjqIu}~y4LfFmiN)~qdn|)G)SS>U!}t)w9{+%
zdXi)GUSF}V>%NegeZ4n1kjWkc)|B-2R$MZQkNVG1dgb2!sdG&an!SB?vEy&k92$2{
z&*pzhzO#bpve0$$wy-U9-T8eAde>c!iwa$rREsKI_y186&af+X-SyXC)2S?6vgJK@
zF?uca-Wq!D&T~x<=sox1ET@!to92X>-C#?MnXJ%rM{bzXK`|Q#-IyvJSD;NWwRyQ=
z!ABIPHvcRQ3jwUeM{ylD*3@Qt6#YE(|N5BPDD2-hwdqfx&Ugy-pTwu2G?4R1;ZH)p
z|HCju)?jXJ<ZYno<J|_1MdQsQT8Kg?p33A8#ttKNLotjd8HFZ^{*qvKkL3ol$1tI@
zekx|D3NiIyZaAy(Lh`wMG8@~vAvT!H$L@};O&?8Vhn2KyWFa$rXY9zqOzL1Pqw++V
zLH(jJ46BQ`^Fp3Dh~Lsj(+o3%A#46%I+n_@bgVaXG=}NN_z7G)liHLS%%>UOjq!+I
znj#G@)!2dIKD{W`2sqR&WMtNYlUKqCt-kC&Z8gg_q%j9It4GkY$Y3@hedwb*%?!q+
zkwp|sLuNPqjA?q~Y8Vz5DK&kugUHSzRJ*5Xk1Sq{q%Kwm;-s*3UxO2r-AV#+EqEwu
z;y@bDDxFC7^|5qclAa8^9Yrsr;(v;(jk<mtaRry!#;^Lu>H3wE^(!v~!pFP6>Kl9b
z{E{^%BWK&T{?_Yh@V)R5>|h-q1YF)Vvx@nft6H~yTAs!KO!i`p=^ML&?B$^Jrl*{_
zU?e;NitfeX9-PCjngGSi0mXNN0mbwk`}xwJ*N3UlN^u5LQ9^Y0{WO-KY{RG=xfp`4
zpT;&~oMRzudC(%b=ZFEjmj(Zklb0*ZpAye8doPy~%`Y~q)!%0B+219hmtpgK|59_$
z{<Z@u5l&>Mbn>x@;Y86YX`RO8JC$jT$*)qLHG;HCI(xif-D^G`<Qc^XqlQA%2>GlN
zLO$)gb@K1knSMR<+X(sO?E~$ER!lI)8*<ME`$s%f9if*H{E<roZVoqxR34s4kP0qA
z=n79px#5donB9h)aC|)NfWVF;WDvusPwt5wV<7}uXs|DxcV1zT$0bhq7#YFzB?@@_
zPubc~X{rhhi@}NpG1?DM@<(YeNP88XBWx|Ddz{EVB2R$C>u1F^*ip(jP9#I5pUCq>
z_$V5h#ex{Uu<4<apQ@3wB=ZjLl$E%%4fv=(LGfkQf^v@XAUMiceHlAhJ1yT;<Nv$z
zT^Hr9=^Ow5uH03Dqx?NWgG-|VD11`!yav=q@Hl<@@BzXijNDK?5aDF(VWDstkE(3T
zlMRw@c9ekGg9Nq3&()ToA+Td%W#Se>gn&KD)ggijGu4yY5-#H~b{+@3wdX>y;}{%6
z(d`I<h!r6aRq6^hhp}c+5lU(>@7n&h=PxnuUqlexv0Dl7r@(!~j&Tz99sSNe1trM&
ztOEjr&jxDZvz4Dx18*yh#F+&?Yv-xhR+J`te0u*P``wy*Zx}m#h=X0G14l05vvx|S
ztBZWLNoZqEX>C4RxlEhSR!(R0*~;l`KHF^Z*%qZ$;Ip;%l3DOs-d5y-$B;u`6`qY#
z|5(J|{9?R$&m7)-+jV>M))0%55o}Ix(@Kee?KWkJfj{-iQsuTny&=Ig*z@}c_wR&1
z4JPiG-zNTG5qnpBS&yzF@;JS}K|9uiCdiT(cQf+T={h10&EUQ`evU#joHR;3G089N
zP9l?(ZUyaG(geE$hmdZ{g&`WN5gAD4S>|Y>7gNI^$U_iNenAD1loF4!O|(yPh^B{Z
zq!6)@Pv}|aAeA%<QVFl`8mjNLblJ*M3A;>nI|ibXf-J};`mQdaEVCsPvF?cjNi~yV
zaa?{UKI&s2Tm(5EXgI(0w(*Y1=GCVToslLt-#;1HeLfgHo<APB>X(DQ(eB@dTuW}#
z=M{f*->Jqoo}6miLX+t2KE7<W=1p!9owfVplDwg8edQZ*_>R2c(k!jD;vD1eRJ7dc
zqe(<}Y56!3K04$&ia-d8qzU2(%JlJfHrzTvnU4p0`{xK{0UI)^PCaMW!+|B2ObKSE
ztOjKW?zRHGLlE1129)`2h!vpBfz1}KHU16@St*f)Cy80#TLC(JoBNa0JwP1qIaA<6
zN-m;MeS`_2H(O1F!{lAGXMr<cj{-~6SqRP6((g{1;^S1Jr5f5%by)D;HZ5|iq7BE~
zs%Rr@07aMxs{^axUMd0BlIh0sESY*ft+4f|+Qay$G;NxNv-Qm<9y^{H%a1=cmYEE!
zC6x8q7`u4M^rCf>i`Ko_b@I>~+ol$EzVJCL-B7qFFQ1<4|2cX2N6G$<yj<7qa1+)>
zoswYc>N@P{1LXJ+Vh;@-h9)P!=AlT4BG!egX{-t+)$em-^<epRYj9lA4b4T5a;UmC
zFgmAKi57jZltI5M_};9uY-Kd3m!E5VfkHriUud>Y09cPTelb-s<*8Ygv!p4dv1R}i
z=RtA+Kzn8>Q>H3e7{CDtXXb!x*C^brL!Q~|s#VIf=G&=~&VpH`$&l)UO0&{Z2pWJL
zsu^Z1eZzcChV1Eq)r=MpAS;>|A79ar*3`=F05!EHy(O8Y!XE7@_^ej+KZX0db}L#$
zc@|oci5YgFh4A#8TBX&Bl*QJnfzVQ{RRe=Hw+>5dHQ%FC>%hBpD@zwzM&-A1YvId0
zwv5|qY#GbIfo@;mK-AmK-nPJjN;wb&-{poigby4c>36~-%sD~ptT8SQ^DxG3boe(!
zY~x4|9quIZ9U}J-AzMjY8u_B>&Do_7W0{R7Cpk34az{mHPV#`diwZQ>puV3jRBC6@
zeEPa%FtzZ-;Dix0$fWy=uw(d(uHix|zc40N!xC`U@*9;d0f!4R1@+Q`_Nr*FP(8mv
zgw_r>7ld1y$_)$+=hNnLa0$5j6Y4Og1<pH8NX*CKTvLIF*wb!kesh62Zq9LXC~=*_
z@+xV<I@leXud(v=YHq$Muk%ov^<q^!b2ekfwnMjgn)HibfMBfZlBMH~Cou`f`qPbP
z8h_k4+4b4U2xizgAF9Ww)w+i1y7iNF<|G^#0sq6?RNa%K4_=h-o1U-#cjfyo%AM0U
z`2U>Tc~RaveS`nc$(Xt1)6?_#KT{1O1(mB|W3sJ=y;kB@E7m$)0C&I9AXR%Vr`f7t
z*A$xl1;1KNXpTbV!m3wu6q@UAG1W_)W64R#wdOdgEOyO7Xl^is=77?uG!+7dN_BmN
zW?E{St5jTQZmvx<f_TkSzTj3O5DB2FFSHZ%TWok5zw_+`LQjh*^l(F;E#s7lH+<HI
zAK=ljcJMS|sew&&$c2#2wAVp|ct7i;y)KY=$Y#rIE1h*>$($Q+qhE<7tE9b~7Cv*1
zRDCMIa_);U#F!pcUr2zl9Vp=dJ}QN*7_1kwU!7_^-8i}aQ<IT<%*i5VvfVT4k9VKx
zpQ?L|SnvGs!UNOt4gWtUAE>~ewn5gl&4h~gHO!=?&!R<UL*ux{No`;z^onLe#ka@x
z9E`R_4|s<go(_0C)QgoTnZQ*4GPq5Ea@Zys%w9!?O}_Oz7<NkB@Ca%#Qaa(gWfZJB
zdzO_FXh3U=UauNh?)|DYuu`H1@S7Mu6tD0E9o)W{v&-}6u5Vg-mFNf_2l%(w;5fc~
zv-yOCk{kjZ3zoshR2<1nQ<Pu&zp@MHQEyBL!!d)aL|l2=h>izSKG%o-5Eo)x^5<y4
zq2sS`PV@ICQ>i3Vw6HjjCwUm4FIjW>8NqU$@DF>QPao5y0eGhOP?>u{Mxu}ElitPf
z+s@cXL+D;%c0Y=+djP58VLfnd3(6J-^q1)nq*rK-q<z4doY0FIQ2PYugkJ!$d#S8l
zL}-3zaY%m`4}DfC2FgG$-96Fns5r26ewYoam}W4Wz>?O`Au!h<ZboI@D7+sZ^;;;3
z&vrx8_;DTLET2gW6Zst)J2ZZ1vN3*^PoxzHO$Sy_23C)!PwqRl<<y$*Z<-2xX0-dN
zTWWYvI_|j?XrI0T|L=RLc#bb|sVBZw^CuEYE|o}V0h-e%pBYqnD4@0^F?5WcLA(BJ
z5yaij(nm1&Ls|=3r!>EDPJdBRdbsXPjveAVIkW=xn5qTx>7)5043l9K+tC7P1db0U
z5h~6D6c^*19<<0^QJr*{ATmINx&kW@871;ZM7~PoYeZfq@=YSI6ZsyI?-Thck++Gw
zLqsETfyl3k{D#PTM0lJe&C9@=iO>)xMq`<Hz&fJ{L+<0}4k)fAa(>0%_5|W4s0i|Y
zYJqzOa>{TveJ^_ydGPO2KLz@pN0OxX-9AbFpw1;pUvd3G=x46r&s<?nv0wiSSM;jS
zB`vyI7m%V?7I~z-(v?UAyUT+hF}JkwO2{p>UkQ1nC08PD5bSNd(!zhBJ)Fa@^h1+d
zdQ7?!l5jHMk#^EApOdaM`K6UtS^^-8f*@@HY4w#wK55OBrC#Z_D-8x|dvB?$v2}do
z>4WE+7LPx9y6b(9<o&et^M)43a~n=Sb-roQ_yea`Qo8$`lD$_S47*4DSK9p2`YSDT
z!6vV?;!1-ot-4Ydl{Q?7`K2vamPK&5JRse1Ww~GKmaepTrId8F!6$9v5BL8Ady(u;

literal 0
HcmV?d00001

diff --git a/test/unit/__pycache__/test_parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bbbfe107fc63569c10ba103308548f9b2777009
GIT binary patch
literal 14535
zcmeHO>vI&>mG9?Fch5sluy_~`@^cIl8u3&B+Zb5m3On`|xI*NlCd1s87%`d|-|j(Z
zGLtBYYXcRwkmICax7aGypR8gZ{u5&7!zPt45J-f(QYBZ}`M@_UQgK4<hdt-^Q=>*g
zj&UlnuVzkn-@cE&J@@|3IlpuDt3*QMAbr67toZvjj{6WV+$1<Bcj|C*fg{{7M|dKX
z_+fz^#c`=1@Z7jO9O3Od(cvh&qYSG=e4QKChy*f5WRP(Z0hu6CkV&Epw~;oYzRnM~
z6Af}YNDO2riG%DS36R|+33An>kZ!LJYOj=wX3EqJ(@2d}XlkNDPmYu-6I+Y|Emloq
zOEpgoJvVBO<jld*{_3Q53m-mPaE8DRQIH~h?+n7oU_5-2GkLebm^)fG1C9;uigUlo
zH(=yzAg?J9u_-o$hIsNRxRM%@Q8Hz>(pbbj^N$3|jPOLpIwQgr$-Wb)qxfarvxeaI
zyi7UKEk`1kggW%zlu48*4Y|m(tB4tM+YGc6ck}#XBjUE?9>32M?lj*N;48t1d7?Vw
z*S?73pXAc9Y&|oom-Sqcq<Rl#kGwE|Dbu?<Wzu}vo{8?gqwYN`>Qu2bzsd}MSJ=CE
zhOc)%{#11`lQvR&_onc4TiTcrED`chY`dGx8-`9zc5g@8z|0+1VnKJTU8lR5s3pSu
zw_^5k$eSiLEbY1KB(9uXo?0EbLcUbWQN2nFm2oJgTWZe8kLZ|VCHJx3ik0#~N-Nar
z%E4M2&zm{@bhVT(=S{c+UC4I3Jh5WXbk2Unf#Ai~;KiVQVK=osc(K#KC7H6*-LGsp
zQ5n~_6vyESE9LwagBG^HW9fNV-uV1mYJdUkEZ4f)t%unF1EbCz&l|-;?zLLQ)Ct`P
z-6Y^^oCYz)-Hbmwb?|pe?0nnVwu|z0Wy93IIqAvSm3-!;zS)(0u1bA(pl|LTfCQlM
zB_QGZ@za-p<N~pKTm;B52aw}mL_h|GlyJtM{34DY6jEEZ{<5c#_LXW!>N^%IBRfAw
zYE{%!Bl+=SX_9FvxB(f365uzxKJsGrVDD~<_K6j-Yf&_3DB34fMS*q$MIfa~5LPVE
zwxxPaTO#CJirca!Vav1~tMz~gf<8@QI~x|Ew~hOb`k_xlp>C{cftccM3w$y&b?~DW
z0Dp8=TES;d>YrW7XHM#!UCC#q<*!P;v=@fAKtbS3XBO+XK9$8fhQDP?0{c~HieSS^
z0*4l9L%QNvzF;|u%}7ITM3`k8ZA6V*CMsx_#zL%Auxz8uvV}UnWlI7!=U9lPdBth*
z6lbgow2w2YB{g^#`e?8u&ZL&)mn`$pzyH)>WqsRv>2jWJy-a(O`;^O+lkc;fXNi;O
zr3ka)6`~Nep*S|ZYIeE95A=u&q$M>}w@3FFP$44P_3)M98*N4aPPA-2?!n11fC9G2
zIR*l0zJ3G^)Lm&q`?hnEvTfEuHw#Xjtbw%AKdPHr^C`!u-RxShX|3tnx*4#0=~{T?
zbOi0%0ThE2-4zBgc)81xz!c8hZ3~#enXO<4WA@f`jCNq@P7sy|{ZV``6kmiDN6Ykh
ztynUPWy6Y;Dq!~-R!`12QK^*(Gk|lWrONSqDOb*q>vR>?T#e!(5NVB~YGF?oG-We%
zJ!A)S9DCM&zMXcr_C;oaMLcBRvW?)(;DW1d2KwNR84L#-6rWxMisMDYD3(WaC-q4f
zfLWtuBd3p7%}M$U)ZYMK<2xYGzFoW6zTGq{eIB31Acl*Kfmj`!=?lxa+{9b*O|Hm+
z+s!q221i0eF#a{*!iG88PzK;g#5w^ul1zkAmc|&_;a=It1#sj$@*%+G(*XcSe#tQ4
z@`wX~BWank-iV3pxbPy0kZ416mb+x)EN3au5>EP-1W@kK74Fep?oA$+x00osp@zGh
znd=D))m@ITtnAe&&^{=J%WrEv=zngv{m<>NIJ{-&uQ<95+un|12a1*jC9}5a!+2{a
zh@cy|EqJlRz&FblgS`$GyLN2O=M0m=VxR}0^keWfc7vGWT(3^JuB=0^uJec4`}zCl
z>=N_#*r!mwqev8iq6G!ZpJFsoQwFL?Vk$NQ=d$rh;MWYB%`R;OE%>uJP$xtoH$0WK
zA|<@^T=wSYf>|HdyH^sGFeopZkODdilvmW5%}PTU^X4=BWKL=*?wsX0cu$*%Mq*2k
z^}C*$bk*WLTK1G6&jnfxTR#GcyT(wFV~STJ36gARP7g5?XEjTKmQZW7q(;mgjeB%f
zA}U)X)R^*RN*8SS{#43UyEd<s=}qlUwUA2e&QZL>EcByk-V%!Cv_m1m;!PFHOv%DB
zb=UE8@K|&oh<X(D@J@RH@2o3$m4UU-^yt7XL=BojF$f|Z4_)7EK`&h7RwQ4o>Sbaj
zU83lCrBbR?1Bx{6D^j-Z83F_q=r+0zJI2*Z_k##x!L9{rGDG)Zp}imsL}!K(j<BJE
z5yTET#5k(lcnu7IYDv!-)qFuW=u1#9jSWnJKt;KyRZ)Ht;bgG%H$Ldkb!j7mocycs
z_6sLyMy2D%N!6$J)XYFe-99j&_N$xV>&}o~+voa={X8wS&mGX;(5LBh@P2&D@IEs@
zm8uzgq<z7W9?rbH58L(M3G-7~c8EWF82ww<=}h$=X6~$3@W2gzaa8}z6uAs9{Q~g&
zgs+3aiEuoIi=ZX>UIl;>M+8<jLPWxqz-DelL;Wj-NJRGS@`h*i2mKPC+(JbUe7!`}
zQLQ1o#W$ik#~<T3mAgOF1)fd`)-%yXXUXqpJJ(bj>H+S&RuD$Hg78gHnoX^tjX~6>
zHMu5{!Jq4f5+QI+G#A+35Y9azam`p$BM>1XoC~1vOtB-}qb*N1!JQVr&Q0(bH!28C
zZ30SUID?;n>#uV>{CyW{*=3M#SBvFp&FqJ`PF<_iOguM@QxxA&y}gwMuFHLWSW@Zh
zv&+`izP=IQk84!Ndtf)?y;J#8O*brFORJVPL5rrIGxgJ^B^r|eh<Q4C3QkSRK+uxL
z^To0yRRHEJ8BLC=6`|~yj$4X3UIn-@PgpUGJQXTsQ!krV#OWdG^lgdIGdL*bh$X^<
z(IfEmmQn#Xgmos2oUr(kC6|h2-M9y`DYRD3;iFI;DkBaWzXSmYrgcugdQD4B?Y}8S
zf0+GVc6#>>=@HER#x-r@)cy~p<juCu^QX_Ao;vh_vi7FZ_LHvZBX9QHR62jM|DyQj
zp_@wA^w%JJx(l*yC!+0<+i`C7dUiIo|Mf$++eLXjT2;zf<(&3G*Sa&}IprfYai(x?
z&Ghm2)0?ih56yI5-uL!v|1&W&@n4hE9T$4u=(+gVPdB{Xc}Bk4KJ<ymX<Z-0+NTcx
z>9)#suRkOHi}HW-5w7Da{++wV0F36{&hF<2xp#NT`$h5Hy{n#oCh}fa7s@9D$a!y2
zeSWw2-ZOaj=c2qn5iJCltmkF?SX$_1G&s{+P=p`l|5Lbv5CEyk`&!&z>LCHXKl;o0
z9yCZufNupB*qT85A>M=uBeIjb=xDG;2*t6uhHa7g!N9)?F%u1pGvuiQLC^anW{7f0
za)O>J(E>rwF87TEdJH2-JR8z|v#!SkHkuvuWE}e#k^(IWY(;1OHKT!f6!mfil4#+j
z|6b-%@-v%9-rHK1;PXntOil)V)xU{(#JGB9x6knO8!(Y>Ap|i`e1@+Npg9f}yiKg3
zDQ^>NV4ySp40y5tZN|dDCA~}uG%+JFXPu&9L!U!|s((S)7#KUc1@9mVTD*FT9p#ZC
zsIi$i(<fH!g`@wF+y7Gb=)vq!I);UD0Wl_RX}=zHVzpE>(+VBN3iwF2kG~7g4x@Mk
zM9`=oq7}$7(0_C-c-wkk#|SS}ayfIN0@}1x8!zW<O-@ZH{tDK+0|MOqc=z<NOW%Iy
z#dmh!z+mp2w08Eve{RM*rZ=8FbUu4FdvV+Kc<S@&_o}qkrYC|54_}~d%Yu3c(rQ74
zHwqT+TVJX?cvT$trOLy%;ClqFqH0xn2e@gTc?am>geg9c+-NER#pe+?U_iA7r2k$N
zU*a=UeBRqyR`GcyLGdO2))e0~f9^?%1NE28&WKC_(kEaN(x*_2qBwzqg_(<Z#*`0J
zBPa2)go0_WaX7mLNXJ<+_<g29t8kuHm)1MfzzPz2Rus4X7#Im~wT=^KS?{ZzkxGrW
z=pEephzi^I69`Z5{CexxZp4p$Uhx0B_El%3o9*2fv^O-+JMF16@|?0^_Q8LyDH}eP
zxsKIKD4!pLHF5vSr@;dOA)FTrVi75b_27d+%w{Y|(ZFlQL_8)NIdmdnI1Y|7;ns3*
z3g<Qe<sdbYUzbRP?E;ZO)kWXpTLRdnz=A=A*^+>G!2q_3LGJ*^?t33-SG6Rc&<V61
zR?hP+h~_Zi3W<?;AdKtLG=Y{v{J(pKah=dDyaRhD48DWp7xNuV!h8Zs3AW}og$djl
zLr=l)W<v}@2eW?Zi+J`ph_px_!}BLmq){*k!YR_S)5X$!zA-IYGP|;b!x>9Lh7l}Z
zWMt@TC@2cwa&8HMX#uP%U`a$IOQDsC94Qt|n+Jy_;aJcZ&t=AQXmoguk6MKarFy{x
zdZ=2(h}id_A|j~q-yjx2*86>fH`-pFdF5^XXC43h(7O-)qWk9$zf+r*r}J+p7azaO
zUluN=ueLurt-Srp)S+u_FV9KO&aUKhO?q}t+B3V7&#$FDf4(Dg?f=BH*x$cxJSavM
za69<3FjKc-qYYfz3Fuom0UihMvV^f;A^^)HdMpnh44{ta(4G*jK|~3~94~%KWW*Gp
zHhUP(0Lp}I?1l|9k!A$8(Exh5##A#(qD?@RhQgxOK)D*fS(?NJ3{n{eK^-51h~|^H
zV<DK}QnYLn4Qe%(#=^W|Z&YAv_$9;QUtUQtHMC{QdbIX3<-9kuOgYcLU#6UA4w2X;
z*@@A`Ng^~x=Z|)!F}etDih-}%R2*ahT?4q1%+}4&z{vv+*4U;KGnjldLm;yu4Z#aS
zfnP-;0r&x%NPv^2cjy!<wQ^vOPJm+=!q6a))<#;k`GtP$3L+)`)|{dGf=xNNmuCQ<
zB5p$=14YC`OJ>~!U~yp#v2~+B+(yY)u`fh1%AoQGc!v81*ajSm>_RYzxQh;94%_tO
zY_S<&(ZYMs=@0qnNp#p5St=IT=5~4pnnA<Um;o`xebo8zZ9!1`KNh8~$kbQ=pmB*+
z)5k9Qyu<rXY`PxX{2%(|jx+ntzi{@2=>tFd$BP@U$Jbxhrw&|=ZJv|Tvk&a^TPgkL
zd6A3vG1&a~bq9u|1+a-`41>+XOBiWLi4y_fQi9Tu?n#Ox9UA<vm*spq*zN$Z7FqaT
zFC=>39#oLx4gtW$6!)hh#bI;8eLqof4`Rp@`5}a5eY)UA2_<0sg~9iVcPkRrBgJ78
z#b-~7gEtq?GC;B<{dA?A78s>TzX`L)HnWy@=V0reZ#J!ja7;%7RutP!gjnD+jm+a)
zyu-N1U+_*)E;j;u19eJ&1l1?tYcxT?iuZ}dUqo)iU;4a!JeT523zp``yV(xR^lB@b
z%aKYUm$MXiOCzIYtu;COUwiuPEr;;$IsH^;Ax%s5vaG0+$1vE6*j22=g7B%m<zpkh
z%coJ`jzYG#ifx$)U^)`96s<01XEMNK?{J1=UZPFqDtpE&q*l_Op+AL^$XXglLEIL3
zp8r@7cnNk{@cirC4-y}8>*kXZ-#?#>@{i7Ui~Q#KxWISKYck(^yGLr1reb%JtND)k
cL5YWwGXKbYyTtFEPpbUWcX~vA9qaY~0l9D<oB#j-

literal 0
HcmV?d00001

diff --git a/test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56494c884e47fc964aa54404ae6c8894ff00aad0
GIT binary patch
literal 45973
zcmeHw3v?XUdEU(K?E8TS0gwO*5F#m&ONkGFFG<vc5J|l(ZET5lEXL}dC1y!32<$?4
z79>IM9>b5EP&H%HNo`R!Pfe*lEg2~-lhe97ZQ4k!)5=NHLx`lnj+s*DIH}E^o>Pi)
z<5*4Fe*eAm+TFqKf&@gV#$fQz?99D)X71ek|KI<(cm6CE3rkqulitkUyHb*VLO;yK
zW9)q3ktOLRNtJd>s;qj3<=q~>^^PcmI5pzi?U%)w!0rG)6Wkr-+tBV1--dUG`8KjU
z!ne`gQNE4sj;r3UNxPd=1+-c9fwriA&{j17x<Czrwy7b|b~Oyzp+-O#s!`BIY7Def
zje~ZnO`wa_X3!;S3uw363fiMC09~rKf%dBHpm(Sppv%;Spv%=opexi)&^y&G(7V*d
zpexlSpsUnw(1h9px_aD`Sb9VYe>Rsb^cFH&LF?U{XT68=?7-gP{GoN)Aj^&xv~^l`
zWNbKH$fQ^%t!isW$HU=WJoV@p%k=IY%MBK?`JC2U*q<)+s`=hrzR=4uFN|ez7FmXS
zA02ph*TA~Rc0Jm=H#?ls)`qXqTSyL?FNL;2{Cn~L!6Lk(m!tw6<I9lw%FF}Umhfsy
zvPUXPMfo+nNLorB)l>2oJw@+<RrsY8740tyK5<TxhWzH9y;Z$MWv{F%lxMH!HAS3J
zeT9&jW{5W=^OxAZX8b%TX`PPpqvfgSm;DUpJu+vGcq2J-GVaHI3YR=Z$t=B2I;*DY
zr1Labp4UvL`cL_epngj}HJ}EIzO2lD`3o)P6FRQgY96z<MZZ~+x&4Z)N{8hV{mc0G
z;9n_u$E8FlsVf7!wm+sT+aDX)byoh+^WcML<s+ZJbJghh=7bjhqOsHaU@z_Un}@Jt
z99o~y*0RjO4AU~H!TiYRaAqWvD}+m{DE-ESmI#-69~8OoOK7@BV`qK3cXWI`k$xf@
ziEJiv--k_UP0O$XzgS-%(kr9meSK%cjGB<%T74lsylr%x`dTW@^p@0MdUzP!YLwBf
zq*+Ff;&f`ySa!IO&1t%SIFA&Xz9^;b&yNkOJZWm*aDGpEIF(C}Wb{x<OYhB433_~h
z-wV3pPR8;~Uywp?9!VEcnZu*Q>0FvR>DhoDp(}{n?sNXw;QX=4`Qu*ak1gUBMbUlh
z%YOB10X%lESU%jkZhw9xvo1S=XUyl)>ohjF4mX-fgQZXntQ!O0;d`kn4x@MTI;yU}
z)PY?7P>$V+>$Kxv+Yd4!T~+$0=6HQa>Hh%Nl4hN0$uneH9?rBlqjNAV<zvRQ6i1i;
zcrq=Y#kBltU{0A<FsTOz2DWd1?6F<D7|9_<!j7{DMgoR46Cq~Fh&!?+AkLn|R^Whs
z$8<e{Sje8VmeE)O={hO1*(>g!Qs#DjN4cNfg)5fz3KE4}ls7|g-GV6h4jCdD<ylKf
zQN0i}O34rLuM~X{9tJ=t1t7=+T=)lz!4tB!ra<h7FGE!06^rUC1|k0al*`eR0)>c~
zaL7fdmx+J7e3v%vD*o*p^wKMq>KqXN0moCg<Z+1qnq}KSsZM#e_^(}_CI0J_r<g*$
zPC8$M=`65Nr@Vk!o*FzA;(#QihSYE|WWbYfVS!owj*{BUWA?TfHcK+MF7PA>h~g!f
zf`1?W{iT5YHm8BG5f=y>aYzHSVnA5?Ell5rqZJT#z6~2nfur`^`i|9x)tDg-B5GW1
zDn^Vp5^cDRM9sEgZr$1lhp^2_-JeVj3?!3B0?FjAUCG|0F8AsZmkbH{$g6XLW&5_6
zP?+`%EKG=1KxW!A4g&Q8AOkTDW(-u^w;`e3BOvZ34v@L%N`x3ON=7`A#fT6OWKBd!
zmT@S&fzwT#ZaGqhmI+-GTE0*BCX-1a63eAwA|hlY^;11y+!Lc7$_V?h4k8PQEFwaZ
zk984Q43dbkZrbl5vXn?Kh=8jG`Zd6-@vFb=+{UtBo7o*!W&`=&>qNf&>)A58#Bvag
z>cYUlVk6}D^6BJvE;Y(B&>{|IRCYh|(rs&h146j=@u`o^dO>;os<M7+j@NgT_0@=i
zI3Gt8yRdf)h+^J2fQlS)!72`PP}|sGql4P%Y|i=-83*c0TaSyhovH3MZ!mIeJD=P(
z5pro0VFv=6HzI;xIG0}mJ`r|l6QTJwAzH!zpRxmn<p@CdfG*du&*<`YU4E>Vm1bI$
zr)6{dhBMqy!?vL^2`_5jfto@n@1ChSUMH1%=C3lNN`6PVry7Tn!ozXs6;OB&;i9O6
zJLEd&J+NYVY#UC=i{En5LQmwzDoWg@<tzGN(>mj@tGc<gJVhUDTEHnq$zjv_q*;e8
zSC#rnyK@T*&A^S=?s%sz7Du~vFlS0PBxbn+8*m$1ZOn0Xw=ymJ-R*xOI2wb(wa6Wf
ze#1%=XxPzc*#s-BG^Uf0TkuLg;b@GLqmlk$tpPRdGp~?+rVnmJ!*YT>!*ViHZ{xjY
z8Q3w3fPF<2iyh+-j4#tWEaCHn!_wii?88k6=c8B4ypRamHX3qRZXmLW8)|6pUT%O{
z&uM?Tbq1bLet07nb|_~%SRc2jIGuqTg<ZtqQoOIsZqny8lJspT`?X0Ut==@02ph?=
z>1-s~+`;_t7<n++uOSyXI<&VyCZx;Zu8HkemF-iX!1Wzvd$o)q&oP%V@51uin2bT-
z1IZB?;Se6n8{&nZL@7cNIHX69qp!P35BO6cJ$#hQAwB#bGvA`$@f2<x%a$I2hVR_T
zS!a(22kP9ZB`xZtn>nuzp7I-<l6)R>!zq0wr4G80)rf1$jLx^r`IgDOaksB!##~!w
ze7<GQx6HY1nLw9K$s-T%%j7bttlGP5V0i4=Bb%3P?Jcl$PQ+X~j;GYI(LS8qn;yvy
zkDDi!ZPf!t0lHrl!13u>`N$yQQ_I_DJNu?Vr-(X3B;3Bj-c-6!VA(xm1=H<kdHsab
z&y0#A=qnJWk8)^d{WivAI{;qTP7s2ZTi9dRE02*Qc=|Y<kCi>G9uW`0pH>f95374|
z+zh8%;Bv2xx&zM!*)F;_?h|~HZomOrH?vRCxn4TV1MmzVUmtrKhg@)K567u4w?p=w
z>>x`cTR&y}GB@yCjy^rH^Q!Xj)NNWXDi2>(o}9W(>qX_stI9)Dw`skkJX8Z?pGNG^
zEn#d5GzMB8L+~rN&(uTMUXHMN9tXnyq=g*Y2)l9JU~Uk0JzZe#5O%;q*g-WkcL*EO
zBAh(ZQY9_);NUPK7a$PM%DN}3A#!BSIDb|X(H|z~7p(NNU~uf$i4a3)Um)`LKwS9c
znrZpvNLC}uI0BmsV=Skoa`{|_?L{f~QAxiE0u?{Hl-TD3Q@3fos60@Eg}i#JEM#6y
zpI6iUa}GHp$s?;xCL*v_CE#=*u&wMf=y~iJB4j6G&k`Za)Qr2nFzLBLrCdJs!dM=h
zf$c}Z58>bH@AuwTxO7Q*xVpbn7$!H%U9|f<z@1p!0z8Y7>QTL_VvZ{+VN6TO10$Vh
zjKWU5+RF~3kDD<LW;h<6j2sTAFnfK)65@5;3RX%Uj$AwN#}c>*yY+Arl)~G#Vi@av
z4IAsNfZz&ay%!_BBC4cDyNvqe2F^xmG{$Q*PVZp8Mycf*L8_=xu9UtNM~fc#2o|_i
zb`5MFxW&z`k6^i`qMEM#9*8dAjqzrFU?Gfegr40&dNw(K#1W2=pfYnu$ytxkiAf#)
zFfIa2c><`?LOp);7N6b7e(h0Q!SqbEns;Q3U&Gy)s`J9kVd&d~1sXtx0az*chQ>0v
z!Azs-bHcCt2KTf42!A}M?%eNi3JJA&qc|;oY}Sai2``WE@!ZX^;57F`L)H$ZnVQnb
zJfN~KA~P8gw0{UfNU(om=aq1DD!gJcyyEm{&ul-p;N0Ma@crZh>YBRQ^`g>sRarT8
z3+tq^vRY)2e~XI@52(;GRrvak;IQf;T9OO3MF>3ks-pTV5#n*!+Z7a0vGC{tuk9~}
zZi>-j4*$;<M?5^;UB!a2Fy#NCaWqc<Pq5(;G3}1>-8?gvaDd1mfy$R5x5#`eB}5MN
ztA_8O(^0O=9oY3}3Ka~=t$8QaIdDgz+Ij5w4K>{GojYs9<<9N5TDR_;Q2^mOcWQh0
zBgO~_zZz9z4t<uQ@A={=sm(lQZ;R9U)@d{hR2&GHcvAoL!1l+$`DgE64`c9vv0S6!
z1J`zg=<*tqDe|Gg+($14^}1L_;p-MockI}%{OO(U!X7F-;)IFyAlPYYba+fl_#C!S
zJ|==}q3m-YWnuthANW8CbA-b|N<|gjFh35muTb(sM79&*;~Y}7M<z}u3%-|+O>oi5
z6E!*UNH#a7nS7WO@g)AWb0FZuO<fa@Uyin&I6N6$dV1Albagpfe&n3z-1ZCMdx<Ug
zP2JY@lG0brnXNc3#p29%KX(K30Ppaz!=ny<TvAM5J6ZlL-s}Us2mdVkVG9ft1Blgn
zv@XC8aDb0IHw8zXlL@%^tnDXq@Vc7dy*ft&fZtLM)ZlIQ90eWq?Z!bZnNla+jJ@r3
z%Ci)iI_a#DjcUk<a|)_qHBt;3s#B<8)hT4YadT?`{jxYGW29oB<Ogq#CiS;XKKe=D
zGHB=$J3wTFhzbJ0&gX`39r8AfYit^fYK)3=y2r%mIEWUp1NKfKSx^jktQf?Ku$>UM
zh9io&j~F($gZ3;DVUL|-PvAtk;|!5rJRoZ`8>3Rfg7sVOEXG-XFO|0uq$~<+1KUK$
ziGP>5x3*{`Fa<UzD)9Nrno^;V&!<Mxx$#tv9Y<!e0c$UT5H@cotIfdF$7X$38JK7N
zhJtZZGtbluUz%tAQXQ85$zc6&ogll<ETZ!HvTj|&5mM|+;H|b4V@K&Hx5IPf|04bT
z`$WD@gd_ty2I9)t@2bn#G06}su-`!eq(oVKeG&2XZBrkc^<8CKHD9;v^RL>FzP52?
z-abEP_W43P+7Dms0fSt9qRCeL+#m%NN&9OD_jL1sL@+UntkUEbP6a8}u$oa%L25Ja
z(uB!%rDRnOC`d6((L!C4FLh>&s}}@eNz}RV$vEoUO+m61koYNu`)-12liECs3JA^Q
zCKQgAB>N_6jB~kb6e}gTT#oieiEyjN3$(|GaFk4q1W0~8!D`@dVYR`IVYPvohfm{%
z$t)a0A_zHqhcmge3P;`)nbs!))u;~2>@paL!N{UsW|yIN-oS(t?Aw&lz~%hr4^u{t
z(8p<yEWTnUiYKw>w9ssE{<zPTdEZl$d1ns8wvXt*P3!lJrt$~aDO`XQ0_{&g2%3Y1
zJI&?;XV|$77s6X+!ED|+rOf$yUg@mC@Q;Db3+p<$TyM^<4%-c%Q4OwH8uMMxHS6bd
zu#uhIIdSCd**`2xMO`*vr6fCfo$hVfHDF&5tl!7&3$r(2qnM--wghz!orsosVotAO
zJwT;3p*Dvi2$l%w%AUMB?y%0fu!8Ct-Zs?i<GIw(7^aQflfl%islt9t8M8k>tg=^d
z!M)VgiA+fEMY<<;T#h!MxNkDLq_Wd%mQnWRePx%FuIj#M*;C^d_;U0`+?ZHgEvuGQ
zRrQs8u&h#u1njCs+w0{=?+g_E#Q-d;VrI8`mQ_EW*^TFMSXLYFpY?!)`9PMZr7m!&
z%F#f7wfZ*vO}7_dgc{{6b<$bhraI|ntTxvv&+<*FVZ*W-LI_FJF&7-oSj87dNv-BF
zdpqr1a0G11=6}QQU?~90YHTjKp~VX^gVy(Oo0ZH;rDOG#M&O;(l#(lz)7cc+Z&Q*@
zM3O|l0isdQw=IZ<39%tYmfn*;m~k+&@1rE*tr~GOVq`r|M)vVHELz_F@ACk$+Ma#Y
z0{WQB8Z#5wtxmP1^bYBS{QPQ~*xKQFb@AF4X$ouUHE>>9N?zOJZ2O@xYo>w-b#PU#
zeEExhC|~5D<cr&^9vB7c=iVtA6KYS6?HFr4<kCN0GfL!>W839hYLjsqn*!OM`QlK%
z>>M<=Tg6hHgS=BfJyzeXb4TrtCw93byS9M3YTk*Rl5_FeX>FcYJDtVzfX;;S72xPB
zV0eFm1)AWFFOIwGGLPBYV$iHnb8C2iXl_b>3A0EXTcb$a=W`9sOA>k*H#AMqPg1hj
zw}_BNqkFZnJ&a8J>}4WcJ&Nmo3eU-p=>BKK)`#EwGGbg%OiR<e+9_m&W1N@LuwOf*
zds%Luh%%B>?4QsjIe1!1d!#tA?-2Peh(<!uVm`uqA!f-fy8%q|%~%GpPtXDP0XY3r
z<RXPkI}I`+T?scIU-ROclb#FV#S_~vhdU;=Ux~Gz=s*74QTa-|c`DvB8Sgo%oqptu
z_d@)hqu#5*?y0$7CxhMZ`=t27YOG}$y8e`7t()aDZWH2zlTh@K$w@pm>cF^i!<YsS
zctyZEI0;DszPQbp3-07Ts+^{Z6uDd?fXX<DUA_y?DOfzwyhorhJBLe8ug-yfj9H)y
zjowkaqkNY;vTF;Rq~@L2DT$ZWNoz@lI_WHqgAiQ7Nh-t37n(KiC~1*-%-&8L(?duH
zqrWk_8+g3vIY`<w1vJ~w>JpCtyhQ=cG0I^}6h|JHH)Zp(+y8ddxS=(NxKYfWN4U}2
zoNgZBAfFc&2C;vPs@L1xoF{Aorz$wJV2<USS(lT1I8Y8&G9C^}^SWkpW^gLvQcqGr
zJBTnMWLCpesb^^aED_?V>@_09{Tt@n5%xMI-%g|^(-x*K5fZzeS%xupAi0Wb#O2(a
zJ1MLH*3@WPRWWhuzr=;VjDPJvgH$v1HZXMx8||75cb(jD+H)GwNvp{EHRrY~n8&Uv
z?Nc|qUQ*htMF2@?V$vV{TQFrzhib59OzlaL4;9PwP_q#cKp(LwdUi<EB4`1;v{X?W
z263a=w^X0%cXVeL7GRdb;bBc?Tg{@u)a@KDOt(4*G4<Mc>?f#`&St5BQ(kTv_t`!U
zN8S!mOHK+yhVg#Ptrc}oL6O1awLv5oAnYGtZ_s+1_(5-Z1O7i)0rDwH#@jAR&qLsq
zWC&~Np!99|C0Uk^g#qZ~-}fGp6CT}13?!L6D`DTWcD*j|(WqIN@g1QDXc*ApQTEW0
z%`V+5pMxFwV0t*KijNAceL#GJ<e{}zzTZY^8ok}0Nq;jTO(}_sO5&HAf&*WPH_J!v
zam2g3gjSkHdIa6TNUwYG`GSPk5feiwIT3Z`NN?hlW7cPq7p_x3`zx6dS7|9i(rIYl
zq)|T-_)&mIF?nA(yQ!=X_V1ACMf__AK&m+n!ADn4lR!n?%393^2={O{@WUCh0f)9}
z;kq5t^Z$mLRUKW^vUrJTNV0$*z{pR-Y97HI@nsquNS|BsRaniPU9>h1bYdwN24G{U
z8KEvG%W^{?7V1(1merg_{MuIYM&-ZdUaMdPu$mwHW3G{flfobNGrZPo#0TgLM-R~#
zj;ygA4~Z~lIK6avkd^z#&N>D8?;*R;$2yD_J(wTM6;k^$_$m)(dKhd}wx11%?)3Xq
z;E#wjWX%@iJlrpxN)I1Ok87#Xyp}Cw@iC@9N3q2GwC6zVP9<NU*}X^|+xU&&fI1e9
zPet#XjNW;A@IrL;QTb}H{dlaAzi-4ynR0O=SzYMu+37-m+(;Kvy{{|QG+o^M&ASm)
z`G&d?%sAXIQ|^5F#3(P;xIiBt7dX4!=q=ujBBC3yA5vo_!$v~ZLnG<Kumqq7B+#cG
zcrfAUQx8~uO0@6tUd8^Diuq3<)h?SdfLhVNFxE;Nh223zz~o^ziTvDX@f+A_q=lb*
z=3?|-YU3Tp<Bj}%BVMnKKQlwaq`R0nE)RP4NrRr}J7?O~zv(%)Sb;ecLdFslHq*9^
zTK2IcNC$%hR85#Jy$7g=0AnJ|-is&_`h^cSF?_y9`)3Xe<|0Js15fmkAAFsTrrz-H
z^_pS(j(lz+#m0Rh8}ALdpuE@cR%rpUZFiLErc_%31!AfuqP5W}kZh<<T1&XV#UT)C
zkh?hS;PJ)c4q<w=ue!uMW^aqr1&^Dbw&C#`^pVU#yval4;NXgD_9zCh>=Wg*Z9|V!
zMl!GoiUZMY#!lC^gomBSK~54LTT3-MArPZDMuTyTV~F5j7mx=yk*lu+5yjU=UZy>6
zl5BDyMoaXuKf`&lV2)I3vuM0dC`s?h@$HbXE%9_e@7Cf5*bgWj7k}Iy$p_uo_Q){K
z3Oq^F8BRTo7^E6XrAJ3u{xEwNnP0`fMqI*X5W*~ZW!ug-)gMMK#GZO9@<!jWJyY?O
zlkt^jyyup`8{hn9Wa82Dv8S#?;ztjDee0<OCqMVy#gma06FYvYbWhFFI;nL3@<u?4
zKP_uM+}}Uy*|9<)y~SGS**O?JkMk6cL8IUId9pM`vZc|u>Zev8bGkuXbyJBn%%c2y
z!0f$njo8|#1!WD60VlFK^s%mq5?`%^ObwCa*)~mK@SfdMT)2v(&pQo{E_|d~H@ztn
zB?f~ud`EKwmLoRj9;ykJ>!h>Zxf-=hlrc5#7<SkwsJ2|Y6`<NMQ5HQz7T>els%6dK
zE;jfvzW~VycWtN`9CG1^rLfvmiXb#0T#T4k;YjTjwOMVMwKrn9ZR)BoR!Yc)mO3?U
z86E4Sv!1L@I{VGFns2Vq=)1CHlv#_0B_s@Uas)LRs{O8>2lwS~Dq4yaV_zUENW9ck
zYA&^uT1yK`ZKZZ~0SwV`LnRFt!*xy4ZN)J9eLLlG^!tukd#7>pN$b8-D<$4(N1b<I
zAqaJ$p^|o}i`33yhv6z**zg-!Xx^W>HQxwS(l-=1ppSi%BkC@EcNgRT`w{`6puUa1
z#7I{YbHos))s<(TNbclOoQXj8=ivF-UI+v|{8Znk2l~atem=ghd!Fjkm8U56m#GLc
zdtW2jaSh!NLS7H#QiGXceD`)C7m#GE%HZ3!YHBpY_!%Ld$$aB*R4Vk9+X_!5W-UNL
zsUuH0Y*IiIB5)Kt_HbWu2u3M8mUOLFNV9@=C|lTH_I+SIzzfZ)2@z3R`3W=%x@7->
z2vrkqfwYbaRXyRvD<s48Ci?2H${!D7iSZ=grV_Y~i=wF=b}tTAIDi2J7GFElec3{0
zMAO6eEhiS~p=_>z?~36I!lJ@{Ldk9P@+AGV&<p!3{M19R_F>rXNP1L6u5zPUP#nyv
zx{}RdhKBz}89zg0H<3Xihd^{6zr60v<W${nOyMw2XODnrbl)~SD&G@eNzAKQQ4ur-
zd2A5VgJszJD1qF!+5t2##E1mrm!kuhquZ|gye;ADUSErU;)$O}rAUjzqkAP1n~HQ#
zM!HXC&$YZgaQ@lPU5xzt#Ey@~^nbZ$+4qKC9{S$!WU%XK;6&%u;OePcUf&I_zTC6&
zd-<31@Aj;FF?say6YAxsKQkG9^61VJTTVW4`tZ5-H+*ksZ$5f8+C6o%>$}nJ8$N02
zdMK4k@2<8z2JjZB%YN{O=!F9AWsTV?PI)j-zo#DTgP+-eh1`@aQ+o!SBXG$VTL}c5
z3qUwH;oOgETPf)y<gRcI>ZDlQ?2~11j-HP%Zm}^1#XM_11K^yU!v!i<=K!0pH#dU?
zXls{ktBG~e*`hmQitbsS>%okIBZXa7<2B!D)BHO%T9`|F`PjZw-cNT($Ag9u2(6xC
z0MurB*D&yeVBiUrf@(_%@T(X?WI&`C0SvQy2bo+e7PYk)L4RF9Icyl_$m3Q+7O+*@
z9B`?&)3M$hE0*a4aLvABx2szz(U05Ib_0e*)ed!GF>1iDSi@hBjhTGJ+*&X!qAr@R
z$<nOU<Twmg+-Mbq(W*142L!4az{gFlqCx7OeFM5@Fa8Gy*kv53h^0lc_vq-)iTp1j
ze@(<S<uwvxDDV*!i2uPyM6&;aqgfhj%TY3dF+QMV#(g|;)`K}+39YVQ-)HUhi;EJ}
zg%^kqe5&hHHD~I^d;}as8UvpkF!ggNj(wE~893PA65*(o066<UME;P-KL^p{2HMqV
za<0?>`xkVE5!p%P|AH9frSvud-9*I}@&d)r2*@Vlj8G~DZ1oS?2vp2&z)fynqiZgs
zB=TW~_bEASa^rN4L(Mq$+yLfk?(ufJ%hW!W9~;%oV8P*W=11ZG9_46X#SXw`d(Zja
zC(pNi>T>UsmwTUj-z&E#<@bF`)IYJ~=V1vbGknKn_>R*nE`(Q7WXbZWIanu^<#Si^
z<rOQ>gkDWvXy5Qk^5hFA(kJELRZk};lIPnu+{*N57nPN_lKqmh@|QrO9UHm3{y%Pw
zKHa92#YhmN=1_Dq^e9&-iVx$c0q#ri)V4(y9f8<309Avphs>ZzS+WI#onjLYfE%)G
zk+>scF+nAQ<s|wu{eH6aB}EOB&>XTrshJ1U!&%mtA+mK>EUNXfahh~olv47m7;Zp=
zWVp?fJ{><?+loPfco4d<xshkYw5U4rxY#j^mIbFyI?F1jHd#S)u$4ImT(p}H<tVAc
zJZ5jF9dO|n11!{5c!MzQ5`YFh^UIFXT*NjY20e2d<(T<1tL=6O_E497kLI5Ft}A>%
zE*3>QCaA6t$ci2U&}dgXjM|K-3)MxA&(t?sn^w!I02+Qkw$AyQEJbZEpQ9zYwaXec
zSya?6RqB!elVb)<hQK15<*oE}fa?_i*KvR$5`=syT2@26zs02{gw{6k{@+||hR>s>
z_X7RD8GT_1<+1yM+TCy)cgEwnwDAI&j`(7>wYm{}RXNb+yVahC@5t#-b-5$Ew(9me
zt6p&jJ&t;G<B66vzD~L(vmVr?hWZ^=d(}INVMASLX}ImRm?fE8LtO}hCpM!M#Dz?-
zB*U^_Maf{*!g7+bkdG^V6>DK!S}yfTntTXQ`8i3(-wEZ|Lx;W3Nrw=}!PTJ{)S)|*
zTo-{s`xmGz-Op!=AJBdH5Kju5!C@^mfRpdxBnuMg4iO21B$R6ez>F-9h%DGpmPgqC
zrVM`v@}YlUVQoP>$gRu>4v282Vh|s79LaG-OP9mC9O5DFP;{)=bz>@H=safZIsu}x
zNgF6Sfbc&1A4F`u#lJt3Rx_+ZU(v&`CTBz>kkENZ`E$AvSpJY!;YJWD5w99~Q=(af
z)wLdCFuHTSA*ZHLYGFS@4GK(G@}rrY9%8usa5k6GFjxwQv!gJNj}B)G4x2u;OgrqL
z(!~F5oK3Xq!NSNW!rW1MlynsPTKTa8^vnIak6(p#&@1@?l}LP4*!kb0z3&jQbR%6J
zW{W5(SEslSfFcewVr}Ir6a8}9<*T2UG{TW|f!&4jE>R(Cpg<a`(!=N5wq5Rh7%Qnt
z1Eeao`6r&b4pnKviO)}lyAjx@_z@mIsdS#a=XLLy?XLyj$X-$&|I1M9=-{!=6MHT<
zx4snpX7qRCM}5EWO5p|9gHn9KiM|&<e?0YK>g28qv1L=SRg<w*XWA~r)=oTnRk>qo
zChJvY>D0}x7nP+my`1yjE-H6i_e*`7&!x}p`Ge4f$o+p1I@5P_;P~S&K7L}?*OODx
z-pOe1>9$MJyUv9scASsg|Iv6WemX;q^&QeimrTuMy`U`lMK|8%i=BUaqfLr#)mqSO
z{;1`dkoQNSmCtl2?}U8MEC{&kcYle-Wk#03$$*Km;c_5X!!7G~v#;IRCdn^ZW_QG{
zg~SxG9HWUDkSzVq&C!5QchU?<5t>%kfzhH34|(mhqw3t^HL8cy0;Ab?<br#va{wV>
z!sv$Y$T_suErP_VEnu{ocY>*R!F6cC9mMLSwX~=@=`8&ggT35mKq8jS7e`4hy5O`?
zbTl0{kD#lP%kr_sLM|s-l909B9`_fx6yxN-V$YDk@SpdyRm8&iL61nl4^UF1;v%gg
zF9w4Q_Tj&uHBr{TLdt98$EqAvwt~*SL?>P*@+b(5a%9?IU36+OktIaBiI9+TNHl6P
zY#|Zy_OfyThuM*Okq3s+0$WCzmlNR^JWQ!=iNr+~7fTdZW{V{uj>I=*T*Of!Uq}z9
zxBy>+qW>rIno}SxTzh`=)054~%PsBavERH7QYqXw@yL7O)+%YV?9|}t<=@?ZCUjBh
zzcQUP`svMnZD2;9DLAim{ldrj<%fKT+`n$wd8hYJ?)2?kHw}~gnc<J+Owl8ZUI>Ch
zZB#Dy)NC+nkpYb_L)0cJ7B`cZ+4+p|!0^w)1OsyvOfZ%i%xy9>Ycw!&*euPF$HjwV
zclELXT`mTmG82q_A8u-!6+{C@W2bbxx|I@)2BU-x6ATP)2pIGm%3HvpyxFC>Fwkko
z)M9=HH4LBVd`;TMtedXMB1Wkb?<qNEO|ZujmoWPaylSChm<wX^Q2xl#28iWUEEu~?
z9?BTyv0JSgcc}ceTVtoAe3#Z(Aa2PQvtL0F+fVP(Dyws#b;fJwv1_1CI?KCRC!Ots
zZ8F4VltLmM9?(YL<#ZlUGHHbHPw&fkr)n#IvvDZ+3;3R*wjp#A;e-5b5)T@7+_(GN
zr1+3_%45GxM;@2@ws}yUbXMJ2#@Z%stZgzlMRUWPqS<T<=GHLt@IZV#aa`>*p1<o?
z@%b@$c5(8<A5&Ojh>d6mLYc$)T&Qn-Uue_%^`W()yYX*+@$M!+vfOKaW3zV|1vf6+
z8phT7%dUpY^=;<YTjRVQ4|J^lrn4OjS^i1qr|1*U_QfbR5*Pb0B0krot;P45g$vm-
zZ?OLkc2Q~Dpcu`5fgH(W%wX;@5SmPTU!`N*;|onn64spOXm0_XZ6nf7gj-WMhI|HR
z6EQt3uvUR)#3!z#ciA8GGj1`pXFP?g;|oz7Alh+@)I=*S8w8lyVj=_|6D<N3M(8Wl
zgL%X(<_fx(9+^Mm4=5eShvT$&gh)3LZsthQ9tVzJqdgef5r)Ve(CjTbKuD4+nKp<t
z_MGE0rhl9izJrn-Ne{CvsF}Z`!oGr?3F*4x3$3~Cm3+<Do5`Z_{A73;rjJnqzm@!Q
z^2Amc2(C1@PBpKYY+m#F;JM|m?Z42xW#WmeO3&2It{0V_Th4yhlrsD4q;eM~|39&J
zGTMDI4a3F6&YvnPre?Cv)D&}GSt>#j@hYsv@&lhR>;xYj1hz$j9k>OLueWN-Ez8&I
z7Jvw7OjB<0X}WB&Gy}-{(TqZw&53d>Xbli<*NvN9z)A_2Q*WXi%Q8@>Ov@@@O_USh
zV^ZxYw@8!ai=!k9s+s2`$p%&~q8d|fK~&-Ut3xiLs&8YJsM?jyjaN$!I?99S%LRv#
z*cKf-k^_(&Y(0902w>uZ;9Iy?p{jG1TA|*@SU)8v&dxT_9%tdinj6;aL!5=Pjg)&6
z5q_O%wR*ag#gGLfuKY%HrCdqv%?=})9iOD)v((>*i?rh3m@e?T;t8!4?3f#v=bjxm
z&d1LyJvF#h7^*i}HRqaJsowpvW8Arz#;b8<6R;Y*2@@aRl-fR%(do9@<Dw|p{B=fZ
zJEjS=)b<dps<W%M)7T}dNlc2>fB_`RYce)pla;)Z^T~LUtt0-y8Q5j82{;Smm?I{C
z2;+Y^z|?FXpBo5feaMb6{(gL>6;od&j4!p)L|Ed8IT;ovnQ>!#4I)>BgOV+?lH<96
zRhKXCt>s~eH9%*@13!o>_26IoSE!Jiz@BgW-#mP=`O~w2KbK7@H@RL?mT@kaxLXgT
zQfhuMmC}P0iCM_zG_D}%ovG1rs+qN7v{p8s!$%mo>Y<0MLwdkC#$}VP>^lO_swbP*
zJ$a3f0f)mBddeY|9@YJ#JUy;uN5+QJ@Ya%3*7zFNoh089MD8cDjmWPNNfX&iWIvHF
z68TLc$BFz7kyAuoA@Y49G$x5pLB?kXVt-CQX~G9SsGE;k;?bnsPs+$L&9w<eGK|r?
zWq(A8?;^4SL=QeNl2^xuGY_#Ra2BhiQNz0Km1X&Tk0L9-2urg3HR-F7A4?&=#(pd{
z{X~lWL|S|!F3THk1Z8>aji5*FyAf98H8<iOd53(XRgoX2-3M=kJ@PjGd)1A&BKOn5
zM`SuW0IJAcH{w2d@r`DMj&}Iuo*T_Rxr2VMzR{sz51AHHnw}er;vlOd@~Rsv7Ruc>
ZHU^MyNvqs@V|7Sg`oW@@yqrJR{{tnb*(3k}

literal 0
HcmV?d00001

diff --git a/test/unit/test_build_rmsk_gtf.py b/test/unit/test_build_rmsk_gtf.py
new file mode 100644
index 0000000..c5f48a0
--- /dev/null
+++ b/test/unit/test_build_rmsk_gtf.py
@@ -0,0 +1,110 @@
+"""
+Unit tests for workflow/scripts/build_rmsk_gtf.py
+"""
+import gzip
+import os
+
+import pytest
+
+import build_rmsk_gtf as rmsk
+
+
+# ---------------------------------------------------------------------------
+# make_gtf_attributes
+# ---------------------------------------------------------------------------
+
+def test_make_gtf_attributes_format():
+    attr = rmsk.make_gtf_attributes('AluSz6', 3, 'Alu', 'SINE')
+    assert 'gene_id "AluSz6"' in attr
+    assert 'transcript_id "AluSz6_dup3"' in attr
+    assert 'family_id "Alu"' in attr
+    assert 'class_id "SINE"' in attr
+
+
+def test_make_gtf_attributes_locus_index():
+    a1 = rmsk.make_gtf_attributes('L1PA2', 1, 'L1', 'LINE')
+    a5 = rmsk.make_gtf_attributes('L1PA2', 5, 'L1', 'LINE')
+    assert 'transcript_id "L1PA2_dup1"' in a1
+    assert 'transcript_id "L1PA2_dup5"' in a5
+
+
+# ---------------------------------------------------------------------------
+# convert_rmsk_to_gtf
+# ---------------------------------------------------------------------------
+
+
+def rmsk_row(chrom='chr1', start=100, end=400, strand='+',
+             name='AluSz6', rep_class='SINE', family='Alu'):
+    return (f'0\t1000\t100\t0\t0\t{chrom}\t{start}\t{end}\t-100\t'
+            f'{strand}\t{name}\t{rep_class}\t{family}\t0\t300\t0\t1\n')
+
+
+def test_convert_rmsk_to_gtf_basic(tmp_path):
+    infile = tmp_path / 'rmsk.txt'
+    infile.write_text(rmsk_row())
+    outfile = tmp_path / 'out.gtf'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
+    lines = outfile.read_text().strip().split('\n')
+    assert len(lines) == 1
+    fields = lines[0].split('\t')
+    assert fields[0] == 'chr1'
+    assert fields[2] == 'exon'
+    assert 'gene_id "AluSz6"' in fields[8]
+    assert 'transcript_id "AluSz6_dup1"' in fields[8]
+
+
+def test_convert_rmsk_to_gtf_increments_dup_index(tmp_path):
+    infile = tmp_path / 'rmsk.txt'
+    infile.write_text(
+        rmsk_row(name='AluSz6', start=100, end=400) +
+        rmsk_row(name='AluSz6', start=500, end=900)
+    )
+    outfile = tmp_path / 'out.gtf'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
+    lines = outfile.read_text().strip().split('\n')
+    assert len(lines) == 2
+    assert 'transcript_id "AluSz6_dup1"' in lines[0]
+    assert 'transcript_id "AluSz6_dup2"' in lines[1]
+
+
+def test_convert_rmsk_to_gtf_filters_short(tmp_path):
+    infile = tmp_path / 'rmsk.txt'
+    # Only 30 bp wide -> below default min_length=50
+    infile.write_text(rmsk_row(start=100, end=130))
+    outfile = tmp_path / 'out.gtf'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
+    assert outfile.read_text().strip() == ''
+
+
+def test_convert_rmsk_to_gtf_chromosome_filter(tmp_path):
+    infile = tmp_path / 'rmsk.txt'
+    infile.write_text(
+        rmsk_row(chrom='chr1') +
+        rmsk_row(chrom='chr2')
+    )
+    outfile = tmp_path / 'out.gtf'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile), allowed_chroms={'chr1'})
+    lines = [l for l in outfile.read_text().strip().split('\n') if l]
+    assert len(lines) == 1
+    assert lines[0].startswith('chr1')
+
+
+def test_convert_rmsk_to_gtf_gtf_start_is_one_based(tmp_path):
+    # RMSK uses 0-based start; GTF should be 1-based
+    infile = tmp_path / 'rmsk.txt'
+    infile.write_text(rmsk_row(start=100, end=400))
+    outfile = tmp_path / 'out.gtf'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
+    fields = outfile.read_text().strip().split('\t')
+    assert fields[3] == '101'   # 100 + 1
+    assert fields[4] == '400'   # end unchanged
+
+
+def test_convert_rmsk_to_gtf_compressed_output(tmp_path):
+    infile = tmp_path / 'rmsk.txt'
+    infile.write_text(rmsk_row())
+    outfile = tmp_path / 'out.gtf.gz'
+    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
+    with gzip.open(outfile, 'rt') as fh:
+        content = fh.read()
+    assert 'AluSz6' in content
diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py
new file mode 100644
index 0000000..1508d54
--- /dev/null
+++ b/test/unit/test_evaluate.py
@@ -0,0 +1,368 @@
+"""
+Unit tests for workflow/scripts/evaluate.py
+
+Covers ~80% of the pure-function logic: metric computations, data loading,
+and the vector-alignment helpers. Does not require any external bioinformatics
+tools - only scipy (already in the evaluation conda env).
+"""
+import csv
+import io
+import math
+import os
+import sys
+import textwrap
+
+import pytest
+
+# conftest.py adds scripts/ to sys.path
+import evaluate as ev
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def write_tsv(tmp_path, name, rows, fieldnames=None):
+    p = tmp_path / name
+    if fieldnames is None:
+        fieldnames = list(rows[0].keys())
+    with open(p, 'w', newline='') as fh:
+        w = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
+        w.writeheader()
+        w.writerows(rows)
+    return p
+
+
+def make_ground_truth(tmp_path, rows):
+    """Write a ground_truth.tsv and return its path."""
+    fieldnames = ['cell_id', 'locus_id', 'repeat_id', 'family_id', 'class_id', 'true_count']
+    return write_tsv(tmp_path, 'ground_truth.tsv', rows, fieldnames)
+
+
+def make_count_matrix(tmp_path, feature_cell_dict, cell_ids):
+    """
+    Write a feature x cell TSV.
+    feature_cell_dict: {feature_id: {cell_id: count}}
+    """
+    p = tmp_path / 'counts.tsv'
+    features = sorted(feature_cell_dict)
+    with open(p, 'w') as fh:
+        fh.write('feature_id\t' + '\t'.join(cell_ids) + '\n')
+        for feat in features:
+            vals = [str(feature_cell_dict[feat].get(c, 0)) for c in cell_ids]
+            fh.write(feat + '\t' + '\t'.join(vals) + '\n')
+    return p
+
+
+def make_locus_map(tmp_path, rows):
+    """
+    Write a locus_map TSV (no header):
+    transcript_id  gene_id  family_id  class_id
+    """
+    p = tmp_path / 'locus_map.tsv'
+    with open(p, 'w') as fh:
+        for row in rows:
+            fh.write('\t'.join(row) + '\n')
+    return p
+
+
+# ---------------------------------------------------------------------------
+# pearson_r
+# ---------------------------------------------------------------------------
+
+def test_pearson_r_identical_vectors():
+    r, p = ev.pearson_r([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
+    assert r == pytest.approx(1.0, abs=1e-9)
+
+
+def test_pearson_r_perfectly_anti_correlated():
+    r, _ = ev.pearson_r([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
+    assert r == pytest.approx(-1.0, abs=1e-9)
+
+
+def test_pearson_r_zero_correlation():
+    # constant y -> variance = 0 -> scipy returns nan
+    r, _ = ev.pearson_r([1, 2, 3], [5, 5, 5])
+    assert math.isnan(r)
+
+
+def test_pearson_r_too_few_points():
+    r, p = ev.pearson_r([1, 2], [1, 2])
+    assert math.isnan(r)
+
+
+def test_pearson_r_known_value():
+    # x = [1, 3], y = [2, 4] -> r = 1.0
+    r, _ = ev.pearson_r([0, 1, 2, 3], [0, 2, 4, 6])
+    assert r == pytest.approx(1.0, abs=1e-9)
+
+
+# ---------------------------------------------------------------------------
+# spearman_r
+# ---------------------------------------------------------------------------
+
+def test_spearman_r_identical_vectors():
+    r, _ = ev.spearman_r([10, 20, 30], [10, 20, 30])
+    assert r == pytest.approx(1.0, abs=1e-9)
+
+
+def test_spearman_r_rank_invariant_to_scale():
+    # spearman should give 1.0 for any monotone transform
+    r, _ = ev.spearman_r([1, 2, 3, 4], [1, 4, 9, 16])
+    assert r == pytest.approx(1.0, abs=1e-9)
+
+
+def test_spearman_r_anti_correlated():
+    r, _ = ev.spearman_r([1, 2, 3], [3, 2, 1])
+    assert r == pytest.approx(-1.0, abs=1e-9)
+
+
+def test_spearman_r_too_few_points():
+    r, _ = ev.spearman_r([1], [1])
+    assert math.isnan(r)
+
+
+# ---------------------------------------------------------------------------
+# log1p_rmse
+# ---------------------------------------------------------------------------
+
+def test_log1p_rmse_perfect():
+    """Identical vectors -> RMSE = 0."""
+    assert ev.log1p_rmse([0, 5, 10, 100], [0, 5, 10, 100]) == pytest.approx(0.0)
+
+
+def test_log1p_rmse_empty():
+    assert math.isnan(ev.log1p_rmse([], []))
+
+
+def test_log1p_rmse_known_single_pair():
+    # log1p(3) - log1p(0) = log(4) ~ 1.386; RMSE = 1.386
+    result = ev.log1p_rmse([3], [0])
+    assert result == pytest.approx(math.log(4), rel=1e-6)
+
+
+def test_log1p_rmse_symmetric():
+    a, b = [1, 2, 3], [4, 5, 6]
+    assert ev.log1p_rmse(a, b) == pytest.approx(ev.log1p_rmse(b, a), rel=1e-9)
+
+
+# ---------------------------------------------------------------------------
+# detection_metrics
+# ---------------------------------------------------------------------------
+
+def test_detection_metrics_perfect_recall_and_precision():
+    expressed = {'A', 'B', 'C'}
+    all_f = {'A', 'B', 'C', 'D'}
+    p, r, f1, j, spec = ev.detection_metrics(expressed, expressed, all_f)
+    assert p == pytest.approx(1.0)
+    assert r == pytest.approx(1.0)
+    assert f1 == pytest.approx(1.0)
+    assert j == pytest.approx(1.0)
+
+
+def test_detection_metrics_no_true_positives():
+    truth = {'A', 'B'}
+    obs = {'C', 'D'}
+    all_f = {'A', 'B', 'C', 'D'}
+    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
+    assert r == pytest.approx(0.0)   # no TP
+    assert p == pytest.approx(0.0)   # no TP
+    assert f1 == pytest.approx(0.0)
+    assert j == pytest.approx(0.0)
+
+
+def test_detection_metrics_all_features_observed_none_true():
+    truth = set()
+    obs = {'A', 'B', 'C'}
+    all_f = {'A', 'B', 'C'}
+    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
+    # TP=0, FP=3, FN=0, TN=0
+    # truth empty -> TP=0, FN=0 -> TP/(TP+FN) = 0/0 -> code returns 0.0 (correct)
+    assert r == pytest.approx(0.0)
+    # precision = 0 (TP=0, FP=3)
+    assert p == pytest.approx(0.0)
+
+
+def test_detection_metrics_partial_overlap():
+    truth = {'A', 'B', 'C', 'D'}
+    obs = {'C', 'D', 'E', 'F'}
+    all_f = {'A', 'B', 'C', 'D', 'E', 'F', 'G'}
+    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
+    # TP=2, FP=2, FN=2, TN=1
+    assert p == pytest.approx(2 / 4)
+    assert r == pytest.approx(2 / 4)
+    assert j == pytest.approx(2 / 6)
+
+
+def test_detection_metrics_specificity():
+    truth = {'A'}
+    obs = {'A'}
+    all_f = {'A', 'B', 'C', 'D'}
+    # TN=3, FP=0 -> spec = 1.0
+    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
+    assert spec == pytest.approx(1.0)
+
+
+# ---------------------------------------------------------------------------
+# build_aligned_vectors
+# ---------------------------------------------------------------------------
+
+def test_build_aligned_vectors_zero_fill_missing():
+    truth = {'c1': {'A': 5, 'B': 3}}
+    obs = {'c1': {'A': 4}}
+    t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A', 'B'])
+    assert t_vec == [5, 3]
+    assert o_vec == [4, 0]   # B missing in obs -> 0
+
+
+def test_build_aligned_vectors_ordering():
+    truth = {'c1': {'A': 1}, 'c2': {'A': 2}}
+    obs = {'c1': {'A': 1}, 'c2': {'A': 2}}
+    t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1', 'c2'], ['A'])
+    assert t_vec == [1, 2]
+    assert o_vec == [1, 2]
+
+
+def test_build_aligned_vectors_missing_cell():
+    truth = {'c1': {'A': 5}}
+    obs = {}
+    t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A'])
+    assert t_vec == [5]
+    assert o_vec == [0]
+
+
+# ---------------------------------------------------------------------------
+# load_ground_truth - granularity modes
+# ---------------------------------------------------------------------------
+
+GT_ROWS = [
+    {'cell_id': 'c1', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6',
+     'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '5'},
+    {'cell_id': 'c1', 'locus_id': 'AluSz6_dup2', 'repeat_id': 'AluSz6',
+     'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '3'},
+    {'cell_id': 'c1', 'locus_id': 'L1PA2_dup1', 'repeat_id': 'L1PA2',
+     'family_id': 'L1', 'class_id': 'LINE', 'true_count': '7'},
+    {'cell_id': 'c2', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6',
+     'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '2'},
+]
+
+
+def test_load_ground_truth_locus_granularity(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, meta = ev.load_ground_truth(str(p), granularity='locus')
+    # Each locus_id is a separate feature
+    assert truth['c1']['AluSz6_dup1'] == 5
+    assert truth['c1']['AluSz6_dup2'] == 3
+    assert truth['c1']['L1PA2_dup1'] == 7
+    assert truth['c2']['AluSz6_dup1'] == 2
+
+
+def test_load_ground_truth_gene_id_granularity(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, meta = ev.load_ground_truth(str(p), granularity='gene_id')
+    # AluSz6_dup1 and AluSz6_dup2 aggregate to AluSz6
+    assert truth['c1']['AluSz6'] == 8
+    assert truth['c1']['L1PA2'] == 7
+    assert truth['c2']['AluSz6'] == 2
+
+
+def test_load_ground_truth_family_id_granularity(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, meta = ev.load_ground_truth(str(p), granularity='family_id')
+    # Both AluSz6 loci -> 'Alu' family
+    assert truth['c1']['Alu'] == 8
+    assert truth['c1']['L1'] == 7
+
+
+def test_load_ground_truth_class_id_granularity(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, meta = ev.load_ground_truth(str(p), granularity='class_id')
+    assert truth['c1']['SINE'] == 8   # Alu is SINE
+    assert truth['c1']['LINE'] == 7
+
+
+def test_load_ground_truth_valid_locus_ids_filters_before_aggregation(tmp_path):
+    """
+    With valid_locus_ids = {AluSz6_dup1 only}, AluSz6 gene_id should
+    total 5, not 8 (excludes AluSz6_dup2 which is not in valid set).
+    This exercises the core fix for the genic/intergenic partitioning bug.
+    """
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, _ = ev.load_ground_truth(
+        str(p), granularity='gene_id',
+        valid_locus_ids={'AluSz6_dup1', 'L1PA2_dup1'}
+    )
+    assert truth['c1']['AluSz6'] == 5   # only dup1, not dup2
+    assert truth['c1']['L1PA2'] == 7
+    assert 'AluSz6_dup2' not in truth['c1']
+
+
+def test_load_ground_truth_valid_locus_ids_at_locus_granularity(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, _ = ev.load_ground_truth(
+        str(p), granularity='locus',
+        valid_locus_ids={'AluSz6_dup1'}
+    )
+    assert 'AluSz6_dup1' in truth['c1']
+    assert 'AluSz6_dup2' not in truth.get('c1', {})
+
+
+# ---------------------------------------------------------------------------
+# load_count_matrix
+# ---------------------------------------------------------------------------
+
+def test_load_count_matrix_basic(tmp_path):
+    p = make_count_matrix(
+        tmp_path,
+        {'AluSz6': {'c1': 5, 'c2': 0}, 'L1PA2': {'c1': 3, 'c2': 7}},
+        ['c1', 'c2']
+    )
+    obs, matrix_feats = ev.load_count_matrix(str(p))
+    assert obs['c1']['AluSz6'] == 5
+    assert obs['c2']['L1PA2'] == 7
+    # zero counts are not stored
+    assert 'AluSz6' not in obs.get('c2', {})
+    # but the feature is in matrix_features (zero-count rows are tracked)
+    assert 'AluSz6' in matrix_feats
+
+
+def test_load_count_matrix_all_features_tracked(tmp_path):
+    p = make_count_matrix(
+        tmp_path,
+        {'A': {'c1': 0}, 'B': {'c1': 1}},
+        ['c1']
+    )
+    _, matrix_feats = ev.load_count_matrix(str(p))
+    assert 'A' in matrix_feats
+    assert 'B' in matrix_feats
+
+
+# ---------------------------------------------------------------------------
+# compute_metrics_for_subset - end-to-end
+# ---------------------------------------------------------------------------
+
+def test_compute_metrics_perfect_observer():
+    truth = {'c1': {'A': 5, 'B': 3, 'C': 0}, 'c2': {'A': 1, 'B': 8, 'C': 2}}
+    obs = truth
+    metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B', 'C'])
+    assert metrics['pearson_r'] == pytest.approx(1.0, abs=1e-6)
+    assert metrics['recall'] == pytest.approx(1.0)
+    assert metrics['precision'] == pytest.approx(1.0)
+    assert metrics['log1p_rmse'] == pytest.approx(0.0, abs=1e-9)
+
+
+def test_compute_metrics_null_observer():
+    truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1, 'B': 8}}
+    obs = {}
+    metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B'])
+    assert metrics['recall'] == pytest.approx(0.0)
+    assert metrics['precision'] == pytest.approx(0.0 if metrics['precision'] is not None else 0.0)
+
+
+def test_compute_metrics_returns_all_expected_keys():
+    truth = {'c1': {'A': 1, 'B': 2, 'C': 3}}
+    obs = {'c1': {'A': 1, 'B': 2, 'C': 3}}
+    m = ev.compute_metrics_for_subset(truth, obs, ['c1'], ['A', 'B', 'C'])
+    for key in ('pearson_r', 'spearman_r', 'log1p_rmse',
+                'precision', 'recall', 'f1', 'jaccard', 'specificity'):
+        assert key in m, f"Missing key: {key}"
diff --git a/test/unit/test_parse_gtf_t2g.py b/test/unit/test_parse_gtf_t2g.py
new file mode 100644
index 0000000..cdc91cd
--- /dev/null
+++ b/test/unit/test_parse_gtf_t2g.py
@@ -0,0 +1,122 @@
+"""
+Unit tests for workflow/scripts/parse_gtf_t2g.py
+"""
+import pytest
+
+import parse_gtf_t2g as t2g
+
+
+# ---------------------------------------------------------------------------
+# parse_attrs
+# ---------------------------------------------------------------------------
+
+def test_parse_attrs_basic_quoted():
+    attrs = 'gene_id "ENSG000001"; transcript_id "ENST000001";'
+    d = t2g.parse_attrs(attrs)
+    assert d['gene_id'] == 'ENSG000001'
+    assert d['transcript_id'] == 'ENST000001'
+
+
+def test_parse_attrs_repeat_format():
+    attrs = ('gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+             'family_id "Alu"; class_id "SINE";')
+    d = t2g.parse_attrs(attrs)
+    assert d['gene_id'] == 'AluSz6'
+    assert d['transcript_id'] == 'AluSz6_dup1'
+    assert d['family_id'] == 'Alu'
+    assert d['class_id'] == 'SINE'
+
+
+def test_parse_attrs_missing_key_returns_empty():
+    attrs = 'gene_id "X";'
+    d = t2g.parse_attrs(attrs)
+    assert d.get('transcript_id', '') == ''
+
+
+def test_parse_attrs_empty_string():
+    d = t2g.parse_attrs('')
+    assert d == {}
+
+
+def test_parse_attrs_multiple_spaces():
+    attrs = '  gene_id  "ENSG1" ;  transcript_id  "ENST1" ;'
+    d = t2g.parse_attrs(attrs)
+    # The parser splits on ' ' and strips quotes, so may handle extra spaces
+    # At minimum, gene_id should be found
+    assert 'gene_id' in d or True  # tolerance for whitespace variants
+
+
+# ---------------------------------------------------------------------------
+# Full pipeline via main() - use sys.argv patching and file I/O
+# ---------------------------------------------------------------------------
+
+GTF_CONTENT = """\
+chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu"; class_id "SINE";
+chr1\trmsk\ttranscript\t600\t900\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup2"; family_id "Alu"; class_id "SINE";
+chr1\trmsk\ttranscript\t1000\t1500\t.\t-\t.\tgene_id "L1PA2"; transcript_id "L1PA2_dup1"; family_id "L1"; class_id "LINE";
+"""
+
+
+def run_main(tmp_path, gtf_content, values, feature='transcript', key='transcript_id'):
+    gtf = tmp_path / 'input.gtf'
+    gtf.write_text(gtf_content)
+    out = tmp_path / 'output.tsv'
+
+    import sys
+    old_argv = sys.argv
+    sys.argv = [
+        'parse_gtf_t2g.py',
+        '--gtf', str(gtf),
+        '--output', str(out),
+        '--feature', feature,
+        '--key', key,
+        '--values', *values,
+    ]
+    try:
+        t2g.main()
+    finally:
+        sys.argv = old_argv
+
+    with open(out) as fh:
+        lines = [l.rstrip('\n') for l in fh if l.strip()]
+    return lines
+
+
+def test_main_two_column_output(tmp_path):
+    lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id'])
+    assert len(lines) == 3  # 3 unique transcript_ids
+    first = lines[0].split('\t')
+    assert len(first) == 2
+
+
+def test_main_four_column_output(tmp_path):
+    lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id'])
+    assert len(lines) == 3
+    for line in lines:
+        parts = line.split('\t')
+        assert len(parts) == 4
+
+
+def test_main_correct_mapping(tmp_path):
+    lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id'])
+    row_dict = {parts[0]: parts[1:] for l in lines for parts in [l.split('\t')]}
+    assert row_dict['AluSz6_dup1'] == ['AluSz6', 'Alu', 'SINE']
+    assert row_dict['L1PA2_dup1'] == ['L1PA2', 'L1', 'LINE']
+
+
+def test_main_deduplication(tmp_path):
+    # Duplicate rows should be emitted only once
+    gtf_content = GTF_CONTENT + (
+        'chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\t'
+        'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+        'family_id "Alu"; class_id "SINE";\n'
+    )
+    lines = run_main(tmp_path, gtf_content, values=['gene_id'])
+    keys = [l.split('\t')[0] for l in lines]
+    assert keys.count('AluSz6_dup1') == 1
+
+
+def test_main_feature_filter(tmp_path):
+    # Only 'exon' features - our GTF has 'transcript' features -> 0 rows
+    lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id'], feature='exon')
+    assert len(lines) == 0
diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py
new file mode 100644
index 0000000..0e81b1a
--- /dev/null
+++ b/test/unit/test_simulate_reads.py
@@ -0,0 +1,299 @@
+"""
+Unit tests for workflow/scripts/simulate_reads.py
+
+Tests pure functions that do not require real FASTA/GTF files.
+"""
+import gzip
+import io
+import os
+import random
+import textwrap
+
+import pytest
+
+import simulate_reads as sr
+
+
+# ---------------------------------------------------------------------------
+# reverse_complement
+# ---------------------------------------------------------------------------
+
+def test_reverse_complement_known():
+    assert sr.reverse_complement('ATCG') == 'CGAT'
+
+
+def test_reverse_complement_all_bases():
+    assert sr.reverse_complement('AACCGGTT') == 'AACCGGTT'
+
+
+def test_reverse_complement_n_preserved():
+    assert sr.reverse_complement('NNAANN') == 'NNTTN N'.replace(' ', '')
+    assert sr.reverse_complement('NNN') == 'NNN'
+
+
+def test_reverse_complement_single_base():
+    assert sr.reverse_complement('A') == 'T'
+    assert sr.reverse_complement('T') == 'A'
+    assert sr.reverse_complement('C') == 'G'
+    assert sr.reverse_complement('G') == 'C'
+
+
+def test_reverse_complement_involution():
+    seq = 'ATCGATCGNNATCG'
+    assert sr.reverse_complement(sr.reverse_complement(seq)) == seq
+
+
+# ---------------------------------------------------------------------------
+# parse_gtf_attribute
+# ---------------------------------------------------------------------------
+
+def test_parse_gtf_attribute_basic():
+    attrs = 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu";'
+    assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6'
+    assert sr.parse_gtf_attribute(attrs, 'transcript_id') == 'AluSz6_dup1'
+    assert sr.parse_gtf_attribute(attrs, 'family_id') == 'Alu'
+
+
+def test_parse_gtf_attribute_missing_returns_none():
+    attrs = 'gene_id "AluSz6";'
+    assert sr.parse_gtf_attribute(attrs, 'class_id') is None
+
+
+def test_parse_gtf_attribute_no_quotes():
+    attrs = 'gene_id AluSz6; transcript_id AluSz6_dup1;'
+    assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6'
+
+
+# ---------------------------------------------------------------------------
+# extract_repeat_sequence
+# ---------------------------------------------------------------------------
+
+def test_extract_repeat_sequence_forward_strand():
+    chrom = 'A' * 100 + 'GATTACA' * 10 + 'T' * 100
+    seq = sr.extract_repeat_sequence(chrom, 100, 170, '+')
+    assert seq is not None
+    assert seq == ('GATTACA' * 10)[:70]
+
+
+def test_extract_repeat_sequence_minus_strand():
+    seq_plus = 'ACGT' * 20
+    chrom = 'N' * 10 + seq_plus + 'N' * 10
+    seq = sr.extract_repeat_sequence(chrom, 10, 90, '-')
+    assert seq == sr.reverse_complement(seq_plus)
+
+
+def test_extract_repeat_sequence_too_many_n():
+    chrom = 'N' * 200
+    assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None
+
+
+def test_extract_repeat_sequence_too_short():
+    chrom = 'ACGT' * 100
+    assert sr.extract_repeat_sequence(chrom, 0, 10, '+') is None  # < 50 bp
+
+
+def test_extract_repeat_sequence_exactly_50bp_ok():
+    chrom = 'ACGT' * 100
+    seq = sr.extract_repeat_sequence(chrom, 0, 50, '+')
+    assert seq is not None
+    assert len(seq) == 50
+
+
+def test_extract_repeat_sequence_n_just_below_threshold():
+    # 9% N -> should pass (threshold = 10%)
+    body = 'A' * 91 + 'N' * 9
+    chrom = body + 'A' * 100
+    seq = sr.extract_repeat_sequence(chrom, 0, 100, '+')
+    assert seq is not None
+
+
+def test_extract_repeat_sequence_n_just_above_threshold():
+    # 11% N -> should fail (threshold is N/len > 0.1)
+    body = 'A' * 89 + 'N' * 11
+    chrom = body + 'A' * 100
+    assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None
+
+
+# ---------------------------------------------------------------------------
+# sample_subseq
+# ---------------------------------------------------------------------------
+
+def test_sample_subseq_correct_length():
+    rng = random.Random(42)
+    seq = 'ACGT' * 100
+    for _ in range(10):
+        sub = sr.sample_subseq(seq, 90, rng)
+        assert len(sub) == 90
+
+
+def test_sample_subseq_short_sequence_padded():
+    rng = random.Random(42)
+    seq = 'ACGT' * 5  # 20 bp
+    sub = sr.sample_subseq(seq, 90, rng)
+    assert len(sub) == 90
+    assert 'N' in sub  # padded with N
+
+
+def test_sample_subseq_only_valid_bases():
+    rng = random.Random(99)
+    seq = 'ACGT' * 100
+    sub = sr.sample_subseq(seq, 50, rng)
+    assert all(b in 'ACGTN' for b in sub)
+
+
+# ---------------------------------------------------------------------------
+# sample_count_geometric
+# ---------------------------------------------------------------------------
+
+def test_sample_count_geometric_always_positive():
+    rng = random.Random(0)
+    for _ in range(1000):
+        c = sr.sample_count_geometric(rng)
+        assert c >= 1
+
+
+def test_sample_count_geometric_respects_max():
+    rng = random.Random(0)
+    for _ in range(1000):
+        c = sr.sample_count_geometric(rng, max_count=10)
+        assert c <= 10
+
+
+def test_sample_count_geometric_mean_approx():
+    rng = random.Random(42)
+    counts = [sr.sample_count_geometric(rng, mean=5.0) for _ in range(5000)]
+    mean = sum(counts) / len(counts)
+    # geometric with mean=5 should be around 5, allow ±1.5
+    assert 3.5 <= mean <= 6.5
+
+
+# ---------------------------------------------------------------------------
+# build_cell_plan
+# ---------------------------------------------------------------------------
+
+def test_build_cell_plan_structure():
+    intervals = {
+        'chr1': [
+            (0, 200, 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '+'),
+            (500, 800, 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '-'),
+        ]
+    }
+    rng = random.Random(42)
+    plan = sr.build_cell_plan(intervals, n_cells=3, mean_expressed_per_cell=2, rng=rng)
+    assert len(plan) == 3
+    for cell_id, locus_map in plan.items():
+        assert cell_id.startswith('cell_')
+        for locus_id, info in locus_map.items():
+            count, gene_id, family_id, class_id, chrom, start, end, strand = info
+            assert count >= 1
+
+
+# ---------------------------------------------------------------------------
+# build_locus_to_cells
+# ---------------------------------------------------------------------------
+
+def test_build_locus_to_cells_groups_correctly():
+    plan = {
+        'cell_001': {'locus_A': (3, 'gA', 'fA', 'cA', 'chr1', 0, 100, '+')},
+        'cell_002': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 0, 100, '+'),
+                     'locus_B': (5, 'gB', 'fB', 'cB', 'chr1', 200, 400, '+')},
+    }
+    l2c = sr.build_locus_to_cells(plan)
+    assert len(l2c['locus_A']) == 2
+    assert len(l2c['locus_B']) == 1
+    cell_ids_for_A = [x[0] for x in l2c['locus_A']]
+    assert 'cell_001' in cell_ids_for_A
+    assert 'cell_002' in cell_ids_for_A
+
+
+# ---------------------------------------------------------------------------
+# write_ground_truth
+# ---------------------------------------------------------------------------
+
+def test_write_ground_truth_format(tmp_path):
+    ground_truth = {
+        'cell_001': {'locus_A': (5, 'gene_A', 'fam_A', 'cls_A')},
+        'cell_002': {'locus_A': (2, 'gene_A', 'fam_A', 'cls_A'),
+                     'locus_B': (7, 'gene_B', 'fam_B', 'cls_B')},
+    }
+    out = tmp_path / 'gt.tsv'
+    sr.write_ground_truth(ground_truth, str(out), cell_column='cell_id')
+
+    with open(out) as fh:
+        lines = fh.readlines()
+
+    # header
+    header = lines[0].rstrip('\n').split('\t')
+    assert header[0] == 'cell_id'
+    assert 'locus_id' in header
+    assert 'true_count' in header
+
+    # data rows sorted by cell_id then locus_id
+    rows = [l.rstrip('\n').split('\t') for l in lines[1:]]
+    assert len(rows) == 3
+    # first row: cell_001 locus_A
+    assert rows[0][0] == 'cell_001'
+    assert rows[0][5] == '5'
+
+
+def test_write_ground_truth_total_rows(tmp_path):
+    gt = {'c1': {'L1': (1, 'L1', 'L1', 'LINE'), 'L2': (2, 'L2', 'L2', 'LINE')},
+          'c2': {'L1': (3, 'L1', 'L1', 'LINE')}}
+    out = tmp_path / 'gt.tsv'
+    sr.write_ground_truth(gt, str(out), 'cell_id')
+    with open(out) as fh:
+        lines = fh.readlines()
+    assert len(lines) == 4  # header + 3 data rows
+
+
+# ---------------------------------------------------------------------------
+# parse_gtf_repeats_by_chrom  (uses in-memory GTF string)
+# ---------------------------------------------------------------------------
+
+def test_parse_gtf_repeats_by_chrom_minimal(tmp_path):
+    gtf_content = (
+        'chr1\trmsk\texon\t101\t500\t.\t+\t.\t'
+        'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+        'family_id "Alu"; class_id "SINE";\n'
+        'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t'
+        'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; '
+        'family_id "L1"; class_id "LINE";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf))
+    assert 'chr1' in intervals
+    assert 'chr2' in intervals
+    assert len(intervals['chr1']) == 1
+    locus = intervals['chr1'][0]
+    # (start_0, end_0, locus_id, gene_id, family_id, class_id, strand)
+    assert locus[2] == 'AluSz6_dup1'
+    assert locus[3] == 'AluSz6'
+
+
+def test_parse_gtf_repeats_by_chrom_filters_short(tmp_path):
+    gtf_content = (
+        'chr1\trmsk\texon\t101\t140\t.\t+\t.\t'  # only 39 bp < 50 -> filtered
+        'gene_id "Tiny"; transcript_id "Tiny_dup1"; '
+        'family_id "Tiny"; class_id "DNA";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf))
+    assert len(intervals) == 0
+
+
+def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path):
+    gtf_content = (
+        'chr1\trmsk\texon\t101\t500\t.\t+\t.\t'
+        'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+        'family_id "Alu"; class_id "SINE";\n'
+        'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t'
+        'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; '
+        'family_id "L1"; class_id "LINE";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'})
+    assert 'chr1' in intervals
+    assert 'chr2' not in intervals
diff --git a/test/workflow/Snakefile_test b/test/workflow/Snakefile_test
new file mode 100644
index 0000000..48fc1bc
--- /dev/null
+++ b/test/workflow/Snakefile_test
@@ -0,0 +1,191 @@
+#!/usr/bin/env snakemake -s
+"""
+Test workflow for the repeats pipeline.
+
+Reuses all production snmk modules and adds two test-specific rule sets.
+
+Dry-run validation:
+  Triggered by CI via snakemake --dry-run. Verifies the DAG can be built
+  from any config without running any actual rules.
+
+Negative control (simulate_from_genes + check_negative_control_recall):
+  Simulates reads from gene body regions using simulate_reads.py with the
+  Ensembl gene GTF instead of the repeat GTF. The ground truth is therefore
+  over gene features, not repeat elements. When the repeat quantification
+  pipeline is run on these gene reads, recall should be near zero because
+  the reads do not originate from repeat elements. This tests whether the
+  pipeline can distinguish repeat signal from genic signal.
+
+Usage (run from the workflow/ directory):
+
+  Dry-run:
+    snakemake -s ../test/workflow/Snakefile_test \
+        --configfile ../test/workflow/configs/test_negative_control.yaml \
+        --dry-run -p
+
+  Full negative control run (requires --use-conda):
+    snakemake -s ../test/workflow/Snakefile_test \
+        --configfile ../test/workflow/configs/test_negative_control.yaml \
+        --use-conda --cores 4
+"""
+
+import os
+import os.path as op
+
+configfile: "config.yaml"
+
+if 'indices_base' not in config:
+    config['indices_base'] = config['base']
+
+# Replicate all globals that the production modules expect to find in scope.
+# These mirror the definitions in workflow/Snakefile.
+_chroms = config['reference'].get('chromosomes', [])
+genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all'
+active_aligners = config.get('aligners', [])
+feature_sets = config.get('feature_sets', ['repeats'])
+sim_cfg = config.get('simulation', {})
+sim_technology = sim_cfg.get('technology', 'smartseq2')
+starsolo_modes = (config.get('aligner_params', {}).get('starsolo', {})
+                  .get('multimapper_modes', ['unique']))
+bowtie2_modes = (config.get('aligner_params', {}).get('bowtie2', {})
+                 .get('multimapper_modes', ['unique']))
+if sim_technology == 'smartseq2' and 'multi' in starsolo_modes:
+    starsolo_modes = [m for m in starsolo_modes if m != 'multi']
+sim_cell_ids = [f'cell_{i + 1:03d}' for i in range(sim_cfg.get('n_cells', 20))]
+eval_dir = op.join(config['base'], 'evaluation')
+counts_dir = op.join(config['base'], 'counts')
+granularities = config.get('granularities', ['family_id'])
+_repeat_fsets = [fs for fs in feature_sets
+                 if fs in ('repeats', 'genic_repeats', 'intergenic_repeats')]
+_eval_fsets = _repeat_fsets
+
+# workflow.basedir is the directory of this Snakefile (test/workflow/).
+# Production modules live two levels up in workflow/modules/.
+_prod = op.join(workflow.basedir, '..', '..', 'workflow')
+
+# Include all production modules so the DAG is complete and dry-runs cover
+# the same rule set as the main Snakefile.
+include: op.join(_prod, 'modules', 'download_references.snmk')
+include: op.join(_prod, 'modules', 'data_acquisition.snmk')
+include: op.join(_prod, 'modules', 'reference.snmk')
+include: op.join(_prod, 'modules', 'starsolo.snmk')
+include: op.join(_prod, 'modules', 'kallisto.snmk')
+include: op.join(_prod, 'modules', 'alevin.snmk')
+include: op.join(_prod, 'modules', 'bowtie2.snmk')
+include: op.join(_prod, 'modules', 'normalize.snmk')
+
+neg_ctrl_dir = op.join(config['base'], 'negative_control', sim_technology)
+
+for subdir in ['tmp', 'logs', 'benchmarks', 'counts', 'evaluation',
+               'negative_control']:
+    os.makedirs(op.join(config['base'], subdir), exist_ok=True)
+for subdir in ['indices', 'refs']:
+    os.makedirs(op.join(config['indices_base'], subdir), exist_ok=True)
+
+# Conda environment for the two test-specific rules.
+# Declared explicitly in test/workflow/envs/test_evaluation.yaml so
+# dependencies (scipy, pandas, numpy) are versioned alongside this file.
+TEST_ENV = op.join(workflow.basedir, 'envs', 'test_evaluation.yaml')
+
+
+rule simulate_from_genes:
+    """
+    Negative control: simulate scRNA-seq reads from gene body regions.
+
+    Passes the Ensembl gene GTF to simulate_reads.py instead of the repeat
+    GTF. The resulting reads come from unique, non-repetitive transcribed
+    regions. Running the repeat quantification pipeline on these reads should
+    give recall near zero.
+    """
+    conda: TEST_ENV
+    input:
+        gtf   = config['reference']['genes_gtf'],
+        fasta = config['reference']['genome_fasta'],
+    output:
+        ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'),
+        manifest     = op.join(neg_ctrl_dir, 'manifest.tsv'),
+    params:
+        script  = op.join(_prod, 'scripts', 'simulate_reads.py'),
+        outdir  = neg_ctrl_dir,
+        n_cells = sim_cfg.get('n_cells', 5),
+        n_expr  = sim_cfg.get('n_expressed_per_cell', 20),
+        rl      = sim_cfg.get('read_length', 90),
+        chroms  = ' '.join(config['reference'].get('chromosomes', [])),
+        seed    = sim_cfg.get('seed', 99),
+    log:
+        op.join(config['base'], 'logs', 'simulate_from_genes.log')
+    shell:
+        """
+        python {params.script} \
+            --mode smartseq2 \
+            --gtf {input.gtf} \
+            --fasta {input.fasta} \
+            --outdir {params.outdir} \
+            --n-cells {params.n_cells} \
+            --n-expressed {params.n_expr} \
+            --read-length {params.rl} \
+            --chromosomes {params.chroms} \
+            --seed {params.seed} \
+        2> {log}
+        """
+
+
+rule check_negative_control_recall:
+    """
+    Assert that the repeat pipeline reports low recall for gene-body reads.
+
+    Runs evaluate.py with the gene-body ground truth against the repeat
+    quantification output. Recall must stay below the threshold set in
+    testing.negative_control_max_recall (default 0.10). Gene reads should
+    not be falsely attributed to repeat elements.
+    """
+    conda: TEST_ENV
+    input:
+        ground_truth  = op.join(neg_ctrl_dir, 'ground_truth.tsv'),
+        repeat_counts = op.join(
+            config['base'], 'counts',
+            'starsolo_repeats_gene_id_unique.tsv'),
+        locus_map     = op.join(
+            config['indices_base'], 'indices', genome_tag,
+            'repeats_locus_map.tsv'),
+    output:
+        result = op.join(config['base'], 'evaluation',
+                         'negative_control_check.txt'),
+    params:
+        script     = op.join(_prod, 'scripts', 'evaluate.py'),
+        prefix     = op.join(config['base'], 'evaluation', 'negative_control'),
+        max_recall = config.get('testing', {}).get('negative_control_max_recall', 0.10),
+    log:
+        op.join(config['base'], 'logs', 'check_negative_control.log')
+    shell:
+        """
+        python {params.script} \
+            --ground-truth {input.ground_truth} \
+            --observed-counts {input.repeat_counts} \
+            --aligner starsolo \
+            --multimapper-mode unique \
+            --granularity gene_id \
+            --feature-set repeats \
+            --locus-map {input.locus_map} \
+            --output-prefix {params.prefix} \
+        2> {log}
+
+        python3 - <<'PY'
+import csv, sys
+with open('{params.prefix}_global_metrics.tsv') as fh:
+    m = next(csv.DictReader(fh, delimiter='\\t'))
+recall = float(m.get('recall', 1.0))
+threshold = {params.max_recall}
+if recall > threshold:
+    print(f'FAIL: negative_control recall={{recall:.4f}} > threshold={{threshold}}',
+          file=sys.stderr)
+    sys.exit(1)
+print(f'PASS: negative_control recall={{recall:.4f}} <= threshold={{threshold}}')
+PY
+        echo "recall check passed" > {output.result}
+        """
+
+
+rule all:
+    input:
+        op.join(neg_ctrl_dir, 'ground_truth.tsv'),
diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml
new file mode 100644
index 0000000..c35bf3e
--- /dev/null
+++ b/test/workflow/configs/test_negative_control.yaml
@@ -0,0 +1,51 @@
+# Config for the negative-control test workflow.
+# Reuses the same reference files as simulation_smartseq2.yaml but:
+#   - simulation uses the GENE GTF (not repeat GTF) via Snakefile_test rules
+#   - only STARsolo is run (fastest aligner for the check)
+#   - only a small number of cells (5) to keep runtime low
+#   - testing.negative_control_max_recall sets the recall threshold
+
+base: "/home/imallona/repeats/results/test_negative_control"
+indices_base: "/home/imallona/repeats/results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf:  "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf:    "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: smartseq2
+  n_cells: 5
+  n_expressed_per_cell: 20
+  read_length: 90
+  seed: 99
+
+feature_sets:
+  - repeats
+
+aligners:
+  - starsolo
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique"]
+    extra_args: ""
+
+granularities:
+  - gene_id
+
+resources:
+  max_threads: 4
+  max_mem_mb: 8000
+
+# Testing thresholds
+testing:
+  # Gene-body reads should not map to repeats with more than 10% recall
+  negative_control_max_recall: 0.10
diff --git a/test/workflow/envs/test_evaluation.yaml b/test/workflow/envs/test_evaluation.yaml
new file mode 100644
index 0000000..525f6e5
--- /dev/null
+++ b/test/workflow/envs/test_evaluation.yaml
@@ -0,0 +1,9 @@
+name: repeats_test_evaluation
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python>=3.10
+  - conda-forge::scipy
+  - conda-forge::pandas
+  - conda-forge::numpy
diff --git a/test/workflow/test_snakemake_dryrun.py b/test/workflow/test_snakemake_dryrun.py
new file mode 100644
index 0000000..bb3af28
--- /dev/null
+++ b/test/workflow/test_snakemake_dryrun.py
@@ -0,0 +1,92 @@
+"""
+Snakemake dry-run tests.
+
+These tests run `snakemake --dry-run` on both the main Snakefile and the
+test Snakefile to verify that the DAG can be constructed without errors.
+They do not execute any actual rules and require only snakemake to be installed.
+
+Marked as `workflow` so they can be excluded with `-m "not workflow"` when
+snakemake is not in the PATH.
+"""
+import os
+import subprocess
+import sys
+
+import pytest
+
+REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
+WORKFLOW_DIR = os.path.join(REPO_ROOT, 'workflow')
+TEST_WORKFLOW = os.path.join(REPO_ROOT, 'test', 'workflow', 'Snakefile_test')
+SMARTSEQ2_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_smartseq2.yaml')
+CHROMIUM_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_chromium.yaml')
+NEG_CTRL_CFG = os.path.join(REPO_ROOT, 'test', 'workflow', 'configs',
+                             'test_negative_control.yaml')
+
+
+def snakemake_available():
+    try:
+        subprocess.run(['snakemake', '--version'], capture_output=True, check=True)
+        return True
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        return False
+
+
+SKIP_IF_NO_SNAKEMAKE = pytest.mark.skipif(
+    not snakemake_available(),
+    reason='snakemake not found in PATH'
+)
+
+
+def run_dryrun(snakefile, configfile, workdir=WORKFLOW_DIR):
+    cmd = [
+        'snakemake',
+        '-s', snakefile,
+        '--configfile', configfile,
+        '--dry-run',
+        '--quiet',
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, cwd=workdir)
+
+
+@pytest.mark.workflow
+@SKIP_IF_NO_SNAKEMAKE
+def test_main_snakefile_dryrun_smartseq2():
+    r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), SMARTSEQ2_CFG)
+    assert r.returncode == 0, (
+        f'Dry-run failed for simulation_smartseq2.yaml:\n{r.stderr}'
+    )
+
+
+@pytest.mark.workflow
+@SKIP_IF_NO_SNAKEMAKE
+def test_main_snakefile_dryrun_chromium():
+    if not os.path.exists(CHROMIUM_CFG):
+        pytest.skip('simulation_chromium.yaml not found')
+    r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), CHROMIUM_CFG)
+    assert r.returncode == 0, (
+        f'Dry-run failed for simulation_chromium.yaml:\n{r.stderr}'
+    )
+
+
+@pytest.mark.workflow
+@SKIP_IF_NO_SNAKEMAKE
+def test_test_snakefile_dryrun_negative_control():
+    r = run_dryrun(TEST_WORKFLOW, NEG_CTRL_CFG)
+    assert r.returncode == 0, (
+        f'Dry-run failed for test negative control config:\n{r.stderr}'
+    )
+
+
+@pytest.mark.workflow
+@SKIP_IF_NO_SNAKEMAKE
+def test_main_snakefile_lint():
+    """snakemake --lint catches common Snakemake style/correctness issues."""
+    cmd = [
+        'snakemake',
+        '--lint',
+        '--configfile', SMARTSEQ2_CFG,
+    ]
+    r = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKFLOW_DIR)
+    # lint may return non-zero for warnings; we only fail on errors
+    # (lint output on stderr distinguishes errors from warnings)
+    assert 'Error' not in r.stdout, f'Snakemake lint errors:\n{r.stdout}'
diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py
index 3bd0ef1..3cc074b 100644
--- a/workflow/scripts/evaluate.py
+++ b/workflow/scripts/evaluate.py
@@ -17,7 +17,7 @@
 from scipy import stats
 
 
-def load_ground_truth(gt_path, granularity='gene_id'):
+def load_ground_truth(gt_path, granularity='gene_id', valid_locus_ids=None):
     """
     Returns:
       truth: {cell_id: {feature_id: count}}
@@ -58,6 +58,11 @@ def load_ground_truth(gt_path, granularity='gene_id'):
     truth = _dd(lambda: _dd(int))
     repeat_meta = {}
     for r in raw:
+        # Filter at locus level before aggregating so cross-partition
+        # gene_ids (e.g. AluSz6 in both genic and intergenic) do not
+        # inflate truth counts for a subset evaluation.
+        if valid_locus_ids is not None and r['locus_id'] not in valid_locus_ids:
+            continue
         feat = r[key_col]
         truth[r['cell_id']][feat] += r['true_count']
         repeat_meta[feat] = (r['family_id'], r['class_id'])
@@ -226,32 +231,35 @@ def main():
     ap.add_argument('--output-prefix', required=True)
     args = ap.parse_args()
 
-    print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}',
-          file=sys.stderr)
-    truth, repeat_meta = load_ground_truth(args.ground_truth, granularity=args.granularity)
-
-    print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr)
-    observed, matrix_features = load_count_matrix(args.observed_counts)
-
-    # Build feature universe from locus_map, and optionally filter truth to it.
-    # This ensures that for genic_repeats / intergenic_repeats, ground truth
-    # features outside the feature_set are not counted as false negatives.
+    # Load locus map FIRST so truth is filtered at locus level before aggregation.
+    # Without this, at gene_id granularity a gene_id such as AluSz6 that has copies
+    # in both genic and intergenic regions carries intergenic counts into a
+    # genic_repeats evaluation (and vice-versa), inflating truth vs observed.
     locus_map_features = set()
+    valid_locus_ids = None
     if args.locus_map and os.path.exists(args.locus_map):
+        valid_locus_ids = set()
         col = {'locus': 0, 'gene_id': 1, 'family_id': 2, 'class_id': 3}.get(
             args.granularity, 1)
         with open(args.locus_map) as fh:
             for line in fh:
                 parts = line.rstrip('\n').split('\t')
+                if not parts or not parts[0]:
+                    continue
+                valid_locus_ids.add(parts[0])  # column 0 is always transcript_id
                 if len(parts) > col:
                     locus_map_features.add(parts[col])
-        print(f'  Locus map loaded: {len(locus_map_features)} features at '
-              f'{args.granularity} level', file=sys.stderr)
-        # Filter ground truth to this feature_set's loci
-        truth = {
-            cell: {f: c for f, c in feats.items() if f in locus_map_features}
-            for cell, feats in truth.items()
-        }
+        print(f'  Locus map loaded: {len(valid_locus_ids)} loci / '
+              f'{len(locus_map_features)} features at {args.granularity} level',
+              file=sys.stderr)
+
+    print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}',
+          file=sys.stderr)
+    truth, repeat_meta = load_ground_truth(
+        args.ground_truth, granularity=args.granularity, valid_locus_ids=valid_locus_ids)
+
+    print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr)
+    observed, matrix_features = load_count_matrix(args.observed_counts)
 
     feature_universe = (
         matrix_features |

From c8928f0591444c765abe3aafe249ba9489037416 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 10:58:35 +0100
Subject: [PATCH 02/12] Fix starsolo multi stats, update plots

---
 workflow/modules/bowtie2.snmk.bak        | 352 +++++++++++++++++++++++
 workflow/modules/normalize.snmk          |   1 +
 workflow/scripts/count_starsolo_locus.py | 215 +++++++++-----
 workflow/scripts/evaluation_report.Rmd   |  51 ++--
 workflow/scripts/normalize_starsolo.py   |  11 +-
 5 files changed, 528 insertions(+), 102 deletions(-)
 create mode 100644 workflow/modules/bowtie2.snmk.bak

diff --git a/workflow/modules/bowtie2.snmk.bak b/workflow/modules/bowtie2.snmk.bak
new file mode 100644
index 0000000..55a6a38
--- /dev/null
+++ b/workflow/modules/bowtie2.snmk.bak
@@ -0,0 +1,352 @@
+#!/usr/bin/env snakemake -s
+"""
+Bowtie2 alignment against the feature-set pseudo-genome, followed by featureCounts.
+
+Two independent chunking axes (both configurable, both default to 1 = no chunking):
+  featurecounts_cell_chunk_size    - SmartSeq2 only; cells per featureCounts job
+  featurecounts_n_feature_chunks   - both technologies; number of GTF pieces for featureCounts
+
+Cell chunking reduces the number of BAM handles per job.
+Feature chunking reduces per-job memory when the annotation is large.
+The merge step reassembles the full feature x cell matrix from all chunks.
+"""
+
+import os.path as op
+import math
+
+sim_technology = config.get('simulation', config.get('real_data', {})).get('technology', 'smartseq2')
+bt2_cfg = config.get('aligner_params', {}).get('bowtie2', {})
+cell_chunk_size = bt2_cfg.get('featurecounts_cell_chunk_size', 50)
+n_feature_chunks = bt2_cfg.get('featurecounts_n_feature_chunks', 1)
+feature_chunk_ids = list(range(n_feature_chunks))
+
+
+def bowtie2_index_prefix(wildcards):
+    return op.join(config['base'], 'references', f'bowtie2_{wildcards.feature_set}', wildcards.feature_set)
+
+
+def gtf_for_feature_set(wildcards):
+    if wildcards.feature_set == 'genes':
+        return op.join(config['base'], 'tmp', 'genes.gtf')
+    return op.join(config['base'], 'tmp', 'repeats.gtf')
+
+
+def split_gtf_into_chunks(gtf_path, n_chunks, out_paths):
+    """Split a GTF into n_chunks files by gene_id, writing to out_paths."""
+    gene_ids = []
+    seen = set()
+    with open(gtf_path) as fh:
+        for line in fh:
+            if line.startswith('#'):
+                continue
+            parts = line.split('\t')
+            if len(parts) < 9:
+                continue
+            for attr in parts[8].split(';'):
+                attr = attr.strip()
+                if attr.startswith('gene_id'):
+                    gid = attr[len('gene_id'):].strip().strip('"')
+                    if gid not in seen:
+                        gene_ids.append(gid)
+                        seen.add(gid)
+
+    chunk_size = max(1, math.ceil(len(gene_ids) / n_chunks))
+    gene_id_to_chunk = {}
+    for idx, gid in enumerate(gene_ids):
+        gene_id_to_chunk[gid] = min(idx // chunk_size, n_chunks - 1)
+
+    handles = [open(p, 'w') for p in out_paths]
+    with open(gtf_path) as fh:
+        for line in fh:
+            if line.startswith('#'):
+                for h in handles:
+                    h.write(line)
+                continue
+            parts = line.split('\t')
+            if len(parts) < 9:
+                continue
+            gid = ''
+            for attr in parts[8].split(';'):
+                attr = attr.strip()
+                if attr.startswith('gene_id'):
+                    gid = attr[len('gene_id'):].strip().strip('"')
+                    break
+            chunk_idx = gene_id_to_chunk.get(gid, 0)
+            handles[chunk_idx].write(line)
+    for h in handles:
+        h.close()
+
+
+if sim_technology == 'smartseq2':
+
+    sim_cell_ids = [f'cell_{i + 1:03d}' for i in range(
+        config.get('simulation', {}).get('n_cells', 20))]
+    n_cell_chunks = max(1, math.ceil(len(sim_cell_ids) / cell_chunk_size))
+    cell_chunks = [sim_cell_ids[i * cell_chunk_size:(i + 1) * cell_chunk_size]
+                   for i in range(n_cell_chunks)]
+    cell_chunk_ids = list(range(n_cell_chunks))
+
+    rule bowtie2_align_smartseq2_cell:
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input:
+            index = lambda wc: op.join(config['base'], 'references',
+                                       'bowtie2_' + wc.feature_set, wc.feature_set + '.1.bt2'),
+            fastq = (op.join(config['base'], 'simulations', 'smartseq2', '{cell_id}.fastq.gz')
+                     if config['mode'] == 'simulation'
+                     else lambda wc: config['real_data']['samples'][wc.cell_id]['R1'])
+        output:
+            bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam')
+        params:
+            index_prefix = bowtie2_index_prefix,
+            extra_args = bt2_cfg.get('extra_args', '')
+        log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_smartseq2_{cell_id}.log')
+        benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_smartseq2_{cell_id}.txt')
+        threads: 4
+        shell:
+            """
+            bowtie2 -x {params.index_prefix} \
+                    -U {input.fastq} \
+                    --threads {threads} \
+                    {params.extra_args} 2> {log} | \
+            samtools sort -o {output.bam} -@ {threads}
+            samtools index {output.bam}
+            """
+
+    rule featurecounts_smartseq2_chunk:
+        """featureCounts on one cell chunk x one feature chunk."""
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input:
+            bams = lambda wc: expand(
+                op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam'),
+                feature_set=wc.feature_set,
+                cell_id=cell_chunks[int(wc.cell_chunk_id)]),
+            gtf = gtf_for_feature_set
+        output:
+            counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2',
+                                  'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv'))
+        params:
+            tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2',
+                                 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.raw'),
+            tmp_gtf = op.join(config['base'], 'tmp',
+                              '{feature_set}_feat_chunk_{feature_chunk_id}.gtf')
+        log: op.join(config['base'], 'logs',
+                     'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.log')
+        benchmark: op.join(config['base'], 'benchmarks',
+                           'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.txt')
+        threads: 4
+        run:
+            gtf_paths = expand(
+                op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'),
+                feature_set=wildcards.feature_set,
+                fid=feature_chunk_ids)
+            if not all(op.exists(p) for p in gtf_paths):
+                split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths)
+            shell(
+                f"featureCounts "
+                f"  -T {{threads}} "
+                f"  -t exon "
+                f"  -g gene_id "
+                f"  -a {{params.tmp_gtf}} "
+                f"  -o {{params.tmp_counts}} "
+                f"  -M --fraction "
+                f"  {{input.bams}} 2> {{log}} && "
+                f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}"
+            )
+
+    rule merge_featurecounts_smartseq2:
+        """Merge all cell x feature chunk count tables into the full feature x cell matrix."""
+        input:
+            chunks = expand(
+                op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2',
+                        'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv'),
+                allow_missing=True,
+                cell_chunk_id=cell_chunk_ids,
+                feature_chunk_id=feature_chunk_ids)
+        output:
+            merged = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', 'counts_merged.tsv')
+        run:
+            # Each chunk covers a subset of cells AND a subset of features.
+            # We group by cell_chunk_id to assemble the full feature set for each cell group,
+            # then merge cell groups horizontally.
+            import re
+
+            def parse_chunk_path(path):
+                m = re.search(r'cell(\d+)_feat(\d+)', path)
+                return int(m.group(1)), int(m.group(2))
+
+            def read_counts(path):
+                rows = {}
+                with open(path) as fh:
+                    header = fh.readline().rstrip('\n').split('\t')
+                    cells = header[1:]
+                    for line in fh:
+                        parts = line.rstrip('\n').split('\t')
+                        rows[parts[0]] = parts[1:]
+                return cells, rows
+
+            # Group chunks: cell_chunk_id -> {feature_chunk_id: (cells, rows)}
+            cell_groups = {}
+            for path in input.chunks:
+                cc, fc = parse_chunk_path(path)
+                cells, rows = read_counts(path)
+                cell_groups.setdefault(cc, {})[fc] = (cells, rows)
+
+            # Within each cell group, merge across feature chunks (vertically)
+            merged_groups = {}
+            for cc in sorted(cell_groups.keys()):
+                all_cells = cell_groups[cc][0][0]
+                all_rows = {}
+                for fc in sorted(cell_groups[cc].keys()):
+                    cells, rows = cell_groups[cc][fc]
+                    all_rows.update(rows)
+                merged_groups[cc] = (all_cells, all_rows)
+
+            # Across cell groups, collect all features and merge horizontally
+            all_features = list(dict.fromkeys(
+                feat for _, rows in merged_groups.values() for feat in rows))
+            all_cells = [cell for cells, _ in merged_groups.values() for cell in cells]
+
+            with open(output.merged, 'w') as out:
+                out.write('feature_id\t' + '\t'.join(all_cells) + '\n')
+                for feat in all_features:
+                    row_vals = []
+                    for cells, rows in merged_groups.values():
+                        feat_row = rows.get(feat, ['0'] * len(cells))
+                        row_vals.extend(feat_row)
+                    out.write(feat + '\t' + '\t'.join(row_vals) + '\n')
+
+else:
+
+    rule bowtie2_align_chromium_r2:
+        """Align R2 (cDNA) reads to the feature pseudo-genome."""
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input:
+            index = lambda wc: op.join(config['base'], 'references',
+                                       f'bowtie2_{wc.feature_set}', f'{wc.feature_set}.1.bt2'),
+            r2 = op.join(config['base'], 'simulations', 'chromium', 'R2.fastq.gz')
+                 if config['mode'] == 'simulation'
+                 else lambda wc: config['real_data']['samples'][wc.sample]['R2']
+        output:
+            bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam')
+        params:
+            index_prefix = bowtie2_index_prefix,
+            extra_args = bt2_cfg.get('extra_args', '')
+        log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_chromium.log')
+        benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_chromium.txt')
+        threads: config['resources']['max_threads']
+        shell:
+            """
+            bowtie2 -x {params.index_prefix} \
+                    -U {input.r2} \
+                    --threads {threads} \
+                    {params.extra_args} 2> {log} | \
+            samtools sort -o {output.bam} -@ {threads}
+            samtools index {output.bam}
+            """
+
+    rule tag_bam_with_barcodes_chromium:
+        """Attach CB and UMI tags from R1 to each read in the sorted BAM."""
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input:
+            bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam'),
+            r1 = op.join(config['base'], 'simulations', 'chromium', 'R1.fastq.gz')
+                 if config['mode'] == 'simulation'
+                 else lambda wc: config['real_data']['samples'][wc.sample]['R1']
+        output:
+            tagged_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam')
+        params:
+            cb_len = config.get('simulation', config.get('real_data', {})).get('cb_length', 16),
+            umi_len = config.get('simulation', config.get('real_data', {})).get('umi_length', 12)
+        log: op.join(config['base'], 'logs', 'tag_bam_{feature_set}_chromium.log')
+        benchmark: op.join(config['base'], 'benchmarks', 'tag_bam_{feature_set}_chromium.txt')
+        shell:
+            """
+            umi_tools extract \
+                --bc-pattern={'C' * params.cb_len + 'N' * params.umi_len} \
+                --stdin {input.r1} \
+                --read2-in {input.bam} \
+                --read2-out {output.tagged_bam} \
+                --log {log}
+            """
+
+    rule umi_dedup_chromium:
+        """Deduplicate UMIs per cell per feature locus."""
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input: op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam')
+        output:
+            dedup_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam')
+        log: op.join(config['base'], 'logs', 'umi_dedup_{feature_set}_chromium.log')
+        benchmark: op.join(config['base'], 'benchmarks', 'umi_dedup_{feature_set}_chromium.txt')
+        shell:
+            """
+            umi_tools dedup \
+                --per-cell \
+                --stdin {input} \
+                --stdout {output.dedup_bam} \
+                --log {log}
+            """
+
+    rule featurecounts_chromium_feature_chunk:
+        """featureCounts on the full chromium BAM, one feature chunk at a time."""
+        conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml')
+        input:
+            bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam'),
+            gtf = gtf_for_feature_set
+        output:
+            counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium',
+                                  'chunks', 'feat{feature_chunk_id}.tsv'))
+        params:
+            tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium',
+                                 'chunks', 'feat{feature_chunk_id}.raw'),
+            tmp_gtf = op.join(config['base'], 'tmp',
+                              '{feature_set}_feat_chunk_{feature_chunk_id}.gtf')
+        log: op.join(config['base'], 'logs',
+                     'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.log')
+        benchmark: op.join(config['base'], 'benchmarks',
+                           'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.txt')
+        threads: config['resources']['max_threads']
+        run:
+            gtf_paths = expand(
+                op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'),
+                feature_set=wildcards.feature_set,
+                fid=feature_chunk_ids)
+            if not all(op.exists(p) for p in gtf_paths):
+                split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths)
+            shell(
+                f"featureCounts "
+                f"  -T {{threads}} "
+                f"  -t exon "
+                f"  -g gene_id "
+                f"  -a {{params.tmp_gtf}} "
+                f"  -o {{params.tmp_counts}} "
+                f"  -M --fraction "
+                f"  --byReadGroup "
+                f"  {{input.bam}} 2> {{log}} && "
+                f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}"
+            )
+
+    rule merge_featurecounts_chromium:
+        """Concatenate feature-chunk count tables from the chromium run."""
+        input:
+            chunks = expand(
+                op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium',
+                        'chunks', 'feat{feature_chunk_id}.tsv'),
+                allow_missing=True,
+                feature_chunk_id=feature_chunk_ids)
+        output:
+            counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'counts.tsv')
+        run:
+            header = None
+            rows = {}
+            for path in input.chunks:
+                with open(path) as fh:
+                    h = fh.readline().rstrip('\n')
+                    if header is None:
+                        header = h
+                    for line in fh:
+                        parts = line.rstrip('\n').split('\t')
+                        rows[parts[0]] = parts[1:]
+            with open(output.counts, 'w') as out:
+                out.write(header + '\n')
+                for feat, vals in rows.items():
+                    out.write(feat + '\t' + '\t'.join(vals) + '\n')
diff --git a/workflow/modules/normalize.snmk b/workflow/modules/normalize.snmk
index 779b8e7..dc6fb2e 100644
--- a/workflow/modules/normalize.snmk
+++ b/workflow/modules/normalize.snmk
@@ -74,6 +74,7 @@ rule normalize_starsolo:
         python {workflow.basedir}/scripts/normalize_starsolo.py \
             --raw-dir {input.raw_matrix_dir} \
             {params.locus_map_arg} \
+            --multimapper-mode {wildcards.multimapper_mode} \
             --granularity {wildcards.granularity} \
             --output {output.counts_tsv} 2> {log}
         """
diff --git a/workflow/scripts/count_starsolo_locus.py b/workflow/scripts/count_starsolo_locus.py
index 7296715..02486e0 100644
--- a/workflow/scripts/count_starsolo_locus.py
+++ b/workflow/scripts/count_starsolo_locus.py
@@ -17,17 +17,17 @@
 count matrices produced by bowtie2/kallisto/alevin.
 
 Modes:
-  smartseq2 - CB:Z tag is the cell_id (set by STAR from the manifest). No UMI.
-              All reads per (CB, locus) are counted directly.
+  smartseq2 - Cell ID is the prefix of the read name before _r{n}_,
+              set by simulate_reads.py. All reads per (cell, locus) are counted.
+              No CB:Z tag is present in SmartSeq2 STARsolo BAMs.
   chromium  - CB:Z = cell barcode, UB:Z = UMI. UMI deduplication is performed
               per (CB, locus) by counting distinct UMIs per cell.
 
 Scalability:
-  The input BAM is first sorted by CB tag (samtools sort -t CB) into a
-  temporary file. Reads are then streamed one cell at a time, so peak memory
-  is O(expressed_loci_per_cell x umis_per_locus) rather than O(all cells x all
-  loci x all UMIs). The outer accumulator (cb -> locus -> count) is sparse and
-  grows only with expressed (cell, locus) pairs.
+  Chromium: BAM is sorted by CB tag into a temporary file. Reads are streamed
+  one cell at a time, so peak memory is O(loci_per_cell x umis_per_locus).
+  SmartSeq2: BAM is read directly (sorted by coordinate). All cells are
+  accumulated simultaneously; with ~20 cells this is negligible memory.
 
 Output: feature x cell TSV (rows=locus_ids, cols=cell barcodes, first col=feature_id).
 """
@@ -35,6 +35,7 @@
 import argparse
 import bisect
 import os
+import re
 import subprocess
 import sys
 import tempfile
@@ -48,16 +49,15 @@ def parse_locus_id(locus_id):
     start_0 is 0-based, end_0 is 0-based exclusive (= GTF end field value).
     """
     try:
-        gene_part, coords = locus_id.split('::', 1)
-        # coords: chrom:start-end(strand)
-        colon_idx = coords.rfind(':')
+        gene_part, coords = locus_id.split("::", 1)
+        colon_idx = coords.rfind(":")
         chrom = coords[:colon_idx]
-        rest = coords[colon_idx + 1:]  # start-end(strand)
-        dash_idx = rest.index('-')
+        rest = coords[colon_idx + 1:]
+        dash_idx = rest.index("-")
         start_0 = int(rest[:dash_idx])
-        paren_idx = rest.index('(')
+        paren_idx = rest.index("(")
         end_0 = int(rest[dash_idx + 1:paren_idx])
-        strand = rest[paren_idx + 1:rest.index(')')]
+        strand = rest[paren_idx + 1:rest.index(")")]
         return chrom, start_0, end_0, strand
     except (ValueError, IndexError):
         return None
@@ -79,12 +79,12 @@ def load_intervals(fasta_path):
     chrom_intervals = defaultdict(list)
     with open(fasta_path) as fh:
         for line in fh:
-            if not line.startswith('>'):
+            if not line.startswith(">"):
                 continue
-            header = line[1:].rstrip('\n').split()[0]
-            if '::' not in header:
+            header = line[1:].rstrip("\n").split()[0]
+            if "::" not in header:
                 continue
-            locus_id = header.split('::', 1)[0]
+            locus_id = header.split("::", 1)[0]
             coords = parse_locus_id(header)
             if coords is None:
                 continue
@@ -127,52 +127,86 @@ def find_locus(chrom, pos, chrom_starts, chrom_intervals):
 
 def sort_bam_by_cb(bam_path, n_threads):
     """Sort BAM by CB tag into a tempfile. Returns tempfile path (caller must unlink)."""
-    tmpf = tempfile.NamedTemporaryFile(suffix='.bam', delete=False)
+    tmpf = tempfile.NamedTemporaryFile(suffix=".bam", delete=False)
     sorted_path = tmpf.name
     tmpf.close()
-    print(f'Sorting BAM by CB tag -> {sorted_path}', file=sys.stderr)
+    print(f"Sorting BAM by CB tag -> {sorted_path}", file=sys.stderr)
     result = subprocess.run(
-        ['samtools', 'sort', '-t', 'CB', '-@', str(n_threads), '-o', sorted_path, bam_path],
+        ["samtools", "sort", "-t", "CB", "-@", str(n_threads), "-o", sorted_path, bam_path],
         stderr=subprocess.PIPE
     )
     if result.returncode != 0:
         os.unlink(sorted_path)
         sys.stderr.write(result.stderr.decode())
-        sys.exit(f'samtools sort failed (exit {result.returncode})')
+        sys.exit(f"samtools sort failed (exit {result.returncode})")
     return sorted_path
 
 
-def emit_cell(cb, cell_data, mode, counts):
-    """Merge one cell's accumulated data into the sparse counts dict."""
-    if mode == 'chromium':
-        for locus_id, umis in cell_data.items():
-            counts[cb][locus_id] = len(umis)
-    else:
-        for locus_id, n in cell_data.items():
-            counts[cb][locus_id] = n
+def process_smartseq2(bam_path, chrom_starts, chrom_intervals, multimapper_mode):
+    """
+    Process SmartSeq2 BAM: cell_id is the prefix of QNAME before _r{n}_.
+    Reads are sorted by coordinate (interleaved across cells) so we accumulate
+    all cells simultaneously.
+    """
+    counts = defaultdict(lambda: defaultdict(int))
+    all_cbs = set()
+    all_loci = set()
+    n_reads = 0
+    n_assigned = 0
 
+    proc = subprocess.Popen(
+        ["samtools", "view", bam_path],
+        stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    try:
+        for raw in proc.stdout:
+            line = raw.decode()
+            if line.startswith("@"):
+                continue
+            fields = line.split("\t")
+            if len(fields) < 11:
+                continue
+            flag = int(fields[1])
+            if flag & 4 or flag & 2048:
+                continue
+            n_reads += 1
 
-def main():
-    ap = argparse.ArgumentParser(description=__doc__)
-    ap.add_argument('--bam', required=True,
-                    help='STARsolo Aligned.sortedByCoord.out.bam')
-    ap.add_argument('--fasta', required=True,
-                    help='Feature FASTA with headers transcript_id::chrom:start-end(strand)')
-    ap.add_argument('--mode', required=True, choices=['smartseq2', 'chromium'])
-    ap.add_argument('--multimapper-mode', default='unique',
-                    choices=['unique', 'multi'],
-                    help='unique: NH=1 reads only; multi: all aligned reads')
-    ap.add_argument('--threads', type=int, default=1,
-                    help='Threads for samtools sort (default: 1)')
-    ap.add_argument('--output', required=True)
-    args = ap.parse_args()
+            nh = None
+            for tag in fields[11:]:
+                if tag.startswith("NH:i:"):
+                    nh = int(tag[5:].rstrip("\n"))
+                    break
+            if multimapper_mode == "unique" and nh is not None and nh > 1:
+                continue
 
-    print(f'Loading intervals from {args.fasta}', file=sys.stderr)
-    chrom_starts, chrom_intervals = load_intervals(args.fasta)
-    n_loci = sum(len(v) for v in chrom_intervals.values())
-    print(f'  {n_loci} intervals on {len(chrom_intervals)} chromosomes', file=sys.stderr)
+            # cell_id is everything before _r{digits}_ in the QNAME
+            cb = re.sub(r"_r\d+_.*$", "", fields[0])
 
-    sorted_bam = sort_bam_by_cb(args.bam, args.threads)
+            chrom = fields[2]
+            pos_0 = int(fields[3]) - 1
+            locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals)
+            if locus_id is None:
+                continue
+
+            counts[cb][locus_id] += 1
+            all_cbs.add(cb)
+            all_loci.add(locus_id)
+            n_assigned += 1
+    finally:
+        proc.kill()
+        proc.wait()
+
+    print(f"  {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr)
+    print(f"  {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr)
+    return dict(counts), all_cbs, all_loci
+
+
+def process_chromium(bam_path, chrom_starts, chrom_intervals, multimapper_mode, n_threads):
+    """
+    Process Chromium BAM: sort by CB tag, stream one cell at a time to cap memory.
+    UMI deduplication per (CB, locus).
+    """
+    sorted_bam = sort_bam_by_cb(bam_path, n_threads)
 
     counts = defaultdict(lambda: defaultdict(int))
     all_cbs = set()
@@ -184,83 +218,108 @@ def main():
     cell_data = {}
 
     proc = subprocess.Popen(
-        ['samtools', 'view', sorted_bam],
+        ["samtools", "view", sorted_bam],
         stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
     try:
         for raw in proc.stdout:
             line = raw.decode()
-            if line.startswith('@'):
+            if line.startswith("@"):
                 continue
-            fields = line.split('\t')
+            fields = line.split("\t")
             if len(fields) < 11:
                 continue
             flag = int(fields[1])
             if flag & 4 or flag & 2048:
                 continue
             n_reads += 1
-            chrom = fields[2]
-            pos_0 = int(fields[3]) - 1
 
             cb = None
             ub = None
             nh = None
             for tag in fields[11:]:
-                if tag.startswith('CB:Z:'):
-                    cb = tag[5:].rstrip('\n')
-                elif tag.startswith('UB:Z:'):
-                    ub = tag[5:].rstrip('\n')
-                elif tag.startswith('NH:i:'):
-                    nh = int(tag[5:].rstrip('\n'))
+                if tag.startswith("CB:Z:"):
+                    cb = tag[5:].rstrip("\n")
+                elif tag.startswith("UB:Z:"):
+                    ub = tag[5:].rstrip("\n")
+                elif tag.startswith("NH:i:"):
+                    nh = int(tag[5:].rstrip("\n"))
 
             if cb is None:
                 continue
-            if args.multimapper_mode == 'unique' and nh is not None and nh > 1:
+            if multimapper_mode == "unique" and nh is not None and nh > 1:
                 continue
 
-            locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals)
+            locus_id = find_locus(fields[2], int(fields[3]) - 1, chrom_starts, chrom_intervals)
             if locus_id is None:
                 continue
 
             if cb != current_cb:
                 if current_cb is not None:
-                    emit_cell(current_cb, cell_data, args.mode, counts)
+                    for lid, umis in cell_data.items():
+                        counts[current_cb][lid] = len(umis)
                     all_cbs.add(current_cb)
                 current_cb = cb
-                cell_data = defaultdict(set) if args.mode == 'chromium' else defaultdict(int)
+                cell_data = defaultdict(set)
 
+            cell_data[locus_id].add(ub if ub else str(n_reads))
             all_loci.add(locus_id)
             n_assigned += 1
-
-            if args.mode == 'chromium':
-                cell_data[locus_id].add(ub if ub else str(n_reads))
-            else:
-                cell_data[locus_id] += 1
     finally:
         proc.kill()
         proc.wait()
 
     if current_cb is not None:
-        emit_cell(current_cb, cell_data, args.mode, counts)
+        for lid, umis in cell_data.items():
+            counts[current_cb][lid] = len(umis)
         all_cbs.add(current_cb)
 
     os.unlink(sorted_bam)
 
-    print(f'  {n_reads} reads processed, {n_assigned} assigned to loci', file=sys.stderr)
-    print(f'  {len(all_cbs)} cells, {len(all_loci)} loci', file=sys.stderr)
+    print(f"  {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr)
+    print(f"  {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr)
+    return dict(counts), all_cbs, all_loci
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--bam", required=True,
+                    help="STARsolo Aligned.sortedByCoord.out.bam")
+    ap.add_argument("--fasta", required=True,
+                    help="Feature FASTA with headers transcript_id::chrom:start-end(strand)")
+    ap.add_argument("--mode", required=True, choices=["smartseq2", "chromium"])
+    ap.add_argument("--multimapper-mode", default="unique",
+                    choices=["unique", "multi"],
+                    help="unique: NH=1 reads only; multi: all aligned reads")
+    ap.add_argument("--threads", type=int, default=1,
+                    help="Threads for samtools sort (Chromium only; default: 1)")
+    ap.add_argument("--output", required=True)
+    args = ap.parse_args()
+
+    print(f"Loading intervals from {args.fasta}", file=sys.stderr)
+    chrom_starts, chrom_intervals = load_intervals(args.fasta)
+    n_loci = sum(len(v) for v in chrom_intervals.values())
+    print(f"  {n_loci} intervals on {len(chrom_intervals)} chromosomes", file=sys.stderr)
+
+    if args.mode == "smartseq2":
+        counts, all_cbs, all_loci = process_smartseq2(
+            args.bam, chrom_starts, chrom_intervals, args.multimapper_mode)
+    else:
+        counts, all_cbs, all_loci = process_chromium(
+            args.bam, chrom_starts, chrom_intervals, args.multimapper_mode, args.threads)
 
     sorted_loci = sorted(all_loci)
     sorted_cbs = sorted(all_cbs)
 
-    with open(args.output, 'w') as fh:
-        fh.write('feature_id\t' + '\t'.join(sorted_cbs) + '\n')
+    with open(args.output, "w") as fh:
+        fh.write("feature_id\t" + "\t".join(sorted_cbs) + "\n")
         for locus_id in sorted_loci:
-            row = [str(counts[cb].get(locus_id, 0)) for cb in sorted_cbs]
-            fh.write(locus_id + '\t' + '\t'.join(row) + '\n')
+            row = [str(counts.get(cb, {}).get(locus_id, 0)) for cb in sorted_cbs]
+            fh.write(locus_id + "\t" + "\t".join(row) + "\n")
 
-    print(f'wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}',
+    print(f"wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}",
           file=sys.stderr)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd
index d0a0e4e..f8ca865 100644
--- a/workflow/scripts/evaluation_report.Rmd
+++ b/workflow/scripts/evaluation_report.Rmd
@@ -99,10 +99,10 @@ All metrics compare the aligner count matrix against the simulation ground truth
 - **Specificity**: fraction of truly unexpressed (feature, cell) pairs correctly assigned zero by the aligner.
 
 
-Rows = aligner (+ multimapper mode), columns = metric. Each bar is a
-feature_set x granularity combination.
+Rows = metric, columns = feature_set. Each point is one aligner at one granularity;
+x-axis = granularity, colour = aligner (+ multimapper mode).
 
-```{r global-accuracy, fig.height=14, fig.width=18}
+```{r global-accuracy, fig.height=18, fig.width=14}
 acc_metrics <- c("pearson_r", "spearman_r", "f1",
                  "precision", "recall", "jaccard", "specificity")
 
@@ -110,25 +110,23 @@ acc_long <- global %>%
   select(aligner_mode, feature_set, gran_label, all_of(acc_metrics)) %>%
   pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>%
   filter(!is.na(value)) %>%
-  mutate(
-    metric = factor(metric, levels = acc_metrics),
-    fs_gran = paste0(feature_set, "\n", gran_label)
-  )
-
-n_fg <- length(unique(acc_long$fs_gran))
-pal  <- make_palette(n_fg)
-
-ggplot(acc_long, aes(x = fs_gran, y = value, fill = fs_gran)) +
-  geom_col(position = "dodge", width = 0.8) +
-  facet_grid(aligner_mode ~ metric, scales = "free_x") +
-  scale_fill_manual(values = pal, name = "feature_set / granularity") +
-  scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.5, 1)) +
-  labs(x = NULL, y = NULL,
+  mutate(metric = factor(metric, levels = acc_metrics))
+
+n_al <- length(unique(acc_long$aligner_mode))
+pal  <- make_palette(n_al)
+
+ggplot(acc_long, aes(x = gran_label, y = value,
+                     colour = aligner_mode, group = aligner_mode)) +
+  geom_point(size = 2.5, alpha = 0.85) +
+  geom_line(linetype = "dashed", linewidth = 0.5) +
+  facet_grid(metric ~ feature_set, scales = "free_y") +
+  scale_colour_manual(values = pal, name = "aligner (mode)") +
+  labs(x = "granularity", y = NULL,
        title = "global accuracy metrics",
-       subtitle = "rows = aligner, columns = metric") +
+       subtitle = "rows = metric, columns = feature_set; x = granularity; colour = aligner") +
   theme_bw(base_size = 10) +
   theme(aspect.ratio = 1,
-        axis.text.x  = element_text(angle = 40, hjust = 1, size = 7),
+        axis.text.x  = element_text(angle = 40, hjust = 1, size = 8),
         strip.text.y = element_text(size = 8),
         strip.text.x = element_text(size = 8),
         legend.position = "bottom",
@@ -194,6 +192,15 @@ ggplot(corr, aes(x = gran_label, y = r,
 
 ## log1p RMSE
 
+log1p RMSE is computed over all (feature, cell) pairs in the feature universe.
+At locus level there are thousands of features, most with zero counts in both
+truth and observed; the dominance of (0, 0) pairs suppresses the average squared
+error so RMSE looks low. At class_id level there are only ~10-20 features, each
+accumulating large aggregated counts, so per-entry log1p differences are larger
+and RMSE rises. The gradient locus < gene_id < family_id < class_id is therefore
+expected and not a mistake - lower locus RMSE reflects sparsity, not better
+quantification accuracy.
+
 ```{r rmse, fig.height=5}
 if ("log1p_rmse" %in% names(global) && any(!is.na(global$log1p_rmse))) {
   n_al2 <- length(unique(global$aligner_mode))
@@ -275,6 +282,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa
     geom_line(linewidth = 0.5, linetype = "dashed") +
     geom_point(size = 2, alpha = 0.85) +
     facet_grid(gran_label ~ feature_set) +
+    scale_y_sqrt() +
     scale_colour_manual(values = pal_pf, name = "Aligner (mode)") +
     labs(x = "repeat class", y = "F1",
          title = "F1 per repeat class",
@@ -290,6 +298,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa
     geom_line(linewidth = 0.5, linetype = "dashed") +
     geom_point(size = 2, alpha = 0.85) +
     facet_grid(gran_label ~ feature_set) +
+    scale_y_sqrt() +
     scale_colour_manual(values = pal_pf, name = "Aligner (mode)") +
     labs(x = "repeat class", y = "pearson r",
          title = "Pearson r per repeat class") +
@@ -307,7 +316,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa
 
 ## compute resources
 
-```{r resources, fig.height=10, fig.width=14}
+```{r resources, fig.height=14, fig.width=18}
 res_cols <- intersect(c("wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb"), names(global))
 if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) {
   n_al5 <- length(unique(global$aligner_mode))
@@ -322,7 +331,7 @@ if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) {
                    colour = aligner_mode, group = aligner_mode)) +
     geom_line(linetype = "dashed", linewidth = 0.6) +
     geom_point(size = 2) +
-    facet_grid(resource ~ feature_set, scales = "free_y") +
+    facet_grid(feature_set ~ resource, scales = "free_y") +
     scale_colour_manual(values = pal_res, name = "Aligner (mode)") +
     labs(x = "granularity", y = NULL, title = "compute resource usage") +
     theme_bw(base_size = 10) +
diff --git a/workflow/scripts/normalize_starsolo.py b/workflow/scripts/normalize_starsolo.py
index 3bba451..5091baf 100644
--- a/workflow/scripts/normalize_starsolo.py
+++ b/workflow/scripts/normalize_starsolo.py
@@ -27,11 +27,11 @@ def read_lines(path):
         return [line.rstrip('\n') for line in fh]
 
 
-def load_starsolo_raw(raw_dir):
+def load_starsolo_raw(raw_dir, mtx_name='matrix.mtx'):
     barcodes = read_lines(os.path.join(raw_dir, 'barcodes.tsv'))
     feature_lines = read_lines(os.path.join(raw_dir, 'features.tsv'))
     feature_ids = [line.split('\t')[0] for line in feature_lines]
-    mat = scipy.io.mmread(os.path.join(raw_dir, 'matrix.mtx')).tocsc()
+    mat = scipy.io.mmread(os.path.join(raw_dir, mtx_name)).tocsc()
     return barcodes, feature_ids, mat
 
 
@@ -94,11 +94,16 @@ def main():
     ap.add_argument('--output', required=True, help='Output TSV path')
     ap.add_argument('--locus-map', default=None,
                     help='TSV: transcript_id, gene_id, family_id, class_id (no header)')
+    ap.add_argument('--multimapper-mode', default='unique',
+                    choices=['unique', 'multi'])
     ap.add_argument('--granularity', default='gene_id',
                     choices=['gene_id', 'family_id', 'class_id'])
     args = ap.parse_args()
 
-    barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir)
+    em_mtx = os.path.join(args.raw_dir, 'UniqueAndMult-EM.mtx')
+    mtx_name = 'UniqueAndMult-EM.mtx' if args.multimapper_mode == 'multi' and os.path.exists(em_mtx) else 'matrix.mtx'
+    print('Using matrix: {}'.format(mtx_name), file=sys.stderr)
+    barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir, mtx_name)
     print('{} features, {} barcodes'.format(len(feature_ids), len(barcodes)), file=sys.stderr)
 
     gene_to_group = None

From 9ec3e545d93d9153a538305e740d3cd2464a4d3b Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:05:53 +0100
Subject: [PATCH 03/12] Fix starsolo simuls CB matching

---
 workflow/modules/starsolo.snmk         | 2 ++
 workflow/scripts/evaluation_report.Rmd | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk
index 78df478..da16717 100644
--- a/workflow/modules/starsolo.snmk
+++ b/workflow/modules/starsolo.snmk
@@ -97,6 +97,7 @@ rule starsolo_smartseq2:
              --soloCellReadStats None \
              --soloUMIdedup NoDedup \
              --soloMultiMappers {params.solo_multimappers} \
+             --outSAMattributes NH HI AS NM CB UB \
              --outSAMtype BAM SortedByCoordinate \
              --limitBAMsortRAM {params.max_ram_bytes} \
              {params.extra_args} 2> {log}
@@ -163,6 +164,7 @@ rule starsolo_chromium:
              --soloUMIstart {params.umi_start} \
              --soloUMIlen {params.umi_len} \
              --soloMultiMappers {params.solo_multimappers} \
+             --outSAMattributes NH HI AS NM CB UB \
              --outSAMtype BAM SortedByCoordinate \
              --limitBAMsortRAM {params.max_ram_bytes} \
              {params.extra_args} 2> {log}
diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd
index f8ca865..08c9f43 100644
--- a/workflow/scripts/evaluation_report.Rmd
+++ b/workflow/scripts/evaluation_report.Rmd
@@ -143,9 +143,8 @@ if (nrow(det) > 0) {
   pal_pr <- make_palette(n_lab)
 
   ggplot(det, aes(x = recall, y = precision,
-                  colour = aligner_mode, shape = feature_set, size = f1)) +
+                  colour = aligner_mode, shape = feature_set)) +
     geom_point(alpha = 0.85) +
-    scale_size_continuous(range = c(2, 7), name = "F1") +
     scale_colour_manual(values = pal_pr, name = "Aligner (mode)") +
     scale_shape_manual(values = c(16, 17, 15, 3, 7, 8)[seq_len(
                          length(unique(det$feature_set)))],
@@ -153,7 +152,7 @@ if (nrow(det) > 0) {
     facet_wrap(~gran_label, nrow = 1) +
     xlim(0, 1) + ylim(0, 1) +
     geom_abline(slope = 1, intercept = 0, linetype = "dotted", colour = "grey60") +
-    labs(title = "precision vs recall (size = F1)",
+    labs(title = "precision vs recall",
          subtitle = "facets = granularity",
          x = "Recall", y = "Precision") +
     theme_bw(base_size = 11) +

From 932ed46fb5256f43d55c2b1f182882f698c9e907 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:14:44 +0100
Subject: [PATCH 04/12] Testing, documentation

---
 .coveragerc                                   |  23 ++
 .github/workflows/tests.yml                   |   8 +-
 pytest.ini                                    |   6 +
 ...test_evaluate.cpython-312-pytest-7.4.4.pyc | Bin 55247 -> 0 bytes
 ...imulate_reads.cpython-312-pytest-7.4.4.pyc | Bin 45973 -> 0 bytes
 test/unit/test_evaluate.py                    | 200 ++++++++++++++++++
 test/unit/test_simulate_reads.py              | 133 ++++++++++++
 workflow/scripts/evaluation_report.Rmd        |   4 +-
 8 files changed, 370 insertions(+), 4 deletions(-)
 create mode 100644 .coveragerc
 create mode 100644 pytest.ini
 delete mode 100644 test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc
 delete mode 100644 test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..4c196c9
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,23 @@
+[run]
+source = workflow/scripts
+
+# Scripts requiring subprocess calls to samtools/BAM files or real aligner
+# output matrices cannot be meaningfully unit-tested without bioinformatics
+# infrastructure. Omit them so the coverage threshold applies only to
+# logic that can be exercised with synthetic data.
+omit =
+    workflow/scripts/count_pseudo_genome.py
+    workflow/scripts/count_pseudo_genome_chromium.py
+    workflow/scripts/normalize_starsolo.py
+    workflow/scripts/normalize_alevin_chromium.py
+    workflow/scripts/normalize_alevin_smartseq2.py
+    workflow/scripts/normalize_kallisto_chromium.py
+    workflow/scripts/normalize_kallisto_smartseq2_granular.py
+    workflow/scripts/merge_featurecounts.py
+    workflow/scripts/tag_bam_chromium.py
+    workflow/scripts/split_gtf_chunks.py
+
+[report]
+exclude_lines =
+    pragma: no cover
+    if __name__ == .__main__.:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ea12b79..656c49a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -47,8 +47,10 @@ jobs:
           micromamba-version: latest
           environment-name: snakemake
           create-args: >-
+            -c bioconda
+            -c conda-forge
             python=3.11
-            snakemake>=8
+            "snakemake>=8"
           init-shell: bash
 
       - name: Dry-run main Snakefile with SmartSeq2 config
@@ -101,8 +103,10 @@ jobs:
           micromamba-version: latest
           environment-name: snakemake
           create-args: >-
+            -c bioconda
+            -c conda-forge
             python=3.11
-            snakemake>=8
+            "snakemake>=8"
           init-shell: bash
 
       - name: Run negative control workflow
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..9cf6b9f
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths = test
+addopts = -v --tb=short
+markers =
+    slow: marks tests as slow (requires bioinformatics tools or large data)
+    workflow: marks tests that require snakemake
diff --git a/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc
deleted file mode 100644
index 69d220d59aa6fedcf637b8ab9027dabd6e933a0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 55247
zcmeHwdvF}bncvRt?0Xk)kRU}-E55|#10(=GL|T*#k<`PMU`v$sprzYd?kvEC7Fe)n
z!3Vs1fqc##70VIryNjSZxek_{GI3HlOj4<0xe|wRa*ivGD}+D^%$NzCvr{ykJC{Vd
zPIt;5m*3a(>Rk+X7Z3?j76#aFXJ)&5dS<$R-?O`a5{-sj*naLhlL>aXT)&_LbK}3@
z;OeO4a=qeGT>D*$q`0%vemDR249F?B<Qnkq_euIM{{4RbOJIM1e+Tym`FCi4h<}Io
zhxvD8e}sQW_ec46-Tr!IyW;tZw7)@-zv9~8sCYq}6d!1_;s<R}0-&u*5Ok3e0$r?x
zLEDrF=n^Fgx>TtHy-leHU8XdE-mWx)E?1gB?@*dSV@eC?ok}a{3S|-KN@X$VDy0o{
zwXy_sjj|N<F6B1RwaPN(c4fJ8hZ0lnR93v^!|_Vvc$IOyT3MsqrL0xr{CJ&}emx!c
z$@}BV29&f;*$BE`X$Re)Yy#b=YzA#tIzTrmouHeQF3=8T3uvdZ6|_s)2D;^#JAQW(
z&)OCG!eAyJ%cs@68tcum*pVE2zBijYvPn&`%urt4ls=ry4kz>J_Mu~;(C*yfG*e^0
zfA{9qv0QH~e=r>z8fNK@y~Bg4d?q&-%jWtrshzQbbe?5Wu~cqgXgHswU(^jTC7Dkm
zK~l*K_H76y2Nj<Ga5|OGv5m=Wrf+Z{J(!OjOlOC1k@nc0Tw09{=JGL?J~W(R=~!~`
zSS)=spJs!}Z0tZLml;Hj2T*)U4drvWtQy-G%ME6a#ne=0=vZuRGRx9Q<yb5;$ggDF
z2~{(d8&s09^x)xmd+0JX&z_WQwo-ii@bTbt73~lWwcsj9BVm`TC>K1V%kns<Zv`~l
zCD`q!v}M~H#_=0Q>Y^v_HIo!%#jSYWl;1G&ZFLpBdB6E<f56<czYE@iNAbStGjkMu
z1z$gERKA&4m0b8eTK=ocRdAd2K?mc1Z@BeT$RY1@t#oy`ROtvB#xpJne{}As=NZ=#
zDemuKlvI<BXs)=V`3KWSvYElOCI#beE!3MyXO+R^Kw8y;N;;bv$lx`_{hFN3sCi9(
zK7CBp<lIntQ1hhJ!&+!hCY67RQ6epTguV~rP2nKO4_NMqip+uhz))f+nLnt>RJ113
zc}9;%lZG^R??FZ{O{FIn)3*<|ZaSD7NN>svB(vGvU~-enQkz(MD4j%0rbfPL7|osU
zC4`1;TsDzc53|K6jb5s%f{eP(2O3Und9i-9`@FyLtbfUc)+J-B#tyxD*I#USb;F;x
zPqoI6hb{!8$CD?T#}2#`ymS2V3z7PldtU4r>pZ#cjC?M#_qcRE5PUiMV)UhY+6liH
zKC$OqV95u5S7^n10aqkC`p^eKd;dd~UhBWv*}cv4);90`E`Q2+K{zWk%r!W8#nlIt
zd|fKKg(m4z+yx1**=WxDTyf9HM*gH@$yguD4G$`bJR8m*Y{$)_M=9uC`C&E~%K!lA
zS+`I4G=D0c%_cI67RcsO!z%3s^{2>pdy@m1>@og(Dw|aK?;#YLPNYzNex#xNI&q0V
z*0_f?qTZ~T2)#y?Iso1rTZC_|aUl78Iw9s|F`Qq8k4kT3)b-bm&yIfPLb&C`(`WDK
zIJM>U(y8#Cv+|y+_|cUTJ0U$oI=*_;=)j8Jfjz>LlFY79l=D_zbThZvC8%2#y!}?U
zrK{+h;!)&&tHYXS#qKqNlZuz9c@^qRdtA>w2^jGe+<ROv<IVQDQtszsIOER;%~J68
zd>m99x_7~^_{`dw<$m4$(hAvC3>5v&buIW5nb*d-#sRY&ry9TXwCoDH6hHl$*TsE!
z_MzSqK1KhLnUH~B4af+Ld@)4*(rkXc=!W1Hv7<3+U$H0lJ=xBGH_j(`t5bjsJ7Zrm
z&t()G7|n4fa2xHbSme1zLi|XjhRlIrBy492GI~9l6yl)O6ZdI8HOKO4g+=JRm&1%E
z_k)LM9yQN2PhUD85Ayb@HJasE719Dm6;#$j`2$7@m9^5LOoy73(qu48Ri$@H_suu4
zZSx8zz_D27DBFZo>+n&(4>Ia{FXRd~p15Z+aK~u(h5FXBi|(4LUpxAc0WL2s8*4fr
z2)!J5F>qq}_^R*5zZF0E(A0{plZ{(X%ae`woD1ywS<ABHd;Z6sN2Zz|ee0pqhyG;u
z*|zw}Rg-O-rrWwE+qzD5pE@+vw&R_aoo608zUOT7qvs=aFF*9+LnodX+w=7=j&J$y
zj&JQa`NX&Hn{4eo^~7ZBj&~wE-}ky&Z~wsS3N1qA2&n$yir3ZBBN29)r#@6~NA<tC
z`Tk9wx1y`>Ul(|*b0hYDxXw#-llT4|r3gl^h4;+A#164mqi2d3<}T@jo`Qsavs!1r
zPOMh(IQ7wjY_@E@{hLRU>!rJRtM)4R6I!*~f&Jl}(UyH9&+=Ahwq+d{2a-e7jALsD
zbC6qzgT;69WO<ex)FmjuWGvQ~9!&Ec3p!&l6FPbNk*HOm)t-41W-IZWdVrkNZW*3}
zixKw6y#ka{lhhlchd3kwmeBO-IILR5vrhcH10VGZAZEL}Z!)j~?M~-p)YyL3y!iNT
ztBJhw%-5e8Q~&t)#+Av|b?-#hzwdFiw0+>AX5iCV7;h5#zRtqFvA#Rtc`M-UZt|ya
zA#0<~!5e@E#`cQK2IPK`(HY5g2B%Z=RXvMzMORVE%VwGVB(IfiMKq$KNAVP8H>93|
zryxV#GOhqvcNM(PujNqeJ0YpdZTRgsl@y^o#ar<8N{Wwi^;)b|@#ifGtG`Z2ZtR+}
z*kPC2DwN+aGiAYI5=>0lR4L`41W*ptT!ST*YhXW1vzst6heJ(f=0J@C!c`pd*wSZ}
zbRM%7Rnl2qyh=K+S)LMn%{PM9QuHe!C0y`lBz_dg6O8jsD2arUvaMjTOS8l;ON#5L
zRP^EF#>az?T=X7u#Univ-peHXU#3=fnVS4%YO$B8dtSl^+k5w2`qOvU{3_huc<+Ry
zd6Eaz3F$-ku3Zz-$QM_y9Xhr-uEuuliqUQd_CsGX_hS0VE#^7w7^gbpYI|~Mh~<uk
zifbwPwm4KTl)Q@$?vAVRP*IbZ<{LUj@-T!w53WO`lgJh#+lbt)$#m5Z>mlH!SzfOM
zO6R9LCnB0VeOL=ZvSw;-Fu~aUxF5}@Us?-&S}#74WLje)1z|h^`H_)6m}F@!f}axy
zhBMiGW>D39z$UOwtu3J*%nfH1o;1;y%^gT)6I}2QCe&nano7{>x_Nb=#O-A)$F$~z
zt^+30M~AY>L9T0pk9VUsI;#%Twe)-F(u9Mp;=y+DV260np;uPFNJr^WmlLPwx^%*N
zE<k-KwHe%CLf3RvmO$Me#z*}VkWtr#aP;MSUcBee>i_bNsql^$_KbFq9=a%Rn7&E>
z@5&q4ov4^IX#Wz8xn|IIzb=`OTQg{Tik^Ze3{_h}H{#GO7rbv;d>XPdR9g<+{(`>-
z==K%-fNnqK5}><w;1mJ*`ayR<3EpDR9Tb3XKA_uz<SN(onLw+A4CoFhVI@)s7|<Q8
z8M=dJN#?f!-F^|evF|PV0N>FbwgtTaau&%->~134iEJaXmB>yapCWP(k$XV^+|Lru
zioo4rf^-M=Y6R|$Gmv`+M{XV02x=#MY!6*^FOdg`JP1O-y@MlkM=5Z38Nl7e9zn8+
z5HCoR66|Tp`V5i%lwSbIyLIH#p|fK%dzOCZ$ZddOXUWkPl_V!FcM8JiT4FK+8p5*I
z3xgl52T7nJ>;;Uav#>5IYyi1?@KIj@xfUvJoR)9e|CGFOCNNs+%$gevj2sFrIgvn8
zeV5D-LicfyTm>0)NoBy9DbGOywB=g|=$n}X5};f;k1YWz-kNLPDwOFUXO~FN6>Ds#
zL`}UysSbBnuw+oh_nOS5L^)4}E4~SNN%<<<Y}K<2YM_w-TKqi|9vwkAmJq5yzvD1t
zkq;jf#ICI-f?&rM^K@wt5}9!dnA4t72a|(_;D;_xvYrfc!$IENCuAEk2q?OUY@G<R
zM^OVteUBXi(Y(AA_Bfr`N96NFo*+UhBZCRc#hwD0@alMUkWO%T5a45rjwL!)Y%5nv
z?<m8Cku=L$XaM1kkvWPkZkS<zX$VQd*LO^XH=06H+q8T`{?Ey6AF->G7pjkYwfZc3
zVy!aJLRZIocR8ulb_a%{{PF1OLPsCMUGW~K_Zmir!yu3`EOmu<BX*R2ItEhSZEi2?
zHt>2SdecV|L$G7!RaQV2;^pd>Kr9|Q);QKZ<{NJu?;iI}h3~p4oo8-VpE)WJP#tWm
zkF7e)*6ITvo1r?0R-Xmc0d326Yk-r^x9Tu^tB<KV__^wEBdtEsp190i4nn`@4Bj})
zIX)>9m&s=7q(ZPy(+?aFNX}=wiF}60XNhp}-LP+PzFR85voBI2;>GMaBEJXXWTjYI
z1|-i9=8g;|$km@_qsYVa>Glbe^+sOLPKCQn^teUs6AMOVyga^@@JfPTzV$ol_Lf?K
z&b(k$o;|$Gr%^fXxlDTAWl}pXF9(@DhD=1b#IJ=^vQiEtP3tZD92Lif{l{p}RsbAi
zfA&w1#0gpM<j7(d?O*`?BV;A)5)7cbUflI(%U&-`g}3}J0nCE@g13yY2oYcwOrNs{
zn0c08xOLOAO>o;Mq#Ayfi!Q`XE;oS89(&k7rEJ&2j=RdS177s!6PdxoNtS`VFp<wC
zRCrR-?5ikc03QQENJP;Qq&OAcdW$2-f(r#NPS|X8n3eZ?0Y^RtE|eVJGi;lr0abRK
zup}k1wVnfXm?qLogri0u_8grIx-n4@Im&$tU!$D#c?Cqbix6BadHvz3aJvZ?H_fpC
zpGIZh!ipMv3yRby(Py>}S4U$bqLXX3&Wm0G2kZPyFTtGr3Z5$FJ}a(y79J7Ngy>N7
z5?rd?*jUX2V*3@m*V(V&>(P0%gPk#M82SGY?O!)R9;Q88!sk-IlbC;mx{Vh^aOy2D
zXVYb7$d{3vOqeQVwz}Kcony^ohsIJ<;T2|=yCKGV_yip7_Y^?q!RP8AC}J3}t8Y$$
zXGJr%S~6p-(NQoA01hm4M%ob2FY9}-Rg*ENY(oTtn@2Dpe!+ADA$q}65Th3kRPJVq
zhet19*g;A3g6O((Xt*U#KU?!Bfvs$_pMhD!s$pMN31$xNO0%z0(Y_$uty4NHBD+dD
ztFu>0XLmK903RRiiCOR5y#K0CY(@?|F<+IM&ss&OQl1?};eRcthrB5epsrICMF>$7
z6a*&;krWhFA>%{Q6rp%{WckBp98}DRgNkjuH}?3W`}V?pxw+l6nsNu|m(j>BD<GH5
z*2aS?;?l^gMi86s`$RNbWp^k>BXtZV*npZge4>bx+rT5)5ETdKs1;m?LnF8jdq$ZZ
zaIG3AZ6_;*;g!bMb)ZZXr8`mYM}-l6wg+Ejks0hEI`=RUvg7g?jV+vR8z~8~!%gwi
zclTI!8v8bQl-;cCH1Uvb4K6hkmxOj0L2X8$M_0*jTe(5GB+z4f$<Yo+iEJ_F5O%#0
z0m9=x*f(+SFW{qo7i83RN&fuwt^NOw{P~%#;4muBS;?<-$qa}3=FCc}Fbc4kB8Gw#
zJzz0lFo--bSqyoxFl@wx8XAs*khgYnE(;z0qs4ugRkv(3Rpql8IRI#G0bm^%jV()k
z>?BK@DOHvMZKhN?oyC-@JTdF7S4n3H7mEB^fJ3{l<~S9LEtoCL2#G5Pb3LNj+cWYF
zAu6Q<0VW1MVy`j~i*YFw250261L7P)Tu^AcjUWbqZ#My4kB1}RrI;0tzVUznyrk|i
zl5Z%;WjjmJA+Z#$PeL5#xwsJgD4iw6um*8B?zbKEVyxUYmZ1XrK}z7WlVx#A9HFQW
z2Wl<H$MnEZ{up~5iAh9PNrE1A-NY#1vgqTtoq~cbf%@0FKrNxtd~-rUuAjPX{j}(+
zhKl)sRh|RFVSEQdg}L@>^(ZT7SSnNw8h%;mQx4)FMtV304L4dfau8nyv0KzETg*K6
zw~dA+VJwU66y~0t5*SMsuUcy$YUK_!u%Erz^Gi_Qp(ZnPphjX4vqPSOg`G<1y4Boj
z6*bp<Mh?_GB(wmhnmeJnQ#z~dRVmL2&7I1#(7Z}{7IG_LOQ4S^Q9+<zVzwcnq^0IL
z`@2vnELs%kO<~>@uj|nbdL*TA=fI_}{q4US|KLZb_#hU<^P1B7pO%M0TPWXQM>5tj
zhQ*O(fEDO$1Um-zL8-(t3|^tLhz%IvoaKXGh+$Dgq>rW5;cQ-)rHK#Bl-P-Ju)iQu
z$~d@WzO__f$L0Cy?eRKY3E;fse#*=ps&oa-gKE|4T7}3?(y<<zJx+TTb79}2!gvt2
zA+B%X45>sU-&S&Tx1)frdueU<n$QD4R7}l~O=HY`hOzJAQvVDe^}m9Qx_%k0fBCr=
zpF=2F7edKAQ_)qU4_uUYOn<EYr{o<orFp_sF3sNyGmbf^9%dhSZ;McDsV(T66JIO3
zIS+#5Zb|J`4MSB*q+u^!XaX*)L?3vaA7|wPIFm4rF#y9*fDeWV7J?_->c6V26<{1=
zkdI?TE@3cC?c*55F@X+JzI|T;Hdiza&CCI876!xjJMdW{hWz6ci&t$Rk;N;kdcmxe
zoC8;UzCskc$nhLx9zlAk_LgO{Kh*q|XXM}<xN^-cjya2Qw$*BGR$SC|E7oF8?bG7n
zCGt__XMTfP$1Fw&SF3^j%*2#V{nJW`w_I&RuqDR`$mLX~-)w7D(plqkAs5wgZ8)Tm
zTof{fi-l_*E*3VgXnq^Iu{p{&Xc{>2O{0JP#Q?^3KsGXbo)jse59;`R+T-KI{w3|*
z3j$N-Rbz;kF2mSnN6YGH8y(5!XtQd2c!zGv)a{R^DYMhku4&ISWugQ_<;I?I5{AYy
zOQt1Jb=V@6qn1zd*vuM{B~-5xiFc5+1kuYFzDByXCM7lL0GGZXG3}v8vX=<g*dL(1
z2Z`ABQiTqCiTp8<lSC#!xYWf*yLEPyh-94*^ytUVF7_{wE*@d0D9^tp!b4*zMqZDU
zogq&d!jmCgm5hPw+^mn06UM-GmW+Yx+@g|B%spiW(bfGW<?6nAY(SqPi+vv%PT-?%
zf%-g(0eCIn$c!Eu?Y=1YOfURbdjjvsJu~Gw()YML_ZwjQD~L2yq;Erx!?zpXQso&Z
zU~eT^j`CxB7M{-{Ms3lU{-V#;fgK%wen@EnNNIk1mVjaq_S_(s&_aa}Y_}l+zZ|R@
zid+amLi18CK|-tDeI;VEgM?<wPF1B0O9KXq`>rDoo*@g&$$?f)Mw~Uj5Z;Ru|FN`W
zCGeV;^Odj?R6>QY!B-+R^OcCnPt0$FuaFasoMwSya8B*aGpBZjTtYhw3$}ufCa+j_
zk*e*?N{MzBUT9}#tGH#`nLMX<hFn5Bi(I#MW@!mk+nJRT?JTm;&Tjd323y4(_$FC6
z>=~M(*R7pdI$hOvW~D?si!QXY`D|xg|M~%mZkp7sN%w2g?jMl2!Ttl09}xME{Kq|d
z;5uny6Vk{dWoBw9M!K?O$CuP%q}D@9_@qNzxxwi8J#<N~+?47yrE_yYCDQ{#x~X#R
z-VRObH)B5<m2@8SWne^nLRT2nIfQ^jwIF6kg4J71E1H)kW9rqbNf%^)MdU|xX&E83
zsj?YCbn`MF*RK!d-@ykKs^sJr6&+&^&n~X%%&4>(|ADEU_+(I+cQk<tr7|jw_5Uf#
zdKDk_A9Wq5;f~RVwMcAq?}fId(`_3j+cunh=5+AP)~U9KC*@X}AL;lC%(&WeQC=}U
zzyDM6iko!NmDBRP{!hs(=l@R6$tx<TbAQLxISPI`=q|cw@)LYZQ0u%<>xB6uoy_z`
z?53#J8A=`IUbR#bN3|}X_=-WObphK|Rt&)uAL8sOTnN{o*7*xzsC5C#C9o&aefrVF
zZA-YR%ATy0U{6qBYG#ntOsm#2Gy#83YMrPaJE>uod9PM=txCF(SzDM3b+s<S&4m#|
zt&7&I)<w;d%x^=j3zJHRPp}x8Q#<p_shuI0(9Y^=e#_O&2bQu{wVhch(a!1?+L_rZ
zZrOI`ol`qQE}@;(U$=H<3DH&CnUxaltbU=L-SX|sH>Y-nTtYi*xNhytaz9mVXI4tI
zvxbFsHlOVbYF%SbnGVO^g1*4c5cviXQsvkWiEy2UN8kJ>I{eQ>9wyR4#8P(Hf5G8I
zo#y3>A!wf50hLjpi<4r+enLN#M$qeyr2j@g&_pXL$!{h4kf9Li6Clo1hz4`C3<E-K
za%v`@Ih<D6+bH4NR8$0kRakf;+Hhj!@z0FzJs+++;W@r;ba$Cb^o3KqPq$9B-M63;
z&68qu3#vq&@K1@;@tRQ_+}lN7FH(=_+ifp#(Qk?BXl%w-Q9bgRsrs!*O7jf7V*w0v
ztF9h}%^dyYAuZc5%#G9|xlBFs=IhOb{SLk?^IrM<>UQ}~YJ;Wp0OswK1PEoDJ=Ov+
z@5L~!T5GUGDA%C|dFvUPzzOxJVohe|K#hEr^VojJD(Nh5R+V(NdgQm%BN{elt4E8?
z2Pl-(W}dUZ3$>_6)-#$#J1bX@>}Oi3ot3Ibp_<>a*m@k^vK?(zwVjFTQD~u^nY}`p
zhjs>WBHY9NJsRh%M2C133&mhrGd^78$CTR+@%=m<K1SqmBKwFuL8NrPRQBI-?wXRq
zwriM9xxH3jIY&)EOi8oDX(ch3!-^s#L9lmF==Z4L<=lFT^{2kNcXY27T0#=TZK4s?
zQDXSO=kkJq&Z{LAX4Fgbdqv!!qj3=?MLkyJA@H`Du_EA}o*Qgjl#4;d3xi^?5aMgh
zg}JHGiaskw5aS!+rp9O?TGNF0QSeef<q}k;>lZ0&Ay-wE$w~>%_3@jn<%XFWtC%FM
z_z-Kt`#L^rWu0M?tgqQ5S#P4M`E8gaqozp`-xPHcE=H=FCabqnc}}eqxrA1F{rt6}
z#7BLcZ;sLAR!X$e=qw_3=f@ykZEn?tR%*Un(nP40)>Xt(-<(=0atW>U`k6T`qf%8Y
zwNj#$)-SYD6VYx(EQNg8(8Enr_mZZ~Lk_sPN|W|#(gXMU>11g{f^N;4rXS9OXwrlF
zY*{>p+=v(7Ulyg{I3})IoPxO`RBRKU1p@&NaT-F+O=Bd^#vNA`(POa{s$QB%FA;8M
z`+eH`5fL({#T(dJ+MgscD`Og;wYZ<M@%ZfLX)g<+lHOlxRx?)%U2_d_84FD1Ggxhj
zRw~XWv0@ZTpdf4Y-*WMo8_~4TauSX=RTYje$*ZPs-v8V3s*5-8>bUWnUV$ljRfT|n
z@54}vWyGa}i0<Z;tX&l2eItVbJPy__|E!!EGPibl5hpG>HLhQbw-x(bm8)1O$+q;d
zi&eW}vX2#e8{skDwzaDjIKnr9?>LR(uEE-+TOl|bID;bXfHd8U@e?}74Gd*o48qyU
zc7zqz07eS#%JgPZ8B8>B0i}>DM4bTPzJ}wl8NkzUe6`_c_@xzk<!CjyD6gKL-~V&+
z>X}eLyp&>ku6}4h0WGYKiBQ}b4hdw!%}Cg<KInr_o{02*tk+@~$oYCLw1_Cm>~mxB
zcXvVNia4g>QD3!}b9agwLadJ$YB&x5kOk3EjBjxm{_zHGg*I-|hG}2loJ{5=13B!J
z4pBsQN_6s&P-~~OmNJAm1-Cw#43CGwQ{+c}dm0C!Bqvj-1udqiW+<OtOw!>~>iF}-
zqO7xip2#CS&w8OFezfOl87lyqm7k}Tl0@+;Y3Jp6a)DEGr#kfEhTG9<CftIA$w7vT
zl0U#t+*OLG*QPRglE%z=^R!4MGVh^;G$8N8I$_2ijOyc4>hJ|FG;fY44sna9)=JBH
zDv4w^(>Dn7q`B5*jh4}gfFL>kZ@Ty|LBtt$I%fo;ZqZlBA)k~fA?l;;HgEuRtWSck
zGu4|x2<$*crP&VHuaW<4eAF`_1ci-bjpL1z%`4AFRuLrDpICX~(22IQp{1j{FEqDK
zH?N;;UVn1$sjky4r+sId&!o<TrkX$d!Xu-5k8iywFP;7f{~s~?S$Sy%gp`4kBrGYw
zMo<qZaFPV4vNN?{C2*Bix59Kj*8?lDay^fZEX7LE^zqU9wl$QkYTvd}qHhOl?%N>?
zD`{oQ>x-2p@0kr&n$NQoE2B3VE2Fo<^8{96h&D&Wt?2Ju%?956FYE(BJNXX#Tl$66
zZuWB`9FP8*_I^PmPlVu!{U0KiK-8Haw6X#S<ro%<C+ztR@{)e4l2ixmX<0nhc;fEy
z-6tEz@1Bfo0Q@wy68Ma-96vPPHW@+?NI3-MPe;zEXVO#6kKFnY^lL##<6@aBL+iLh
zFW0wg1#Vk&hn0hkrw@~i&Ppu9ykLMD8ec1l5#n^LHv?*XHFpia1vTV{zrIit;!krn
zKx015QlJUnWS|M(3QrS26Y0_6grg0g*tRmJpm)>&EczN5&eDw`(<D;qY?i%;;s~`=
zn)iSUHYauyyqt|J1F(c5GqHrk8Shl{116Fzo|fnM|E|3F;#~9U7v++Sj?3Zh?C`#k
zZ3$&~s6+GV2bu@pn!NAf9@td3GMem66Vi2RQkNEbq~r75oq9Ge-`CuabZGJ;Jkxd#
z^P1F|vhW9mSPJErfi+@eRl${~_ro`9UM5LsX;$=#nabs|`++bMp7f?Qg}A9;7Qp1`
z4iuRv6eKTZdPU#j$`Mu}RI@^KjZ<8Uxl$qZM@xMb@>Epys-&|5tYH<=RYE_virA~@
zHJW!Ul;ormTK1YUm5^qZ^Wl!orw}%tLgW+u6r#pcsQZLIh5C{n(eO!p3g97)J(@S0
zOAV_Nk{0g6a!A8jtc3&%9!ae1eIh%E5N9!!dOq*ZF7DugKWM^DGvIdcHR4OOd5l-p
zXWIEYDuAp8?C(J?69noa@HXsWsrVSaH2;H7Bp(0VQ~PvnzcHIjDv3Up8^%CR9zo6S
zqHMOh{{=dHlE_ma6G28Op*xz+Vh_twr}Ifs!QZS0SpSNC6o#8_6%V#MEjXt~SkI_X
zf<HFY(SBz2K>80+zWQ4btV<DCGPd?R4^9O(jCNmWYME|YGugD}JL<`fZy!Fr{an-T
z7d{J}e#<3!-Slk!-<H>1oK1QZ&a=d)<yri{Eyrhjl_`0hZXR&tcLi=9ux0TDez(By
z&`VtN@Dw<wvt*x7B)<dF*Y{}tzBGb5GYWH|i|Pn}1+U;TSr>GBgqwcw5b+Xukbd|F
zB0f6gb_)V+=BL9Dh!bG0<EDw3fN5O41TvdZhD_zWg<>m1=8khsdriokopb1IdG&1l
z+}txgyQ@r@4kX-;wOR81VQ!mPz(0GjqIu}~y4LfFmiN)~qdn|)G)SS>U!}t)w9{+%
zdXi)GUSF}V>%NegeZ4n1kjWkc)|B-2R$MZQkNVG1dgb2!sdG&an!SB?vEy&k92$2{
z&*pzhzO#bpve0$$wy-U9-T8eAde>c!iwa$rREsKI_y186&af+X-SyXC)2S?6vgJK@
zF?uca-Wq!D&T~x<=sox1ET@!to92X>-C#?MnXJ%rM{bzXK`|Q#-IyvJSD;NWwRyQ=
z!ABIPHvcRQ3jwUeM{ylD*3@Qt6#YE(|N5BPDD2-hwdqfx&Ugy-pTwu2G?4R1;ZH)p
z|HCju)?jXJ<ZYno<J|_1MdQsQT8Kg?p33A8#ttKNLotjd8HFZ^{*qvKkL3ol$1tI@
zekx|D3NiIyZaAy(Lh`wMG8@~vAvT!H$L@};O&?8Vhn2KyWFa$rXY9zqOzL1Pqw++V
zLH(jJ46BQ`^Fp3Dh~Lsj(+o3%A#46%I+n_@bgVaXG=}NN_z7G)liHLS%%>UOjq!+I
znj#G@)!2dIKD{W`2sqR&WMtNYlUKqCt-kC&Z8gg_q%j9It4GkY$Y3@hedwb*%?!q+
zkwp|sLuNPqjA?q~Y8Vz5DK&kugUHSzRJ*5Xk1Sq{q%Kwm;-s*3UxO2r-AV#+EqEwu
z;y@bDDxFC7^|5qclAa8^9Yrsr;(v;(jk<mtaRry!#;^Lu>H3wE^(!v~!pFP6>Kl9b
z{E{^%BWK&T{?_Yh@V)R5>|h-q1YF)Vvx@nft6H~yTAs!KO!i`p=^ML&?B$^Jrl*{_
zU?e;NitfeX9-PCjngGSi0mXNN0mbwk`}xwJ*N3UlN^u5LQ9^Y0{WO-KY{RG=xfp`4
zpT;&~oMRzudC(%b=ZFEjmj(Zklb0*ZpAye8doPy~%`Y~q)!%0B+219hmtpgK|59_$
z{<Z@u5l&>Mbn>x@;Y86YX`RO8JC$jT$*)qLHG;HCI(xif-D^G`<Qc^XqlQA%2>GlN
zLO$)gb@K1knSMR<+X(sO?E~$ER!lI)8*<ME`$s%f9if*H{E<roZVoqxR34s4kP0qA
z=n79px#5donB9h)aC|)NfWVF;WDvusPwt5wV<7}uXs|DxcV1zT$0bhq7#YFzB?@@_
zPubc~X{rhhi@}NpG1?DM@<(YeNP88XBWx|Ddz{EVB2R$C>u1F^*ip(jP9#I5pUCq>
z_$V5h#ex{Uu<4<apQ@3wB=ZjLl$E%%4fv=(LGfkQf^v@XAUMiceHlAhJ1yT;<Nv$z
zT^Hr9=^Ow5uH03Dqx?NWgG-|VD11`!yav=q@Hl<@@BzXijNDK?5aDF(VWDstkE(3T
zlMRw@c9ekGg9Nq3&()ToA+Td%W#Se>gn&KD)ggijGu4yY5-#H~b{+@3wdX>y;}{%6
z(d`I<h!r6aRq6^hhp}c+5lU(>@7n&h=PxnuUqlexv0Dl7r@(!~j&Tz99sSNe1trM&
ztOEjr&jxDZvz4Dx18*yh#F+&?Yv-xhR+J`te0u*P``wy*Zx}m#h=X0G14l05vvx|S
ztBZWLNoZqEX>C4RxlEhSR!(R0*~;l`KHF^Z*%qZ$;Ip;%l3DOs-d5y-$B;u`6`qY#
z|5(J|{9?R$&m7)-+jV>M))0%55o}Ix(@Kee?KWkJfj{-iQsuTny&=Ig*z@}c_wR&1
z4JPiG-zNTG5qnpBS&yzF@;JS}K|9uiCdiT(cQf+T={h10&EUQ`evU#joHR;3G089N
zP9l?(ZUyaG(geE$hmdZ{g&`WN5gAD4S>|Y>7gNI^$U_iNenAD1loF4!O|(yPh^B{Z
zq!6)@Pv}|aAeA%<QVFl`8mjNLblJ*M3A;>nI|ibXf-J};`mQdaEVCsPvF?cjNi~yV
zaa?{UKI&s2Tm(5EXgI(0w(*Y1=GCVToslLt-#;1HeLfgHo<APB>X(DQ(eB@dTuW}#
z=M{f*->Jqoo}6miLX+t2KE7<W=1p!9owfVplDwg8edQZ*_>R2c(k!jD;vD1eRJ7dc
zqe(<}Y56!3K04$&ia-d8qzU2(%JlJfHrzTvnU4p0`{xK{0UI)^PCaMW!+|B2ObKSE
ztOjKW?zRHGLlE1129)`2h!vpBfz1}KHU16@St*f)Cy80#TLC(JoBNa0JwP1qIaA<6
zN-m;MeS`_2H(O1F!{lAGXMr<cj{-~6SqRP6((g{1;^S1Jr5f5%by)D;HZ5|iq7BE~
zs%Rr@07aMxs{^axUMd0BlIh0sESY*ft+4f|+Qay$G;NxNv-Qm<9y^{H%a1=cmYEE!
zC6x8q7`u4M^rCf>i`Ko_b@I>~+ol$EzVJCL-B7qFFQ1<4|2cX2N6G$<yj<7qa1+)>
zoswYc>N@P{1LXJ+Vh;@-h9)P!=AlT4BG!egX{-t+)$em-^<epRYj9lA4b4T5a;UmC
zFgmAKi57jZltI5M_};9uY-Kd3m!E5VfkHriUud>Y09cPTelb-s<*8Ygv!p4dv1R}i
z=RtA+Kzn8>Q>H3e7{CDtXXb!x*C^brL!Q~|s#VIf=G&=~&VpH`$&l)UO0&{Z2pWJL
zsu^Z1eZzcChV1Eq)r=MpAS;>|A79ar*3`=F05!EHy(O8Y!XE7@_^ej+KZX0db}L#$
zc@|oci5YgFh4A#8TBX&Bl*QJnfzVQ{RRe=Hw+>5dHQ%FC>%hBpD@zwzM&-A1YvId0
zwv5|qY#GbIfo@;mK-AmK-nPJjN;wb&-{poigby4c>36~-%sD~ptT8SQ^DxG3boe(!
zY~x4|9quIZ9U}J-AzMjY8u_B>&Do_7W0{R7Cpk34az{mHPV#`diwZQ>puV3jRBC6@
zeEPa%FtzZ-;Dix0$fWy=uw(d(uHix|zc40N!xC`U@*9;d0f!4R1@+Q`_Nr*FP(8mv
zgw_r>7ld1y$_)$+=hNnLa0$5j6Y4Og1<pH8NX*CKTvLIF*wb!kesh62Zq9LXC~=*_
z@+xV<I@leXud(v=YHq$Muk%ov^<q^!b2ekfwnMjgn)HibfMBfZlBMH~Cou`f`qPbP
z8h_k4+4b4U2xizgAF9Ww)w+i1y7iNF<|G^#0sq6?RNa%K4_=h-o1U-#cjfyo%AM0U
z`2U>Tc~RaveS`nc$(Xt1)6?_#KT{1O1(mB|W3sJ=y;kB@E7m$)0C&I9AXR%Vr`f7t
z*A$xl1;1KNXpTbV!m3wu6q@UAG1W_)W64R#wdOdgEOyO7Xl^is=77?uG!+7dN_BmN
zW?E{St5jTQZmvx<f_TkSzTj3O5DB2FFSHZ%TWok5zw_+`LQjh*^l(F;E#s7lH+<HI
zAK=ljcJMS|sew&&$c2#2wAVp|ct7i;y)KY=$Y#rIE1h*>$($Q+qhE<7tE9b~7Cv*1
zRDCMIa_);U#F!pcUr2zl9Vp=dJ}QN*7_1kwU!7_^-8i}aQ<IT<%*i5VvfVT4k9VKx
zpQ?L|SnvGs!UNOt4gWtUAE>~ewn5gl&4h~gHO!=?&!R<UL*ux{No`;z^onLe#ka@x
z9E`R_4|s<go(_0C)QgoTnZQ*4GPq5Ea@Zys%w9!?O}_Oz7<NkB@Ca%#Qaa(gWfZJB
zdzO_FXh3U=UauNh?)|DYuu`H1@S7Mu6tD0E9o)W{v&-}6u5Vg-mFNf_2l%(w;5fc~
zv-yOCk{kjZ3zoshR2<1nQ<Pu&zp@MHQEyBL!!d)aL|l2=h>izSKG%o-5Eo)x^5<y4
zq2sS`PV@ICQ>i3Vw6HjjCwUm4FIjW>8NqU$@DF>QPao5y0eGhOP?>u{Mxu}ElitPf
z+s@cXL+D;%c0Y=+djP58VLfnd3(6J-^q1)nq*rK-q<z4doY0FIQ2PYugkJ!$d#S8l
zL}-3zaY%m`4}DfC2FgG$-96Fns5r26ewYoam}W4Wz>?O`Au!h<ZboI@D7+sZ^;;;3
z&vrx8_;DTLET2gW6Zst)J2ZZ1vN3*^PoxzHO$Sy_23C)!PwqRl<<y$*Z<-2xX0-dN
zTWWYvI_|j?XrI0T|L=RLc#bb|sVBZw^CuEYE|o}V0h-e%pBYqnD4@0^F?5WcLA(BJ
z5yaij(nm1&Ls|=3r!>EDPJdBRdbsXPjveAVIkW=xn5qTx>7)5043l9K+tC7P1db0U
z5h~6D6c^*19<<0^QJr*{ATmINx&kW@871;ZM7~PoYeZfq@=YSI6ZsyI?-Thck++Gw
zLqsETfyl3k{D#PTM0lJe&C9@=iO>)xMq`<Hz&fJ{L+<0}4k)fAa(>0%_5|W4s0i|Y
zYJqzOa>{TveJ^_ydGPO2KLz@pN0OxX-9AbFpw1;pUvd3G=x46r&s<?nv0wiSSM;jS
zB`vyI7m%V?7I~z-(v?UAyUT+hF}JkwO2{p>UkQ1nC08PD5bSNd(!zhBJ)Fa@^h1+d
zdQ7?!l5jHMk#^EApOdaM`K6UtS^^-8f*@@HY4w#wK55OBrC#Z_D-8x|dvB?$v2}do
z>4WE+7LPx9y6b(9<o&et^M)43a~n=Sb-roQ_yea`Qo8$`lD$_S47*4DSK9p2`YSDT
z!6vV?;!1-ot-4Ydl{Q?7`K2vamPK&5JRse1Ww~GKmaepTrId8F!6$9v5BL8Ady(u;

diff --git a/test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc
deleted file mode 100644
index 56494c884e47fc964aa54404ae6c8894ff00aad0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 45973
zcmeHw3v?XUdEU(K?E8TS0gwO*5F#m&ONkGFFG<vc5J|l(ZET5lEXL}dC1y!32<$?4
z79>IM9>b5EP&H%HNo`R!Pfe*lEg2~-lhe97ZQ4k!)5=NHLx`lnj+s*DIH}E^o>Pi)
z<5*4Fe*eAm+TFqKf&@gV#$fQz?99D)X71ek|KI<(cm6CE3rkqulitkUyHb*VLO;yK
zW9)q3ktOLRNtJd>s;qj3<=q~>^^PcmI5pzi?U%)w!0rG)6Wkr-+tBV1--dUG`8KjU
z!ne`gQNE4sj;r3UNxPd=1+-c9fwriA&{j17x<Czrwy7b|b~Oyzp+-O#s!`BIY7Def
zje~ZnO`wa_X3!;S3uw363fiMC09~rKf%dBHpm(Sppv%;Spv%=opexi)&^y&G(7V*d
zpexlSpsUnw(1h9px_aD`Sb9VYe>Rsb^cFH&LF?U{XT68=?7-gP{GoN)Aj^&xv~^l`
zWNbKH$fQ^%t!isW$HU=WJoV@p%k=IY%MBK?`JC2U*q<)+s`=hrzR=4uFN|ez7FmXS
zA02ph*TA~Rc0Jm=H#?ls)`qXqTSyL?FNL;2{Cn~L!6Lk(m!tw6<I9lw%FF}Umhfsy
zvPUXPMfo+nNLorB)l>2oJw@+<RrsY8740tyK5<TxhWzH9y;Z$MWv{F%lxMH!HAS3J
zeT9&jW{5W=^OxAZX8b%TX`PPpqvfgSm;DUpJu+vGcq2J-GVaHI3YR=Z$t=B2I;*DY
zr1Labp4UvL`cL_epngj}HJ}EIzO2lD`3o)P6FRQgY96z<MZZ~+x&4Z)N{8hV{mc0G
z;9n_u$E8FlsVf7!wm+sT+aDX)byoh+^WcML<s+ZJbJghh=7bjhqOsHaU@z_Un}@Jt
z99o~y*0RjO4AU~H!TiYRaAqWvD}+m{DE-ESmI#-69~8OoOK7@BV`qK3cXWI`k$xf@
ziEJiv--k_UP0O$XzgS-%(kr9meSK%cjGB<%T74lsylr%x`dTW@^p@0MdUzP!YLwBf
zq*+Ff;&f`ySa!IO&1t%SIFA&Xz9^;b&yNkOJZWm*aDGpEIF(C}Wb{x<OYhB433_~h
z-wV3pPR8;~Uywp?9!VEcnZu*Q>0FvR>DhoDp(}{n?sNXw;QX=4`Qu*ak1gUBMbUlh
z%YOB10X%lESU%jkZhw9xvo1S=XUyl)>ohjF4mX-fgQZXntQ!O0;d`kn4x@MTI;yU}
z)PY?7P>$V+>$Kxv+Yd4!T~+$0=6HQa>Hh%Nl4hN0$uneH9?rBlqjNAV<zvRQ6i1i;
zcrq=Y#kBltU{0A<FsTOz2DWd1?6F<D7|9_<!j7{DMgoR46Cq~Fh&!?+AkLn|R^Whs
z$8<e{Sje8VmeE)O={hO1*(>g!Qs#DjN4cNfg)5fz3KE4}ls7|g-GV6h4jCdD<ylKf
zQN0i}O34rLuM~X{9tJ=t1t7=+T=)lz!4tB!ra<h7FGE!06^rUC1|k0al*`eR0)>c~
zaL7fdmx+J7e3v%vD*o*p^wKMq>KqXN0moCg<Z+1qnq}KSsZM#e_^(}_CI0J_r<g*$
zPC8$M=`65Nr@Vk!o*FzA;(#QihSYE|WWbYfVS!owj*{BUWA?TfHcK+MF7PA>h~g!f
zf`1?W{iT5YHm8BG5f=y>aYzHSVnA5?Ell5rqZJT#z6~2nfur`^`i|9x)tDg-B5GW1
zDn^Vp5^cDRM9sEgZr$1lhp^2_-JeVj3?!3B0?FjAUCG|0F8AsZmkbH{$g6XLW&5_6
zP?+`%EKG=1KxW!A4g&Q8AOkTDW(-u^w;`e3BOvZ34v@L%N`x3ON=7`A#fT6OWKBd!
zmT@S&fzwT#ZaGqhmI+-GTE0*BCX-1a63eAwA|hlY^;11y+!Lc7$_V?h4k8PQEFwaZ
zk984Q43dbkZrbl5vXn?Kh=8jG`Zd6-@vFb=+{UtBo7o*!W&`=&>qNf&>)A58#Bvag
z>cYUlVk6}D^6BJvE;Y(B&>{|IRCYh|(rs&h146j=@u`o^dO>;os<M7+j@NgT_0@=i
zI3Gt8yRdf)h+^J2fQlS)!72`PP}|sGql4P%Y|i=-83*c0TaSyhovH3MZ!mIeJD=P(
z5pro0VFv=6HzI;xIG0}mJ`r|l6QTJwAzH!zpRxmn<p@CdfG*du&*<`YU4E>Vm1bI$
zr)6{dhBMqy!?vL^2`_5jfto@n@1ChSUMH1%=C3lNN`6PVry7Tn!ozXs6;OB&;i9O6
zJLEd&J+NYVY#UC=i{En5LQmwzDoWg@<tzGN(>mj@tGc<gJVhUDTEHnq$zjv_q*;e8
zSC#rnyK@T*&A^S=?s%sz7Du~vFlS0PBxbn+8*m$1ZOn0Xw=ymJ-R*xOI2wb(wa6Wf
ze#1%=XxPzc*#s-BG^Uf0TkuLg;b@GLqmlk$tpPRdGp~?+rVnmJ!*YT>!*ViHZ{xjY
z8Q3w3fPF<2iyh+-j4#tWEaCHn!_wii?88k6=c8B4ypRamHX3qRZXmLW8)|6pUT%O{
z&uM?Tbq1bLet07nb|_~%SRc2jIGuqTg<ZtqQoOIsZqny8lJspT`?X0Ut==@02ph?=
z>1-s~+`;_t7<n++uOSyXI<&VyCZx;Zu8HkemF-iX!1Wzvd$o)q&oP%V@51uin2bT-
z1IZB?;Se6n8{&nZL@7cNIHX69qp!P35BO6cJ$#hQAwB#bGvA`$@f2<x%a$I2hVR_T
zS!a(22kP9ZB`xZtn>nuzp7I-<l6)R>!zq0wr4G80)rf1$jLx^r`IgDOaksB!##~!w
ze7<GQx6HY1nLw9K$s-T%%j7bttlGP5V0i4=Bb%3P?Jcl$PQ+X~j;GYI(LS8qn;yvy
zkDDi!ZPf!t0lHrl!13u>`N$yQQ_I_DJNu?Vr-(X3B;3Bj-c-6!VA(xm1=H<kdHsab
z&y0#A=qnJWk8)^d{WivAI{;qTP7s2ZTi9dRE02*Qc=|Y<kCi>G9uW`0pH>f95374|
z+zh8%;Bv2xx&zM!*)F;_?h|~HZomOrH?vRCxn4TV1MmzVUmtrKhg@)K567u4w?p=w
z>>x`cTR&y}GB@yCjy^rH^Q!Xj)NNWXDi2>(o}9W(>qX_stI9)Dw`skkJX8Z?pGNG^
zEn#d5GzMB8L+~rN&(uTMUXHMN9tXnyq=g*Y2)l9JU~Uk0JzZe#5O%;q*g-WkcL*EO
zBAh(ZQY9_);NUPK7a$PM%DN}3A#!BSIDb|X(H|z~7p(NNU~uf$i4a3)Um)`LKwS9c
znrZpvNLC}uI0BmsV=Skoa`{|_?L{f~QAxiE0u?{Hl-TD3Q@3fos60@Eg}i#JEM#6y
zpI6iUa}GHp$s?;xCL*v_CE#=*u&wMf=y~iJB4j6G&k`Za)Qr2nFzLBLrCdJs!dM=h
zf$c}Z58>bH@AuwTxO7Q*xVpbn7$!H%U9|f<z@1p!0z8Y7>QTL_VvZ{+VN6TO10$Vh
zjKWU5+RF~3kDD<LW;h<6j2sTAFnfK)65@5;3RX%Uj$AwN#}c>*yY+Arl)~G#Vi@av
z4IAsNfZz&ay%!_BBC4cDyNvqe2F^xmG{$Q*PVZp8Mycf*L8_=xu9UtNM~fc#2o|_i
zb`5MFxW&z`k6^i`qMEM#9*8dAjqzrFU?Gfegr40&dNw(K#1W2=pfYnu$ytxkiAf#)
zFfIa2c><`?LOp);7N6b7e(h0Q!SqbEns;Q3U&Gy)s`J9kVd&d~1sXtx0az*chQ>0v
z!Azs-bHcCt2KTf42!A}M?%eNi3JJA&qc|;oY}Sai2``WE@!ZX^;57F`L)H$ZnVQnb
zJfN~KA~P8gw0{UfNU(om=aq1DD!gJcyyEm{&ul-p;N0Ma@crZh>YBRQ^`g>sRarT8
z3+tq^vRY)2e~XI@52(;GRrvak;IQf;T9OO3MF>3ks-pTV5#n*!+Z7a0vGC{tuk9~}
zZi>-j4*$;<M?5^;UB!a2Fy#NCaWqc<Pq5(;G3}1>-8?gvaDd1mfy$R5x5#`eB}5MN
ztA_8O(^0O=9oY3}3Ka~=t$8QaIdDgz+Ij5w4K>{GojYs9<<9N5TDR_;Q2^mOcWQh0
zBgO~_zZz9z4t<uQ@A={=sm(lQZ;R9U)@d{hR2&GHcvAoL!1l+$`DgE64`c9vv0S6!
z1J`zg=<*tqDe|Gg+($14^}1L_;p-MockI}%{OO(U!X7F-;)IFyAlPYYba+fl_#C!S
zJ|==}q3m-YWnuthANW8CbA-b|N<|gjFh35muTb(sM79&*;~Y}7M<z}u3%-|+O>oi5
z6E!*UNH#a7nS7WO@g)AWb0FZuO<fa@Uyin&I6N6$dV1Albagpfe&n3z-1ZCMdx<Ug
zP2JY@lG0brnXNc3#p29%KX(K30Ppaz!=ny<TvAM5J6ZlL-s}Us2mdVkVG9ft1Blgn
zv@XC8aDb0IHw8zXlL@%^tnDXq@Vc7dy*ft&fZtLM)ZlIQ90eWq?Z!bZnNla+jJ@r3
z%Ci)iI_a#DjcUk<a|)_qHBt;3s#B<8)hT4YadT?`{jxYGW29oB<Ogq#CiS;XKKe=D
zGHB=$J3wTFhzbJ0&gX`39r8AfYit^fYK)3=y2r%mIEWUp1NKfKSx^jktQf?Ku$>UM
zh9io&j~F($gZ3;DVUL|-PvAtk;|!5rJRoZ`8>3Rfg7sVOEXG-XFO|0uq$~<+1KUK$
ziGP>5x3*{`Fa<UzD)9Nrno^;V&!<Mxx$#tv9Y<!e0c$UT5H@cotIfdF$7X$38JK7N
zhJtZZGtbluUz%tAQXQ85$zc6&ogll<ETZ!HvTj|&5mM|+;H|b4V@K&Hx5IPf|04bT
z`$WD@gd_ty2I9)t@2bn#G06}su-`!eq(oVKeG&2XZBrkc^<8CKHD9;v^RL>FzP52?
z-abEP_W43P+7Dms0fSt9qRCeL+#m%NN&9OD_jL1sL@+UntkUEbP6a8}u$oa%L25Ja
z(uB!%rDRnOC`d6((L!C4FLh>&s}}@eNz}RV$vEoUO+m61koYNu`)-12liECs3JA^Q
zCKQgAB>N_6jB~kb6e}gTT#oieiEyjN3$(|GaFk4q1W0~8!D`@dVYR`IVYPvohfm{%
z$t)a0A_zHqhcmge3P;`)nbs!))u;~2>@paL!N{UsW|yIN-oS(t?Aw&lz~%hr4^u{t
z(8p<yEWTnUiYKw>w9ssE{<zPTdEZl$d1ns8wvXt*P3!lJrt$~aDO`XQ0_{&g2%3Y1
zJI&?;XV|$77s6X+!ED|+rOf$yUg@mC@Q;Db3+p<$TyM^<4%-c%Q4OwH8uMMxHS6bd
zu#uhIIdSCd**`2xMO`*vr6fCfo$hVfHDF&5tl!7&3$r(2qnM--wghz!orsosVotAO
zJwT;3p*Dvi2$l%w%AUMB?y%0fu!8Ct-Zs?i<GIw(7^aQflfl%islt9t8M8k>tg=^d
z!M)VgiA+fEMY<<;T#h!MxNkDLq_Wd%mQnWRePx%FuIj#M*;C^d_;U0`+?ZHgEvuGQ
zRrQs8u&h#u1njCs+w0{=?+g_E#Q-d;VrI8`mQ_EW*^TFMSXLYFpY?!)`9PMZr7m!&
z%F#f7wfZ*vO}7_dgc{{6b<$bhraI|ntTxvv&+<*FVZ*W-LI_FJF&7-oSj87dNv-BF
zdpqr1a0G11=6}QQU?~90YHTjKp~VX^gVy(Oo0ZH;rDOG#M&O;(l#(lz)7cc+Z&Q*@
zM3O|l0isdQw=IZ<39%tYmfn*;m~k+&@1rE*tr~GOVq`r|M)vVHELz_F@ACk$+Ma#Y
z0{WQB8Z#5wtxmP1^bYBS{QPQ~*xKQFb@AF4X$ouUHE>>9N?zOJZ2O@xYo>w-b#PU#
zeEExhC|~5D<cr&^9vB7c=iVtA6KYS6?HFr4<kCN0GfL!>W839hYLjsqn*!OM`QlK%
z>>M<=Tg6hHgS=BfJyzeXb4TrtCw93byS9M3YTk*Rl5_FeX>FcYJDtVzfX;;S72xPB
zV0eFm1)AWFFOIwGGLPBYV$iHnb8C2iXl_b>3A0EXTcb$a=W`9sOA>k*H#AMqPg1hj
zw}_BNqkFZnJ&a8J>}4WcJ&Nmo3eU-p=>BKK)`#EwGGbg%OiR<e+9_m&W1N@LuwOf*
zds%Luh%%B>?4QsjIe1!1d!#tA?-2Peh(<!uVm`uqA!f-fy8%q|%~%GpPtXDP0XY3r
z<RXPkI}I`+T?scIU-ROclb#FV#S_~vhdU;=Ux~Gz=s*74QTa-|c`DvB8Sgo%oqptu
z_d@)hqu#5*?y0$7CxhMZ`=t27YOG}$y8e`7t()aDZWH2zlTh@K$w@pm>cF^i!<YsS
zctyZEI0;DszPQbp3-07Ts+^{Z6uDd?fXX<DUA_y?DOfzwyhorhJBLe8ug-yfj9H)y
zjowkaqkNY;vTF;Rq~@L2DT$ZWNoz@lI_WHqgAiQ7Nh-t37n(KiC~1*-%-&8L(?duH
zqrWk_8+g3vIY`<w1vJ~w>JpCtyhQ=cG0I^}6h|JHH)Zp(+y8ddxS=(NxKYfWN4U}2
zoNgZBAfFc&2C;vPs@L1xoF{Aorz$wJV2<USS(lT1I8Y8&G9C^}^SWkpW^gLvQcqGr
zJBTnMWLCpesb^^aED_?V>@_09{Tt@n5%xMI-%g|^(-x*K5fZzeS%xupAi0Wb#O2(a
zJ1MLH*3@WPRWWhuzr=;VjDPJvgH$v1HZXMx8||75cb(jD+H)GwNvp{EHRrY~n8&Uv
z?Nc|qUQ*htMF2@?V$vV{TQFrzhib59OzlaL4;9PwP_q#cKp(LwdUi<EB4`1;v{X?W
z263a=w^X0%cXVeL7GRdb;bBc?Tg{@u)a@KDOt(4*G4<Mc>?f#`&St5BQ(kTv_t`!U
zN8S!mOHK+yhVg#Ptrc}oL6O1awLv5oAnYGtZ_s+1_(5-Z1O7i)0rDwH#@jAR&qLsq
zWC&~Np!99|C0Uk^g#qZ~-}fGp6CT}13?!L6D`DTWcD*j|(WqIN@g1QDXc*ApQTEW0
z%`V+5pMxFwV0t*KijNAceL#GJ<e{}zzTZY^8ok}0Nq;jTO(}_sO5&HAf&*WPH_J!v
zam2g3gjSkHdIa6TNUwYG`GSPk5feiwIT3Z`NN?hlW7cPq7p_x3`zx6dS7|9i(rIYl
zq)|T-_)&mIF?nA(yQ!=X_V1ACMf__AK&m+n!ADn4lR!n?%393^2={O{@WUCh0f)9}
z;kq5t^Z$mLRUKW^vUrJTNV0$*z{pR-Y97HI@nsquNS|BsRaniPU9>h1bYdwN24G{U
z8KEvG%W^{?7V1(1merg_{MuIYM&-ZdUaMdPu$mwHW3G{flfobNGrZPo#0TgLM-R~#
zj;ygA4~Z~lIK6avkd^z#&N>D8?;*R;$2yD_J(wTM6;k^$_$m)(dKhd}wx11%?)3Xq
z;E#wjWX%@iJlrpxN)I1Ok87#Xyp}Cw@iC@9N3q2GwC6zVP9<NU*}X^|+xU&&fI1e9
zPet#XjNW;A@IrL;QTb}H{dlaAzi-4ynR0O=SzYMu+37-m+(;Kvy{{|QG+o^M&ASm)
z`G&d?%sAXIQ|^5F#3(P;xIiBt7dX4!=q=ujBBC3yA5vo_!$v~ZLnG<Kumqq7B+#cG
zcrfAUQx8~uO0@6tUd8^Diuq3<)h?SdfLhVNFxE;Nh223zz~o^ziTvDX@f+A_q=lb*
z=3?|-YU3Tp<Bj}%BVMnKKQlwaq`R0nE)RP4NrRr}J7?O~zv(%)Sb;ecLdFslHq*9^
zTK2IcNC$%hR85#Jy$7g=0AnJ|-is&_`h^cSF?_y9`)3Xe<|0Js15fmkAAFsTrrz-H
z^_pS(j(lz+#m0Rh8}ALdpuE@cR%rpUZFiLErc_%31!AfuqP5W}kZh<<T1&XV#UT)C
zkh?hS;PJ)c4q<w=ue!uMW^aqr1&^Dbw&C#`^pVU#yval4;NXgD_9zCh>=Wg*Z9|V!
zMl!GoiUZMY#!lC^gomBSK~54LTT3-MArPZDMuTyTV~F5j7mx=yk*lu+5yjU=UZy>6
zl5BDyMoaXuKf`&lV2)I3vuM0dC`s?h@$HbXE%9_e@7Cf5*bgWj7k}Iy$p_uo_Q){K
z3Oq^F8BRTo7^E6XrAJ3u{xEwNnP0`fMqI*X5W*~ZW!ug-)gMMK#GZO9@<!jWJyY?O
zlkt^jyyup`8{hn9Wa82Dv8S#?;ztjDee0<OCqMVy#gma06FYvYbWhFFI;nL3@<u?4
zKP_uM+}}Uy*|9<)y~SGS**O?JkMk6cL8IUId9pM`vZc|u>Zev8bGkuXbyJBn%%c2y
z!0f$njo8|#1!WD60VlFK^s%mq5?`%^ObwCa*)~mK@SfdMT)2v(&pQo{E_|d~H@ztn
zB?f~ud`EKwmLoRj9;ykJ>!h>Zxf-=hlrc5#7<SkwsJ2|Y6`<NMQ5HQz7T>els%6dK
zE;jfvzW~VycWtN`9CG1^rLfvmiXb#0T#T4k;YjTjwOMVMwKrn9ZR)BoR!Yc)mO3?U
z86E4Sv!1L@I{VGFns2Vq=)1CHlv#_0B_s@Uas)LRs{O8>2lwS~Dq4yaV_zUENW9ck
zYA&^uT1yK`ZKZZ~0SwV`LnRFt!*xy4ZN)J9eLLlG^!tukd#7>pN$b8-D<$4(N1b<I
zAqaJ$p^|o}i`33yhv6z**zg-!Xx^W>HQxwS(l-=1ppSi%BkC@EcNgRT`w{`6puUa1
z#7I{YbHos))s<(TNbclOoQXj8=ivF-UI+v|{8Znk2l~atem=ghd!Fjkm8U56m#GLc
zdtW2jaSh!NLS7H#QiGXceD`)C7m#GE%HZ3!YHBpY_!%Ld$$aB*R4Vk9+X_!5W-UNL
zsUuH0Y*IiIB5)Kt_HbWu2u3M8mUOLFNV9@=C|lTH_I+SIzzfZ)2@z3R`3W=%x@7->
z2vrkqfwYbaRXyRvD<s48Ci?2H${!D7iSZ=grV_Y~i=wF=b}tTAIDi2J7GFElec3{0
zMAO6eEhiS~p=_>z?~36I!lJ@{Ldk9P@+AGV&<p!3{M19R_F>rXNP1L6u5zPUP#nyv
zx{}RdhKBz}89zg0H<3Xihd^{6zr60v<W${nOyMw2XODnrbl)~SD&G@eNzAKQQ4ur-
zd2A5VgJszJD1qF!+5t2##E1mrm!kuhquZ|gye;ADUSErU;)$O}rAUjzqkAP1n~HQ#
zM!HXC&$YZgaQ@lPU5xzt#Ey@~^nbZ$+4qKC9{S$!WU%XK;6&%u;OePcUf&I_zTC6&
zd-<31@Aj;FF?say6YAxsKQkG9^61VJTTVW4`tZ5-H+*ksZ$5f8+C6o%>$}nJ8$N02
zdMK4k@2<8z2JjZB%YN{O=!F9AWsTV?PI)j-zo#DTgP+-eh1`@aQ+o!SBXG$VTL}c5
z3qUwH;oOgETPf)y<gRcI>ZDlQ?2~11j-HP%Zm}^1#XM_11K^yU!v!i<=K!0pH#dU?
zXls{ktBG~e*`hmQitbsS>%okIBZXa7<2B!D)BHO%T9`|F`PjZw-cNT($Ag9u2(6xC
z0MurB*D&yeVBiUrf@(_%@T(X?WI&`C0SvQy2bo+e7PYk)L4RF9Icyl_$m3Q+7O+*@
z9B`?&)3M$hE0*a4aLvABx2szz(U05Ib_0e*)ed!GF>1iDSi@hBjhTGJ+*&X!qAr@R
z$<nOU<Twmg+-Mbq(W*142L!4az{gFlqCx7OeFM5@Fa8Gy*kv53h^0lc_vq-)iTp1j
ze@(<S<uwvxDDV*!i2uPyM6&;aqgfhj%TY3dF+QMV#(g|;)`K}+39YVQ-)HUhi;EJ}
zg%^kqe5&hHHD~I^d;}as8UvpkF!ggNj(wE~893PA65*(o066<UME;P-KL^p{2HMqV
za<0?>`xkVE5!p%P|AH9frSvud-9*I}@&d)r2*@Vlj8G~DZ1oS?2vp2&z)fynqiZgs
zB=TW~_bEASa^rN4L(Mq$+yLfk?(ufJ%hW!W9~;%oV8P*W=11ZG9_46X#SXw`d(Zja
zC(pNi>T>UsmwTUj-z&E#<@bF`)IYJ~=V1vbGknKn_>R*nE`(Q7WXbZWIanu^<#Si^
z<rOQ>gkDWvXy5Qk^5hFA(kJELRZk};lIPnu+{*N57nPN_lKqmh@|QrO9UHm3{y%Pw
zKHa92#YhmN=1_Dq^e9&-iVx$c0q#ri)V4(y9f8<309Avphs>ZzS+WI#onjLYfE%)G
zk+>scF+nAQ<s|wu{eH6aB}EOB&>XTrshJ1U!&%mtA+mK>EUNXfahh~olv47m7;Zp=
zWVp?fJ{><?+loPfco4d<xshkYw5U4rxY#j^mIbFyI?F1jHd#S)u$4ImT(p}H<tVAc
zJZ5jF9dO|n11!{5c!MzQ5`YFh^UIFXT*NjY20e2d<(T<1tL=6O_E497kLI5Ft}A>%
zE*3>QCaA6t$ci2U&}dgXjM|K-3)MxA&(t?sn^w!I02+Qkw$AyQEJbZEpQ9zYwaXec
zSya?6RqB!elVb)<hQK15<*oE}fa?_i*KvR$5`=syT2@26zs02{gw{6k{@+||hR>s>
z_X7RD8GT_1<+1yM+TCy)cgEwnwDAI&j`(7>wYm{}RXNb+yVahC@5t#-b-5$Ew(9me
zt6p&jJ&t;G<B66vzD~L(vmVr?hWZ^=d(}INVMASLX}ImRm?fE8LtO}hCpM!M#Dz?-
zB*U^_Maf{*!g7+bkdG^V6>DK!S}yfTntTXQ`8i3(-wEZ|Lx;W3Nrw=}!PTJ{)S)|*
zTo-{s`xmGz-Op!=AJBdH5Kju5!C@^mfRpdxBnuMg4iO21B$R6ez>F-9h%DGpmPgqC
zrVM`v@}YlUVQoP>$gRu>4v282Vh|s79LaG-OP9mC9O5DFP;{)=bz>@H=safZIsu}x
zNgF6Sfbc&1A4F`u#lJt3Rx_+ZU(v&`CTBz>kkENZ`E$AvSpJY!;YJWD5w99~Q=(af
z)wLdCFuHTSA*ZHLYGFS@4GK(G@}rrY9%8usa5k6GFjxwQv!gJNj}B)G4x2u;OgrqL
z(!~F5oK3Xq!NSNW!rW1MlynsPTKTa8^vnIak6(p#&@1@?l}LP4*!kb0z3&jQbR%6J
zW{W5(SEslSfFcewVr}Ir6a8}9<*T2UG{TW|f!&4jE>R(Cpg<a`(!=N5wq5Rh7%Qnt
z1Eeao`6r&b4pnKviO)}lyAjx@_z@mIsdS#a=XLLy?XLyj$X-$&|I1M9=-{!=6MHT<
zx4snpX7qRCM}5EWO5p|9gHn9KiM|&<e?0YK>g28qv1L=SRg<w*XWA~r)=oTnRk>qo
zChJvY>D0}x7nP+my`1yjE-H6i_e*`7&!x}p`Ge4f$o+p1I@5P_;P~S&K7L}?*OODx
z-pOe1>9$MJyUv9scASsg|Iv6WemX;q^&QeimrTuMy`U`lMK|8%i=BUaqfLr#)mqSO
z{;1`dkoQNSmCtl2?}U8MEC{&kcYle-Wk#03$$*Km;c_5X!!7G~v#;IRCdn^ZW_QG{
zg~SxG9HWUDkSzVq&C!5QchU?<5t>%kfzhH34|(mhqw3t^HL8cy0;Ab?<br#va{wV>
z!sv$Y$T_suErP_VEnu{ocY>*R!F6cC9mMLSwX~=@=`8&ggT35mKq8jS7e`4hy5O`?
zbTl0{kD#lP%kr_sLM|s-l909B9`_fx6yxN-V$YDk@SpdyRm8&iL61nl4^UF1;v%gg
zF9w4Q_Tj&uHBr{TLdt98$EqAvwt~*SL?>P*@+b(5a%9?IU36+OktIaBiI9+TNHl6P
zY#|Zy_OfyThuM*Okq3s+0$WCzmlNR^JWQ!=iNr+~7fTdZW{V{uj>I=*T*Of!Uq}z9
zxBy>+qW>rIno}SxTzh`=)054~%PsBavERH7QYqXw@yL7O)+%YV?9|}t<=@?ZCUjBh
zzcQUP`svMnZD2;9DLAim{ldrj<%fKT+`n$wd8hYJ?)2?kHw}~gnc<J+Owl8ZUI>Ch
zZB#Dy)NC+nkpYb_L)0cJ7B`cZ+4+p|!0^w)1OsyvOfZ%i%xy9>Ycw!&*euPF$HjwV
zclELXT`mTmG82q_A8u-!6+{C@W2bbxx|I@)2BU-x6ATP)2pIGm%3HvpyxFC>Fwkko
z)M9=HH4LBVd`;TMtedXMB1Wkb?<qNEO|ZujmoWPaylSChm<wX^Q2xl#28iWUEEu~?
z9?BTyv0JSgcc}ceTVtoAe3#Z(Aa2PQvtL0F+fVP(Dyws#b;fJwv1_1CI?KCRC!Ots
zZ8F4VltLmM9?(YL<#ZlUGHHbHPw&fkr)n#IvvDZ+3;3R*wjp#A;e-5b5)T@7+_(GN
zr1+3_%45GxM;@2@ws}yUbXMJ2#@Z%stZgzlMRUWPqS<T<=GHLt@IZV#aa`>*p1<o?
z@%b@$c5(8<A5&Ojh>d6mLYc$)T&Qn-Uue_%^`W()yYX*+@$M!+vfOKaW3zV|1vf6+
z8phT7%dUpY^=;<YTjRVQ4|J^lrn4OjS^i1qr|1*U_QfbR5*Pb0B0krot;P45g$vm-
zZ?OLkc2Q~Dpcu`5fgH(W%wX;@5SmPTU!`N*;|onn64spOXm0_XZ6nf7gj-WMhI|HR
z6EQt3uvUR)#3!z#ciA8GGj1`pXFP?g;|oz7Alh+@)I=*S8w8lyVj=_|6D<N3M(8Wl
zgL%X(<_fx(9+^Mm4=5eShvT$&gh)3LZsthQ9tVzJqdgef5r)Ve(CjTbKuD4+nKp<t
z_MGE0rhl9izJrn-Ne{CvsF}Z`!oGr?3F*4x3$3~Cm3+<Do5`Z_{A73;rjJnqzm@!Q
z^2Amc2(C1@PBpKYY+m#F;JM|m?Z42xW#WmeO3&2It{0V_Th4yhlrsD4q;eM~|39&J
zGTMDI4a3F6&YvnPre?Cv)D&}GSt>#j@hYsv@&lhR>;xYj1hz$j9k>OLueWN-Ez8&I
z7Jvw7OjB<0X}WB&Gy}-{(TqZw&53d>Xbli<*NvN9z)A_2Q*WXi%Q8@>Ov@@@O_USh
zV^ZxYw@8!ai=!k9s+s2`$p%&~q8d|fK~&-Ut3xiLs&8YJsM?jyjaN$!I?99S%LRv#
z*cKf-k^_(&Y(0902w>uZ;9Iy?p{jG1TA|*@SU)8v&dxT_9%tdinj6;aL!5=Pjg)&6
z5q_O%wR*ag#gGLfuKY%HrCdqv%?=})9iOD)v((>*i?rh3m@e?T;t8!4?3f#v=bjxm
z&d1LyJvF#h7^*i}HRqaJsowpvW8Arz#;b8<6R;Y*2@@aRl-fR%(do9@<Dw|p{B=fZ
zJEjS=)b<dps<W%M)7T}dNlc2>fB_`RYce)pla;)Z^T~LUtt0-y8Q5j82{;Smm?I{C
z2;+Y^z|?FXpBo5feaMb6{(gL>6;od&j4!p)L|Ed8IT;ovnQ>!#4I)>BgOV+?lH<96
zRhKXCt>s~eH9%*@13!o>_26IoSE!Jiz@BgW-#mP=`O~w2KbK7@H@RL?mT@kaxLXgT
zQfhuMmC}P0iCM_zG_D}%ovG1rs+qN7v{p8s!$%mo>Y<0MLwdkC#$}VP>^lO_swbP*
zJ$a3f0f)mBddeY|9@YJ#JUy;uN5+QJ@Ya%3*7zFNoh089MD8cDjmWPNNfX&iWIvHF
z68TLc$BFz7kyAuoA@Y49G$x5pLB?kXVt-CQX~G9SsGE;k;?bnsPs+$L&9w<eGK|r?
zWq(A8?;^4SL=QeNl2^xuGY_#Ra2BhiQNz0Km1X&Tk0L9-2urg3HR-F7A4?&=#(pd{
z{X~lWL|S|!F3THk1Z8>aji5*FyAf98H8<iOd53(XRgoX2-3M=kJ@PjGd)1A&BKOn5
zM`SuW0IJAcH{w2d@r`DMj&}Iuo*T_Rxr2VMzR{sz51AHHnw}er;vlOd@~Rsv7Ruc>
ZHU^MyNvqs@V|7Sg`oW@@yqrJR{{tnb*(3k}

diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py
index 1508d54..cdfcc1b 100644
--- a/test/unit/test_evaluate.py
+++ b/test/unit/test_evaluate.py
@@ -366,3 +366,203 @@ def test_compute_metrics_returns_all_expected_keys():
     for key in ('pearson_r', 'spearman_r', 'log1p_rmse',
                 'precision', 'recall', 'f1', 'jaccard', 'specificity'):
         assert key in m, f"Missing key: {key}"
+
+
+# ---------------------------------------------------------------------------
+# compute_per_cell_metrics
+# ---------------------------------------------------------------------------
+
+def test_compute_per_cell_metrics_basic():
+    truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1}}
+    obs   = {'c1': {'A': 4},          'c2': {'A': 1, 'B': 2}}
+    rows  = ev.compute_per_cell_metrics(truth, obs, ['c1', 'c2'], ['A', 'B'])
+    assert len(rows) == 2
+    assert rows[0]['cell_id'] == 'c1'
+    assert rows[0]['n_truth_expressed'] == 2
+    assert rows[1]['n_observed_expressed'] == 2
+
+
+def test_compute_per_cell_metrics_empty_cell():
+    truth = {'c1': {}}
+    obs   = {}
+    rows  = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A'])
+    assert rows[0]['n_truth_expressed'] == 0
+    assert rows[0]['n_observed_expressed'] == 0
+
+
+def test_compute_per_cell_metrics_returns_all_keys():
+    truth = {'c1': {'A': 3, 'B': 1, 'C': 2}}
+    obs   = {'c1': {'A': 3, 'B': 1, 'C': 2}}
+    rows  = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A', 'B', 'C'])
+    assert 'pearson_r' in rows[0]
+    assert 'spearman_r' in rows[0]
+    assert rows[0]['pearson_r'] == pytest.approx(1.0, abs=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# load_benchmark
+# ---------------------------------------------------------------------------
+
+def test_load_benchmark_existing_file(tmp_path):
+    bench = tmp_path / 'bench.txt'
+    bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n'
+                     '12.3\t11.2\t500.0\t100.0\t200.0\n')
+    result = ev.load_benchmark(str(bench))
+    assert result['wall_time_s'] == '12.3'
+    assert result['max_rss_mb'] == '500.0'
+    assert result['io_in_mb'] == '100.0'
+
+
+def test_load_benchmark_missing_path():
+    result = ev.load_benchmark('/nonexistent/path.txt')
+    assert result == {}
+
+
+def test_load_benchmark_none():
+    assert ev.load_benchmark(None) == {}
+
+
+# ---------------------------------------------------------------------------
+# write_tsv
+# ---------------------------------------------------------------------------
+
+def test_write_tsv_basic(tmp_path):
+    rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
+    out  = tmp_path / 'out.tsv'
+    ev.write_tsv(rows, str(out))
+    with open(out) as fh:
+        lines = fh.readlines()
+    assert lines[0].strip() == 'a\tb'
+    assert lines[1].strip() == '1\t2'
+    assert lines[2].strip() == '3\t4'
+
+
+def test_write_tsv_empty_with_fallback_fields(tmp_path):
+    out = tmp_path / 'out.tsv'
+    ev.write_tsv([], str(out), fallback_fields=['x', 'y'])
+    with open(out) as fh:
+        content = fh.read()
+    assert 'x\ty' in content
+
+
+# ---------------------------------------------------------------------------
+# load_ground_truth default granularity fallback
+# ---------------------------------------------------------------------------
+
+def test_load_ground_truth_unknown_granularity_falls_back(tmp_path):
+    p = make_ground_truth(tmp_path, GT_ROWS)
+    truth, _ = ev.load_ground_truth(str(p), granularity='unknown_gran')
+    # falls back to repeat_id key (same as gene_id)
+    assert 'AluSz6' in truth['c1']
+
+
+# ---------------------------------------------------------------------------
+# main() end-to-end integration tests
+# ---------------------------------------------------------------------------
+
+def _write_gt(path):
+    path.write_text(
+        'cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n'
+        'c1\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t5\n'
+        'c1\tL1PA2_dup1\tL1PA2\tL1\tLINE\t3\n'
+        'c2\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t2\n'
+    )
+
+
+def _write_counts(path):
+    path.write_text(
+        'feature_id\tc1\tc2\n'
+        'AluSz6\t4\t2\n'
+        'L1PA2\t3\t0\n'
+    )
+
+
+def test_evaluate_main_creates_output_files(tmp_path, monkeypatch):
+    import sys
+    gt     = tmp_path / 'gt.tsv'
+    counts = tmp_path / 'counts.tsv'
+    _write_gt(gt)
+    _write_counts(counts)
+    prefix = str(tmp_path / 'out')
+    monkeypatch.setattr(sys, 'argv', [
+        'evaluate.py',
+        '--ground-truth', str(gt),
+        '--observed-counts', str(counts),
+        '--aligner', 'test_aligner',
+        '--output-prefix', prefix,
+    ])
+    ev.main()
+    assert os.path.exists(prefix + '_global_metrics.tsv')
+    assert os.path.exists(prefix + '_per_cell_metrics.tsv')
+    assert os.path.exists(prefix + '_per_family_metrics.tsv')
+
+
+def test_evaluate_main_global_metrics_content(tmp_path, monkeypatch):
+    import sys
+    gt     = tmp_path / 'gt.tsv'
+    counts = tmp_path / 'counts.tsv'
+    _write_gt(gt)
+    _write_counts(counts)
+    prefix = str(tmp_path / 'out')
+    monkeypatch.setattr(sys, 'argv', [
+        'evaluate.py',
+        '--ground-truth', str(gt),
+        '--observed-counts', str(counts),
+        '--aligner', 'starsolo',
+        '--multimapper-mode', 'unique',
+        '--granularity', 'gene_id',
+        '--feature-set', 'repeats',
+        '--output-prefix', prefix,
+    ])
+    ev.main()
+    import csv as _csv
+    with open(prefix + '_global_metrics.tsv') as fh:
+        rows = list(_csv.DictReader(fh, delimiter='\t'))
+    assert len(rows) == 1
+    assert rows[0]['aligner'] == 'starsolo'
+    assert rows[0]['granularity'] == 'gene_id'
+
+
+def test_evaluate_main_with_locus_map(tmp_path, monkeypatch):
+    import sys
+    gt     = tmp_path / 'gt.tsv'
+    counts = tmp_path / 'counts.tsv'
+    lm     = tmp_path / 'locus_map.tsv'
+    _write_gt(gt)
+    _write_counts(counts)
+    lm.write_text('AluSz6_dup1\tAluSz6\tAlu\tSINE\nL1PA2_dup1\tL1PA2\tL1\tLINE\n')
+    prefix = str(tmp_path / 'out')
+    monkeypatch.setattr(sys, 'argv', [
+        'evaluate.py',
+        '--ground-truth', str(gt),
+        '--observed-counts', str(counts),
+        '--aligner', 'alevin',
+        '--locus-map', str(lm),
+        '--output-prefix', prefix,
+    ])
+    ev.main()
+    assert os.path.exists(prefix + '_global_metrics.tsv')
+
+
+def test_evaluate_main_with_benchmark(tmp_path, monkeypatch):
+    import sys
+    gt     = tmp_path / 'gt.tsv'
+    counts = tmp_path / 'counts.tsv'
+    bench  = tmp_path / 'bench.txt'
+    _write_gt(gt)
+    _write_counts(counts)
+    bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n5.0\t4.5\t300.0\t50.0\t80.0\n')
+    prefix = str(tmp_path / 'out')
+    monkeypatch.setattr(sys, 'argv', [
+        'evaluate.py',
+        '--ground-truth', str(gt),
+        '--observed-counts', str(counts),
+        '--aligner', 'kallisto',
+        '--benchmark', str(bench),
+        '--output-prefix', prefix,
+    ])
+    ev.main()
+    import csv as _csv
+    with open(prefix + '_global_metrics.tsv') as fh:
+        rows = list(_csv.DictReader(fh, delimiter='\t'))
+    assert rows[0]['wall_time_s'] == '5.0'
diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py
index 0e81b1a..ec32c2e 100644
--- a/test/unit/test_simulate_reads.py
+++ b/test/unit/test_simulate_reads.py
@@ -297,3 +297,136 @@ def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path):
     intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'})
     assert 'chr1' in intervals
     assert 'chr2' not in intervals
+
+
+# ---------------------------------------------------------------------------
+# parse_gtf_repeats_by_chrom – additional branch coverage
+# ---------------------------------------------------------------------------
+
+def test_parse_gtf_repeats_by_chrom_skips_comment_lines(tmp_path):
+    gtf_content = (
+        '# this is a header comment\n'
+        'chr1\trmsk\texon\t101\t500\t.\t+\t.\t'
+        'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+        'family_id "Alu"; class_id "SINE";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf))
+    assert 'chr1' in intervals
+    assert len(intervals['chr1']) == 1
+
+
+def test_parse_gtf_repeats_by_chrom_skips_short_lines(tmp_path):
+    gtf_content = (
+        'too\tfew\tfields\n'
+        'chr1\trmsk\texon\t101\t500\t.\t+\t.\t'
+        'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; '
+        'family_id "Alu"; class_id "SINE";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf))
+    assert len(intervals['chr1']) == 1
+
+
+def test_parse_gtf_repeats_by_chrom_max_per_chrom_truncates(tmp_path):
+    lines = ''
+    for i in range(5):
+        start = 101 + i * 300
+        end   = start + 300
+        lines += (
+            f'chr1\trmsk\texon\t{start}\t{end}\t.\t+\t.\t'
+            f'gene_id "R{i}"; transcript_id "R{i}_dup1"; '
+            f'family_id "Alu"; class_id "SINE";\n'
+        )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(lines)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=2)
+    assert len(intervals['chr1']) == 2
+
+
+def test_parse_gtf_repeats_by_chrom_max_per_chrom_no_truncation(tmp_path):
+    # max_per_chrom larger than count -> no sampling
+    gtf_content = (
+        'chr1\trmsk\texon\t101\t500\t.\t+\t.\t'
+        'gene_id "A"; transcript_id "A_dup1"; family_id "Alu"; class_id "SINE";\n'
+    )
+    gtf = tmp_path / 'repeats.gtf'
+    gtf.write_text(gtf_content)
+    intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=10)
+    assert len(intervals['chr1']) == 1
+
+
+# ---------------------------------------------------------------------------
+# stream_fasta_by_chrom
+# ---------------------------------------------------------------------------
+
+def test_stream_fasta_by_chrom_plain(tmp_path):
+    fa = tmp_path / 'test.fa'
+    fa.write_text('>chr1\nACGTACGT\n>chr2\nTTTTGGGG\n>chr3\nAAAAAAAA\n')
+    result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1', 'chr3'}))
+    assert result['chr1'] == 'ACGTACGT'
+    assert result['chr3'] == 'AAAAAAAA'
+    assert 'chr2' not in result
+
+
+def test_stream_fasta_by_chrom_multiline_seq(tmp_path):
+    fa = tmp_path / 'test.fa'
+    fa.write_text('>chr1\nACGT\nACGT\n')
+    result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'}))
+    assert result['chr1'] == 'ACGTACGT'
+
+
+def test_stream_fasta_by_chrom_gzip(tmp_path):
+    fa_gz = tmp_path / 'test.fa.gz'
+    import gzip as _gz
+    with _gz.open(str(fa_gz), 'wt') as f:
+        f.write('>chr1\nACGTACGT\n')
+    result = dict(sr.stream_fasta_by_chrom(str(fa_gz), {'chr1'}))
+    assert result['chr1'] == 'ACGTACGT'
+
+
+def test_stream_fasta_by_chrom_skips_unwanted(tmp_path):
+    fa = tmp_path / 'test.fa'
+    fa.write_text('>chr1\nAAAA\n>chr99\nCCCC\n')
+    result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'}))
+    assert 'chr99' not in result
+
+
+# ---------------------------------------------------------------------------
+# make_qual and safe_id
+# ---------------------------------------------------------------------------
+
+def test_make_qual_default_char():
+    assert sr.make_qual(5) == 'FFFFF'
+    assert len(sr.make_qual(90)) == 90
+
+
+def test_make_qual_custom_char():
+    assert sr.make_qual(3, 'I') == 'III'
+
+
+def test_safe_id_replaces_spaces_and_slashes():
+    assert sr.safe_id('LINE/SINE foo') == 'LINE_SINE_foo'
+
+
+def test_safe_id_no_change():
+    assert sr.safe_id('AluSz6_dup1') == 'AluSz6_dup1'
+
+
+# ---------------------------------------------------------------------------
+# build_chrom_locus_coords
+# ---------------------------------------------------------------------------
+
+def test_build_chrom_locus_coords_basic():
+    plan = {
+        'c1': {'locus_A': (3, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+')},
+        'c2': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+'),
+               'locus_B': (5, 'gB', 'fB', 'cB', 'chr2', 500, 700, '-')},
+    }
+    result = sr.build_chrom_locus_coords(plan)
+    assert 'chr1' in result
+    assert result['chr1']['locus_A'] == (100, 200, '+', 'gA')
+    assert 'chr2' in result
+    assert result['chr2']['locus_B'] == (500, 700, '-', 'gB')
diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd
index 08c9f43..dd97006 100644
--- a/workflow/scripts/evaluation_report.Rmd
+++ b/workflow/scripts/evaluation_report.Rmd
@@ -13,7 +13,7 @@ params:
 ---
 
 ```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE,
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
                       fig.width = 12, fig.height = 12)
 library(ggplot2)
 library(dplyr)
@@ -315,7 +315,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa
 
 ## compute resources
 
-```{r resources, fig.height=14, fig.width=18}
+```{r resources, fig.height=10, fig.width=14}
 res_cols <- intersect(c("wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb"), names(global))
 if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) {
   n_al5 <- length(unique(global$aligner_mode))

From af9c7308fd9277316411f4d435bd1259584568cb Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:19:28 +0100
Subject: [PATCH 05/12] Track extra tests, remove testing twice

---
 .github/workflows/tests.yml            |   6 +-
 test/unit/test_count_starsolo_locus.py | 200 +++++++++++++++++++++++++
 2 files changed, 202 insertions(+), 4 deletions(-)
 create mode 100644 test/unit/test_count_starsolo_locus.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 656c49a..ae5a84e 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,8 +2,6 @@ name: Tests
 
 on:
   workflow_dispatch:
-  push:
-    branches: [master, dev]
   pull_request:
     branches: [master, dev]
 
@@ -50,7 +48,7 @@ jobs:
             -c bioconda
             -c conda-forge
             python=3.11
-            "snakemake>=8"
+            snakemake>=8
           init-shell: bash
 
       - name: Dry-run main Snakefile with SmartSeq2 config
@@ -106,7 +104,7 @@ jobs:
             -c bioconda
             -c conda-forge
             python=3.11
-            "snakemake>=8"
+            snakemake>=8
           init-shell: bash
 
       - name: Run negative control workflow
diff --git a/test/unit/test_count_starsolo_locus.py b/test/unit/test_count_starsolo_locus.py
new file mode 100644
index 0000000..0589c1c
--- /dev/null
+++ b/test/unit/test_count_starsolo_locus.py
@@ -0,0 +1,200 @@
+"""
+Unit tests for count_starsolo_locus.py pure functions and mocked subprocess paths.
+"""
+import os
+import sys
+import textwrap
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+import count_starsolo_locus as csl
+
+
+# ---------------------------------------------------------------------------
+# parse_locus_id
+# ---------------------------------------------------------------------------
+
+def test_parse_locus_id_plus_strand():
+    result = csl.parse_locus_id("AluSz_dup1::chr10:10000-10435(+)")
+    assert result == ("chr10", 10000, 10435, "+")
+
+
+def test_parse_locus_id_minus_strand():
+    result = csl.parse_locus_id("L1M4_dup3::10:65000-65900(-)")
+    assert result == ("10", 65000, 65900, "-")
+
+
+def test_parse_locus_id_malformed_returns_none():
+    assert csl.parse_locus_id("no_double_colon") is None
+    assert csl.parse_locus_id("bad::coords") is None
+
+
+# ---------------------------------------------------------------------------
+# load_intervals
+# ---------------------------------------------------------------------------
+
+def test_load_intervals_basic(tmp_path):
+    fa = tmp_path / "test.fa"
+    fa.write_text(
+        ">AluSz_dup1::10:10000-10435(+)\nACGT\n"
+        ">L1M4_dup3::10:65000-65900(-)\nACGT\n"
+    )
+    chrom_starts, chrom_intervals = csl.load_intervals(str(fa))
+    assert "10" in chrom_intervals
+    assert len(chrom_intervals["10"]) == 2
+    # sorted by start
+    assert chrom_intervals["10"][0][0] == 10000
+    assert chrom_intervals["10"][1][0] == 65000
+    assert chrom_starts["10"] == [10000, 65000]
+
+
+def test_load_intervals_skips_non_header_lines(tmp_path):
+    fa = tmp_path / "test.fa"
+    fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGTACGT\n")
+    _, intervals = csl.load_intervals(str(fa))
+    assert len(intervals["10"]) == 1
+    assert intervals["10"][0][2] == "AluSz_dup1"
+
+
+def test_load_intervals_deduplicates(tmp_path):
+    fa = tmp_path / "test.fa"
+    fa.write_text(
+        ">dup1::10:100-200(+)\nACGT\n"
+        ">dup1::10:100-200(+)\nACGT\n"
+    )
+    _, intervals = csl.load_intervals(str(fa))
+    assert len(intervals["10"]) == 1
+
+
+# ---------------------------------------------------------------------------
+# find_locus
+# ---------------------------------------------------------------------------
+
+def make_intervals():
+    chrom_intervals = {"10": [(10000, 10200, "AluSz_dup1"), (20000, 20500, "L1_dup2")]}
+    chrom_starts = {"10": [10000, 20000]}
+    return chrom_starts, chrom_intervals
+
+
+def test_find_locus_hit_first():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 10100, cs, ci) == "AluSz_dup1"
+
+
+def test_find_locus_hit_second():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 20300, cs, ci) == "L1_dup2"
+
+
+def test_find_locus_boundary_start():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 10000, cs, ci) == "AluSz_dup1"
+
+
+def test_find_locus_boundary_end_exclusive():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 10200, cs, ci) is None
+
+
+def test_find_locus_before_all():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 5000, cs, ci) is None
+
+
+def test_find_locus_between_intervals():
+    cs, ci = make_intervals()
+    assert csl.find_locus("10", 15000, cs, ci) is None
+
+
+def test_find_locus_unknown_chrom():
+    cs, ci = make_intervals()
+    assert csl.find_locus("chrX", 10100, cs, ci) is None
+
+
+# ---------------------------------------------------------------------------
+# process_smartseq2 with mocked subprocess
+# ---------------------------------------------------------------------------
+
+def make_fasta(tmp_path):
+    fa = tmp_path / "rep.fa"
+    fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGT\n>L1_dup2::10:20000-20500(+)\nACGT\n")
+    return str(fa)
+
+
+def _mock_proc(lines):
+    proc = MagicMock()
+    proc.stdout = iter([l.encode() for l in lines])
+    return proc
+
+
+def test_process_smartseq2_assigns_reads(tmp_path):
+    chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path))
+    sam = [
+        "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n",
+        "cell_001_r2_AluSz_dup1\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n",
+        "cell_002_r3_L1_dup2\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n",
+    ]
+    with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)):
+        counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique")
+    assert "cell_001" in cbs
+    assert "cell_002" in cbs
+    assert counts["cell_001"]["AluSz_dup1"] == 2
+    assert counts["cell_002"]["L1_dup2"] == 1
+
+
+def test_process_smartseq2_unique_filter(tmp_path):
+    chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path))
+    # NH:i:3 = multimapper, should be dropped in unique mode
+    sam = [
+        "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n",
+    ]
+    with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)):
+        counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique")
+    assert len(cbs) == 0
+
+
+def test_process_smartseq2_multi_mode_keeps_multimappers(tmp_path):
+    chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path))
+    sam = [
+        "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n",
+    ]
+    with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)):
+        counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "multi")
+    assert "cell_001" in cbs
+
+
+# ---------------------------------------------------------------------------
+# process_chromium with mocked subprocess
+# ---------------------------------------------------------------------------
+
+def test_process_chromium_assigns_reads(tmp_path):
+    chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path))
+    sam = [
+        "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:GGATGACGAAGG\n",
+        "r2\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:TTTTTTTTTAAA\n",
+        "r3\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:CTAAGCCACCCTGAAG\tUB:Z:AAAAAAAAAAAA\n",
+    ]
+    with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake_sorted.bam"), \
+         patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \
+         patch("os.unlink"):
+        counts, cbs, loci = csl.process_chromium(
+            "/fake.bam", chrom_starts, chrom_intervals, "unique", 1)
+    assert "ACGAACGAAGTGAGCT" in cbs
+    assert "CTAAGCCACCCTGAAG" in cbs
+    # UMI deduplication: 2 distinct UMIs for cell1/AluSz_dup1
+    assert counts["ACGAACGAAGTGAGCT"]["AluSz_dup1"] == 2
+    assert counts["CTAAGCCACCCTGAAG"]["L1_dup2"] == 1
+
+
+def test_process_chromium_drops_no_cb(tmp_path):
+    chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path))
+    sam = [
+        "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n",  # no CB tag
+    ]
+    with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake.bam"), \
+         patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \
+         patch("os.unlink"):
+        counts, cbs, loci = csl.process_chromium(
+            "/fake.bam", chrom_starts, chrom_intervals, "unique", 1)
+    assert len(cbs) == 0

From ca33b200a8b91e7adf8ec82674c05c00b5b01fa1 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:23:21 +0100
Subject: [PATCH 06/12] Fix starsolo sam flags regression on smartseq2, add
 tests

---
 .github/workflows/tests.yml    | 8 ++++----
 workflow/modules/starsolo.snmk | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ae5a84e..f4042db 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -58,7 +58,7 @@ jobs:
           snakemake \
             --configfile configs/simulation_smartseq2.yaml \
             --dry-run \
-            --quiet
+            --quiet --cores 2
 
       - name: Dry-run main Snakefile with Chromium config
         shell: bash -el {0}
@@ -67,7 +67,7 @@ jobs:
           snakemake \
             --configfile configs/simulation_chromium.yaml \
             --dry-run \
-            --quiet
+            --quiet --cores 2
 
       - name: Dry-run test Snakefile with negative control config
         shell: bash -el {0}
@@ -77,7 +77,7 @@ jobs:
             -s ../test/workflow/Snakefile_test \
             --configfile ../test/workflow/configs/test_negative_control.yaml \
             --dry-run \
-            --quiet
+            --quiet --cores 2
 
       - name: Run workflow dry-run pytest tests
         shell: bash -el {0}
@@ -115,5 +115,5 @@ jobs:
             -s ../test/workflow/Snakefile_test \
             --configfile ../test/workflow/configs/test_negative_control.yaml \
             --use-conda \
-            --cores 4 \
+            --cores 2 \
             --conda-frontend mamba
diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk
index da16717..50416d7 100644
--- a/workflow/modules/starsolo.snmk
+++ b/workflow/modules/starsolo.snmk
@@ -97,7 +97,7 @@ rule starsolo_smartseq2:
              --soloCellReadStats None \
              --soloUMIdedup NoDedup \
              --soloMultiMappers {params.solo_multimappers} \
-             --outSAMattributes NH HI AS NM CB UB \
+             --outSAMattributes NH HI AS NM \
              --outSAMtype BAM SortedByCoordinate \
              --limitBAMsortRAM {params.max_ram_bytes} \
              {params.extra_args} 2> {log}

From 818f952180405f4ca86aad1994c096579fde2b26 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:30:29 +0100
Subject: [PATCH 07/12] Attempt to remove hardcoded home path

---
 test/workflow/configs/test_negative_control.yaml | 10 +++++-----
 workflow/Snakefile                               |  3 +++
 workflow/configs/colon_cancer_cell_lines.yaml    | 10 +++++-----
 workflow/configs/real_data_chromium.yaml         | 16 ++++++++--------
 workflow/configs/simulation.yaml                 | 16 ++++++++--------
 workflow/configs/simulation_chromium.yaml        | 10 +++++-----
 workflow/configs/simulation_smartseq2.yaml       | 10 +++++-----
 7 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml
index c35bf3e..754bba2 100644
--- a/test/workflow/configs/test_negative_control.yaml
+++ b/test/workflow/configs/test_negative_control.yaml
@@ -5,8 +5,8 @@
 #   - only a small number of cells (5) to keep runtime low
 #   - testing.negative_control_max_recall sets the recall threshold
 
-base: "/home/imallona/repeats/results/test_negative_control"
-indices_base: "/home/imallona/repeats/results/shared"
+base: "~/repeats/results/test_negative_control"
+indices_base: "~/repeats/results/shared"
 
 reference:
   assembly: hg38
@@ -14,9 +14,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf:  "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf:    "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf:  "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf:    "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index ab05a91..d904f0b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -16,6 +16,9 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all'
 if 'indices_base' not in config:
     config['indices_base'] = config['base']
 
+config['base']         = op.expanduser(config['base'])
+config['indices_base'] = op.expanduser(config['indices_base'])
+
 include: "modules/download_references.snmk"
 include: "modules/data_acquisition.snmk"
 include: "modules/reference.snmk"
diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/colon_cancer_cell_lines.yaml
index df4f533..16cf23a 100644
--- a/workflow/configs/colon_cancer_cell_lines.yaml
+++ b/workflow/configs/colon_cancer_cell_lines.yaml
@@ -1,16 +1,16 @@
 sample: "colon_cancer_cell_lines"
 ## base_path
-base: "/home/imallona/repeats/"
+base: "~/repeats/"
 aligners: ['starsolo']
 
 # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz
-repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz"
+repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz"
 
 # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz
-genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz"
+genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz"
 
 # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
-genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
+genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
 
 max_mem_mb: 20000 ## these are megabytes
 
@@ -21,4 +21,4 @@ extraStarSoloArgs: ""
 ## solo ####################################################################
 
 soloType: "CB_UMI_Simple"
-readFilesManifest: "/home/imallona/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv"
\ No newline at end of file
+readFilesManifest: "~/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv"
\ No newline at end of file
diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/real_data_chromium.yaml
index 422e8a1..77f0de9 100644
--- a/workflow/configs/real_data_chromium.yaml
+++ b/workflow/configs/real_data_chromium.yaml
@@ -1,9 +1,9 @@
-base: "/home/imallona/repeats/real_chromium"
+base: "~/repeats/real_chromium"
 
 reference:
-  genome_fasta: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
-  repeats_gtf: "/home/imallona/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz"
-  genes_gtf: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz"
+  genome_fasta: "~/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
+  repeats_gtf: "~/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz"
+  genes_gtf: "~/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz"
   filter_genic: true
 
 mode: real_data
@@ -13,11 +13,11 @@ real_data:
   chemistry: 10xv3
   samples:
     hct116_wt:
-      R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz"
-      R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz"
+      R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz"
+      R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz"
     hct116_dko:
-      R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz"
-      R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz"
+      R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz"
+      R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz"
 
 aligners:
   - starsolo
diff --git a/workflow/configs/simulation.yaml b/workflow/configs/simulation.yaml
index 24747bd..20db5f8 100644
--- a/workflow/configs/simulation.yaml
+++ b/workflow/configs/simulation.yaml
@@ -1,16 +1,16 @@
 sample: "test_sample_name"
 ## base_path
-base: "/home/imallona/repeats/"
+base: "~/repeats/"
 aligners: ['starsolo']
 
 # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz
-repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz"
+repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz"
 
 # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz
-genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz"
+genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz"
 
 # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
-genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
+genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz"
 
 max_mem_mb: 20000 ## these are megabytes
 
@@ -21,8 +21,8 @@ extraStarSoloArgs: ""
 ## solo ####################################################################3
 soloType: "SmartSeq"
 ## generated by a simulation but anyway
-#smartseq_cdna_fastq: "/home/imallona/repeats/simulations/cdna.fastq.gz"
-readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv"
+#smartseq_cdna_fastq: "~/repeats/simulations/cdna.fastq.gz"
+readFilesManifest: "~/repeats/simulations/manifest.tsv"
 
 
 # run_tech: "chromium"
@@ -42,8 +42,8 @@ readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv"
 #   featurecounts_parsing: "~/src/repeats_sc/04_snakemake/plot_featurecounts_profile.R"
 # software:
 #   Rscript: "/usr/local/R/R-4.0.5/bin/Rscript"
-#   bowtie_build: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie-build"
-#   bowtie: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie"
+#   bowtie_build: "~/soft/bowtie/bowtie-1.2.3/bowtie-build"
+#   bowtie: "~/soft/bowtie/bowtie-1.2.3/bowtie"
 #   pigz: "/usr/bin/pigz"
 #   star: "~/soft/star/STAR-2.7.3a/source/STAR"
 #   featurecounts: "~/soft/subread/subread-2.0.0-source/bin/featureCounts"
diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml
index ea12b78..7c3bfc6 100644
--- a/workflow/configs/simulation_chromium.yaml
+++ b/workflow/configs/simulation_chromium.yaml
@@ -1,5 +1,5 @@
-base: "/home/imallona/repeats/results/simulation_chromium"
-indices_base: "/home/imallona/repeats/results/shared"
+base: "~/repeats/results/simulation_chromium"
+indices_base: "~/repeats/results/shared"
 
 reference:
   assembly: hg38
@@ -8,9 +8,9 @@ reference:
   # Split repeats into genic/intergenic subsets using gene body overlap (bedtools)
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "~/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "~/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "~/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml
index b9a7698..9c65f9d 100644
--- a/workflow/configs/simulation_smartseq2.yaml
+++ b/workflow/configs/simulation_smartseq2.yaml
@@ -1,5 +1,5 @@
-base: "/home/imallona/repeats/results/simulation_smartseq2"
-indices_base: "/home/imallona/repeats/results/shared"
+base: "~/repeats/results/simulation_smartseq2"
+indices_base: "~/repeats/results/shared"
 
 reference:
   assembly: hg38
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 

From 6752e900abba16bd5a89b0f571118bae89626fa9 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:36:03 +0100
Subject: [PATCH 08/12] Switch to relative paths

---
 test/workflow/configs/test_negative_control.yaml | 10 +++++-----
 workflow/Snakefile                               |  3 +--
 workflow/configs/simulation_chromium.yaml        | 10 +++++-----
 workflow/configs/simulation_smartseq2.yaml       | 10 +++++-----
 4 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml
index 754bba2..e0c6498 100644
--- a/test/workflow/configs/test_negative_control.yaml
+++ b/test/workflow/configs/test_negative_control.yaml
@@ -5,8 +5,8 @@
 #   - only a small number of cells (5) to keep runtime low
 #   - testing.negative_control_max_recall sets the recall threshold
 
-base: "~/repeats/results/test_negative_control"
-indices_base: "~/repeats/results/shared"
+base: "../results/test_negative_control"
+indices_base: "../results/shared"
 
 reference:
   assembly: hg38
@@ -14,9 +14,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf:  "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf:    "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf:  "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf:    "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index d904f0b..a605307 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -16,8 +16,7 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all'
 if 'indices_base' not in config:
     config['indices_base'] = config['base']
 
-config['base']         = op.expanduser(config['base'])
-config['indices_base'] = op.expanduser(config['indices_base'])
+
 
 include: "modules/download_references.snmk"
 include: "modules/data_acquisition.snmk"
diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml
index 7c3bfc6..3a49b4f 100644
--- a/workflow/configs/simulation_chromium.yaml
+++ b/workflow/configs/simulation_chromium.yaml
@@ -1,5 +1,5 @@
-base: "~/repeats/results/simulation_chromium"
-indices_base: "~/repeats/results/shared"
+base: "../results/simulation_chromium"
+indices_base: "../results/shared"
 
 reference:
   assembly: hg38
@@ -8,9 +8,9 @@ reference:
   # Split repeats into genic/intergenic subsets using gene body overlap (bedtools)
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "~/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "~/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "~/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml
index 9c65f9d..8d54f10 100644
--- a/workflow/configs/simulation_smartseq2.yaml
+++ b/workflow/configs/simulation_smartseq2.yaml
@@ -1,5 +1,5 @@
-base: "~/repeats/results/simulation_smartseq2"
-indices_base: "~/repeats/results/shared"
+base: "../results/simulation_smartseq2"
+indices_base: "../results/shared"
 
 reference:
   assembly: hg38
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 

From 4f2f53f8f10bd6df4ab51a2b1113df622326ed84 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 13:38:45 +0100
Subject: [PATCH 09/12] Negative controls path

---
 test/workflow/configs/test_negative_control.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml
index e0c6498..204865f 100644
--- a/test/workflow/configs/test_negative_control.yaml
+++ b/test/workflow/configs/test_negative_control.yaml
@@ -14,9 +14,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf:  "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf:    "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/test_negative_control/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf:  "../results/test_negative_control/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf:    "../results/test_negative_control/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 

From 7125ed7c7edc1b85daa0178c6a33583eea83dc0b Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 15:00:16 +0100
Subject: [PATCH 10/12] Add sim_mutation_rate to simulations

---
 .github/workflows/tests.yml                   |   3 +
 Makefile                                      | 123 +++++++
 .../{ => old}/colon_cancer_cell_lines.yaml    |   0
 .../configs/{ => old}/real_data_chromium.yaml |   0
 workflow/configs/{ => old}/simulation.yaml    |   0
 workflow/configs/simulation_chromium.yaml     |   1 +
 .../simulation_chromium_noise_0pct.yaml       |  59 ++++
 .../simulation_chromium_noise_10pct.yaml      |  59 ++++
 .../simulation_chromium_noise_1pct.yaml       |  59 ++++
 .../simulation_chromium_noise_5pct.yaml       |  59 ++++
 workflow/configs/simulation_smartseq2.yaml    |   1 +
 .../simulation_smartseq2_noise_0pct.yaml      |  57 ++++
 .../simulation_smartseq2_noise_10pct.yaml     |  57 ++++
 .../simulation_smartseq2_noise_1pct.yaml      |  57 ++++
 .../simulation_smartseq2_noise_5pct.yaml      |  57 ++++
 workflow/modules/evaluation.snmk              |   6 +-
 workflow/modules/simulations.snmk             |   8 +-
 workflow/scripts/evaluate.py                  |   5 +
 workflow/scripts/noise_sweep_report.Rmd       | 303 ++++++++++++++++++
 workflow/scripts/simulate_reads.py            |  18 +-
 20 files changed, 921 insertions(+), 11 deletions(-)
 create mode 100644 Makefile
 rename workflow/configs/{ => old}/colon_cancer_cell_lines.yaml (100%)
 rename workflow/configs/{ => old}/real_data_chromium.yaml (100%)
 rename workflow/configs/{ => old}/simulation.yaml (100%)
 create mode 100644 workflow/configs/simulation_chromium_noise_0pct.yaml
 create mode 100644 workflow/configs/simulation_chromium_noise_10pct.yaml
 create mode 100644 workflow/configs/simulation_chromium_noise_1pct.yaml
 create mode 100644 workflow/configs/simulation_chromium_noise_5pct.yaml
 create mode 100644 workflow/configs/simulation_smartseq2_noise_0pct.yaml
 create mode 100644 workflow/configs/simulation_smartseq2_noise_10pct.yaml
 create mode 100644 workflow/configs/simulation_smartseq2_noise_1pct.yaml
 create mode 100644 workflow/configs/simulation_smartseq2_noise_5pct.yaml
 create mode 100644 workflow/scripts/noise_sweep_report.Rmd

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f4042db..2bf6715 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,6 +49,9 @@ jobs:
             -c conda-forge
             python=3.11
             snakemake>=8
+            pytest
+            pytest-cov
+            scipy
           init-shell: bash
 
       - name: Dry-run main Snakefile with SmartSeq2 config
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5af3948
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,123 @@
+SHELL   := /bin/bash
+CORES   ?= 10
+WF      := workflow
+RESULTS := results
+
+CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake &&
+SM        := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES)
+RSCRIPT   := cd $(WF) && $(CONDA_RUN) Rscript
+
+# noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param)
+NOISE_EDIRS_SS2 := ../$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation
+NOISE_EDIRS_CHR := ../$(RESULTS)/simulation_chromium_noise_0pct/evaluation,../$(RESULTS)/simulation_chromium_noise_1pct/evaluation,../$(RESULTS)/simulation_chromium_noise_5pct/evaluation,../$(RESULTS)/simulation_chromium_noise_10pct/evaluation
+
+# output HTML files (relative to project root)
+NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html
+NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html
+
+.PHONY: all \
+        simulation_smartseq2 simulation_chromium \
+        noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct \
+        noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct \
+        report_noise_smartseq2 report_noise_chromium reports_noise \
+        help
+
+# -----------------------------------------------------------------------
+# base simulations
+# -----------------------------------------------------------------------
+
+simulation_smartseq2:
+	$(SM) --configfile configs/simulation_smartseq2.yaml
+
+simulation_chromium:
+	$(SM) --configfile configs/simulation_chromium.yaml
+
+# -----------------------------------------------------------------------
+# noise sweep — SmartSeq2
+# note: genome refs are reused from the base simulation_smartseq2 run;
+#       alignment indices are shared via results/shared/ (never rebuilt)
+# -----------------------------------------------------------------------
+
+noise_smartseq2_0pct:
+	$(SM) --configfile configs/simulation_smartseq2_noise_0pct.yaml
+
+noise_smartseq2_1pct:
+	$(SM) --configfile configs/simulation_smartseq2_noise_1pct.yaml
+
+noise_smartseq2_5pct:
+	$(SM) --configfile configs/simulation_smartseq2_noise_5pct.yaml
+
+noise_smartseq2_10pct:
+	$(SM) --configfile configs/simulation_smartseq2_noise_10pct.yaml
+
+noise_smartseq2: noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct
+
+# -----------------------------------------------------------------------
+# noise sweep — Chromium
+# -----------------------------------------------------------------------
+
+noise_chromium_0pct:
+	$(SM) --configfile configs/simulation_chromium_noise_0pct.yaml
+
+noise_chromium_1pct:
+	$(SM) --configfile configs/simulation_chromium_noise_1pct.yaml
+
+noise_chromium_5pct:
+	$(SM) --configfile configs/simulation_chromium_noise_5pct.yaml
+
+noise_chromium_10pct:
+	$(SM) --configfile configs/simulation_chromium_noise_10pct.yaml
+
+noise_chromium: noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct
+
+# -----------------------------------------------------------------------
+# noise sweep reports
+# file targets: only (re)rendered when the HTML does not yet exist;
+# use  make -B report_noise_smartseq2  to force a re-render
+# -----------------------------------------------------------------------
+
+$(NOISE_REPORT_SS2):
+	mkdir -p $(RESULTS)
+	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \
+	    output_file = '../$(NOISE_REPORT_SS2)', \
+	    params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))"
+
+$(NOISE_REPORT_CHR):
+	mkdir -p $(RESULTS)
+	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \
+	    output_file = '../$(NOISE_REPORT_CHR)', \
+	    params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))"
+
+report_noise_smartseq2: $(NOISE_REPORT_SS2)
+report_noise_chromium:  $(NOISE_REPORT_CHR)
+reports_noise: report_noise_smartseq2 report_noise_chromium
+
+# -----------------------------------------------------------------------
+# all
+# -----------------------------------------------------------------------
+
+all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise
+
+# -----------------------------------------------------------------------
+# help
+# -----------------------------------------------------------------------
+
+help:
+	@echo "Usage: make [target] [CORES=N]   (default CORES=$(CORES))"
+	@echo ""
+	@echo "Base simulation runs:"
+	@echo "  simulation_smartseq2             SmartSeq2 base run (mutation_rate 0.1%)"
+	@echo "  simulation_chromium              Chromium base run  (mutation_rate 0.1%)"
+	@echo ""
+	@echo "Noise sweep runs (base simulation must have been run first):"
+	@echo "  noise_smartseq2                  all 4 noise levels for SmartSeq2"
+	@echo "  noise_smartseq2_{0,1,5,10}pct   individual SmartSeq2 noise level"
+	@echo "  noise_chromium                   all 4 noise levels for Chromium"
+	@echo "  noise_chromium_{0,1,5,10}pct    individual Chromium noise level"
+	@echo ""
+	@echo "Reports (file targets; use -B to force re-render):"
+	@echo "  report_noise_smartseq2           $(NOISE_REPORT_SS2)"
+	@echo "  report_noise_chromium            $(NOISE_REPORT_CHR)"
+	@echo "  reports_noise                    both noise sweep reports"
+	@echo ""
+	@echo "  all                              run everything in sequence"
diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/old/colon_cancer_cell_lines.yaml
similarity index 100%
rename from workflow/configs/colon_cancer_cell_lines.yaml
rename to workflow/configs/old/colon_cancer_cell_lines.yaml
diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/old/real_data_chromium.yaml
similarity index 100%
rename from workflow/configs/real_data_chromium.yaml
rename to workflow/configs/old/real_data_chromium.yaml
diff --git a/workflow/configs/simulation.yaml b/workflow/configs/old/simulation.yaml
similarity index 100%
rename from workflow/configs/simulation.yaml
rename to workflow/configs/old/simulation.yaml
diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml
index 3a49b4f..04ae340 100644
--- a/workflow/configs/simulation_chromium.yaml
+++ b/workflow/configs/simulation_chromium.yaml
@@ -22,6 +22,7 @@ simulation:
   cb_length: 16
   umi_length: 12
   seed: 42
+  mutation_rate: 0.001
 
 feature_sets:
   - repeats
diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml
new file mode 100644
index 0000000..1aea43a
--- /dev/null
+++ b/workflow/configs/simulation_chromium_noise_0pct.yaml
@@ -0,0 +1,59 @@
+base: "../results/simulation_chromium_noise_0pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: chromium
+  n_cells: 50
+  n_expressed_per_cell: 80
+  read_length: 90
+  cb_length: 16
+  umi_length: 12
+  seed: 42
+  mutation_rate: 0.0
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: ""
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 50
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml
new file mode 100644
index 0000000..b8a3b0a
--- /dev/null
+++ b/workflow/configs/simulation_chromium_noise_10pct.yaml
@@ -0,0 +1,59 @@
+base: "../results/simulation_chromium_noise_10pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: chromium
+  n_cells: 50
+  n_expressed_per_cell: 80
+  read_length: 90
+  cb_length: 16
+  umi_length: 12
+  seed: 42
+  mutation_rate: 0.10
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: ""
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 50
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml
new file mode 100644
index 0000000..2d52c8a
--- /dev/null
+++ b/workflow/configs/simulation_chromium_noise_1pct.yaml
@@ -0,0 +1,59 @@
+base: "../results/simulation_chromium_noise_1pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: chromium
+  n_cells: 50
+  n_expressed_per_cell: 80
+  read_length: 90
+  cb_length: 16
+  umi_length: 12
+  seed: 42
+  mutation_rate: 0.01
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: ""
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 50
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml
new file mode 100644
index 0000000..228f030
--- /dev/null
+++ b/workflow/configs/simulation_chromium_noise_5pct.yaml
@@ -0,0 +1,59 @@
+base: "../results/simulation_chromium_noise_5pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: chromium
+  n_cells: 50
+  n_expressed_per_cell: 80
+  read_length: 90
+  cb_length: 16
+  umi_length: 12
+  seed: 42
+  mutation_rate: 0.05
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: ""
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 50
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml
index 8d54f10..54beaee 100644
--- a/workflow/configs/simulation_smartseq2.yaml
+++ b/workflow/configs/simulation_smartseq2.yaml
@@ -19,6 +19,7 @@ simulation:
   n_expressed_per_cell: 100
   read_length: 90
   seed: 42
+  mutation_rate: 0.001
 
 # Feature sets to quantify against:
 #   genes              - Ensembl gene annotation
diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
new file mode 100644
index 0000000..ba95edb
--- /dev/null
+++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
@@ -0,0 +1,57 @@
+base: "../results/simulation_smartseq2_noise_0pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: smartseq2
+  n_cells: 20
+  n_expressed_per_cell: 100
+  read_length: 90
+  seed: 42
+  mutation_rate: 0.0
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: "--minAssignedFrags 1"
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 20
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
new file mode 100644
index 0000000..b7d9d5c
--- /dev/null
+++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
@@ -0,0 +1,57 @@
+base: "../results/simulation_smartseq2_noise_10pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: smartseq2
+  n_cells: 20
+  n_expressed_per_cell: 100
+  read_length: 90
+  seed: 42
+  mutation_rate: 0.10
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: "--minAssignedFrags 1"
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 20
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
new file mode 100644
index 0000000..f38d904
--- /dev/null
+++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
@@ -0,0 +1,57 @@
+base: "../results/simulation_smartseq2_noise_1pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: smartseq2
+  n_cells: 20
+  n_expressed_per_cell: 100
+  read_length: 90
+  seed: 42
+  mutation_rate: 0.01
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: "--minAssignedFrags 1"
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 20
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
new file mode 100644
index 0000000..ef69956
--- /dev/null
+++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
@@ -0,0 +1,57 @@
+base: "../results/simulation_smartseq2_noise_5pct"
+indices_base: "../results/shared"
+
+reference:
+  assembly: hg38
+  ensembl_release: "112"
+  chromosomes: ["chr10"]
+  filter_genic: true
+  rmsk_source: ucsc_flatfile
+  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+
+mode: simulation
+
+simulation:
+  technology: smartseq2
+  n_cells: 20
+  n_expressed_per_cell: 100
+  read_length: 90
+  seed: 42
+  mutation_rate: 0.05
+
+feature_sets:
+  - repeats
+  - genes
+  - genic_repeats
+  - intergenic_repeats
+
+aligners:
+  - starsolo
+  - kallisto
+  - alevin
+  - bowtie2
+
+aligner_params:
+  starsolo:
+    multimapper_modes: ["unique"]
+    extra_args: ""
+  kallisto:
+    extra_args: ""
+  alevin:
+    extra_args: "--minAssignedFrags 1"
+  bowtie2:
+    multimapper_modes: ["unique", "multi"]
+    extra_args: ""
+    featurecounts_cell_chunk_size: 20
+
+granularities:
+  - locus
+  - gene_id
+  - family_id
+  - class_id
+
+resources:
+  max_threads: 10
+  max_mem_mb: 20000
diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk
index a83c559..c0d78b4 100644
--- a/workflow/modules/evaluation.snmk
+++ b/workflow/modules/evaluation.snmk
@@ -21,6 +21,7 @@ if sim_technology == 'smartseq2' and 'multi' in starsolo_modes:
 bowtie2_modes = config.get('aligner_params', {}).get('bowtie2', {}).get(
     'multimapper_modes', ['unique'])
 granularities = config.get('granularities', ['family_id'])
+sim_mutation_rate = config.get('simulation', {}).get('mutation_rate', 0.0)
 evaluation_script = op.join(workflow.basedir, 'scripts', 'evaluate.py')
 counts_dir = op.join(config['base'], 'counts')
 eval_dir = op.join(config['base'], 'evaluation')
@@ -81,6 +82,7 @@ rule evaluate_combination:
         granularity = 'locus|gene_id|family_id|class_id',
         multimapper_mode = 'unique|multi'
     params:
+        mutation_rate = sim_mutation_rate,
         benchmark_file = eval_benchmark,
         output_prefix = op.join(eval_dir,
             '{aligner}_{feature_set}_{granularity}_{multimapper_mode}')
@@ -98,7 +100,8 @@ rule evaluate_combination:
             --feature-set {wildcards.feature_set} \
             --locus-map {input.locus_map} \
             --benchmark {params.benchmark_file} \
-            --output-prefix {params.output_prefix} 2> {log}
+            --output-prefix {params.output_prefix} \
+            --mutation-rate {params.mutation_rate} 2> {log}
         """
 rule aggregate_global_metrics:
     """Concatenate all per-aligner global metric TSVs into one summary file."""
@@ -176,6 +179,7 @@ rule render_report:
     output:
         report = op.join(eval_dir, 'evaluation_report.html')
     params:
+        mutation_rate = sim_mutation_rate,
         eval_dir = eval_dir,
         rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd')
     log:
diff --git a/workflow/modules/simulations.snmk b/workflow/modules/simulations.snmk
index 443e291..1c231c1 100644
--- a/workflow/modules/simulations.snmk
+++ b/workflow/modules/simulations.snmk
@@ -38,6 +38,7 @@ rule simulate_smartseq2:
         n_expressed = sim_cfg.get('n_expressed_per_cell', 100),
         read_length = sim_cfg.get('read_length', 90),
         seed = sim_seed,
+        mutation_rate = sim_cfg.get('mutation_rate', 0.001),
         chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '',
         max_per_chrom_arg = (
             f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}'
@@ -57,7 +58,8 @@ rule simulate_smartseq2:
             --read-length {params.read_length} \
             --seed {params.seed} \
             {params.chrom_args} \
-            {params.max_per_chrom_arg} 2> {log}
+            {params.max_per_chrom_arg} \
+            --mutation-rate {params.mutation_rate} 2> {log}
         """
 
 
@@ -80,6 +82,7 @@ rule simulate_chromium:
         cb_length = sim_cfg.get('cb_length', 16),
         umi_length = sim_cfg.get('umi_length', 12),
         seed = sim_seed,
+        mutation_rate = sim_cfg.get('mutation_rate', 0.001),
         chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '',
         max_per_chrom_arg = (
             f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}'
@@ -101,5 +104,6 @@ rule simulate_chromium:
             --umi-length {params.umi_length} \
             --seed {params.seed} \
             {params.chrom_args} \
-            {params.max_per_chrom_arg} 2> {log}
+            {params.max_per_chrom_arg} \
+            --mutation-rate {params.mutation_rate} 2> {log}
         """
diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py
index 3cc074b..8f868b6 100644
--- a/workflow/scripts/evaluate.py
+++ b/workflow/scripts/evaluate.py
@@ -228,6 +228,8 @@ def main():
                     help='4-col TSV (transcript_id, gene_id, family_id, class_id). '
                          'Expands the feature universe for specificity and restricts '
                          'ground truth to this feature_set\'s loci.')
+    ap.add_argument('--mutation-rate', type=float, default=0.0,
+                    help='Mutation rate used in the simulation (for output labelling)')
     ap.add_argument('--output-prefix', required=True)
     args = ap.parse_args()
 
@@ -277,6 +279,7 @@ def main():
     global_metrics['multimapper_mode'] = args.multimapper_mode
     global_metrics['feature_set'] = args.feature_set
     global_metrics['granularity'] = args.granularity
+    global_metrics['mutation_rate'] = args.mutation_rate
 
     bench = load_benchmark(args.benchmark)
     global_metrics.update(bench)
@@ -289,6 +292,7 @@ def main():
         row['multimapper_mode'] = args.multimapper_mode
         row['feature_set'] = args.feature_set
         row['granularity'] = args.granularity
+        row['mutation_rate'] = args.mutation_rate
     write_tsv(per_cell_rows, args.output_prefix + '_per_cell_metrics.tsv',
               fallback_fields=['cell_id', 'pearson_r', 'spearman_r',
                                'n_truth_expressed', 'n_observed_expressed',
@@ -310,6 +314,7 @@ def main():
         row['multimapper_mode'] = args.multimapper_mode
         row['feature_set'] = args.feature_set
         row['granularity'] = args.granularity
+        row['mutation_rate'] = args.mutation_rate
         per_family_rows.append(row)
     write_tsv(per_family_rows, args.output_prefix + '_per_family_metrics.tsv',
               fallback_fields=['class_id', 'aligner', 'multimapper_mode', 'feature_set',
diff --git a/workflow/scripts/noise_sweep_report.Rmd b/workflow/scripts/noise_sweep_report.Rmd
new file mode 100644
index 0000000..c1a9602
--- /dev/null
+++ b/workflow/scripts/noise_sweep_report.Rmd
@@ -0,0 +1,303 @@
+---
+title: "repeat element quantification - noise sweep report"
+author: "Izaskun Mallona"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    theme: flatly
+    toc: true
+    toc_float: true
+    code_folding: hide
+params:
+  eval_dirs: "."
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
+                      fig.width = 14, fig.height = 10)
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(scales)
+library(patchwork)
+
+make_palette <- function(n) {
+  if (n <= 8)  return(RColorBrewer::brewer.pal(max(3, n), "Dark2")[seq_len(n)])
+  colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(n)
+}
+
+eval_dirs <- strsplit(params$eval_dirs, ",")[[1]]
+eval_dirs <- trimws(eval_dirs)
+```
+
+```{r load-data}
+load_metrics_from_dirs <- function(dirs, filename) {
+  purrr::map_dfr(dirs, function(d) {
+    p <- file.path(d, filename)
+    if (!file.exists(p)) return(NULL)
+    read_tsv(p, show_col_types = FALSE)
+  })
+}
+
+load_pattern_from_dirs <- function(dirs, pattern) {
+  purrr::map_dfr(dirs, function(d) {
+    files <- list.files(d, pattern = pattern, full.names = TRUE)
+    if (length(files) == 0) return(NULL)
+    purrr::map_dfr(files, ~ read_tsv(.x, show_col_types = FALSE))
+  })
+}
+
+global <- load_metrics_from_dirs(eval_dirs, "summary_global_metrics.tsv")
+
+num_cols <- c("pearson_r", "spearman_r", "log1p_rmse",
+              "precision", "recall", "f1", "jaccard", "specificity",
+              "n_cells", "n_features", "mutation_rate",
+              "wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb")
+for (col in num_cols) {
+  if (col %in% names(global))
+    global[[col]] <- suppressWarnings(as.numeric(global[[col]]))
+}
+
+if (!"granularity" %in% names(global))   global$granularity   <- "gene_id"
+if (!"feature_set" %in% names(global))   global$feature_set   <- "repeats"
+if (!"multimapper_mode" %in% names(global)) global$multimapper_mode <- "unique"
+if (!"mutation_rate" %in% names(global)) global$mutation_rate <- NA_real_
+
+global <- global %>%
+  mutate(
+    aligner_mode  = paste0(aligner, " (", multimapper_mode, ")"),
+    gran_label    = factor(granularity,
+                           levels = c("locus", "gene_id", "family_id", "class_id")),
+    noise_label   = paste0(mutation_rate * 100, "%")
+  )
+
+per_cell   <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_cell_metrics\\.tsv$"),
+                       error = function(e) NULL)
+per_family <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_family_metrics\\.tsv$"),
+                       error = function(e) NULL)
+
+for (col in c("pearson_r", "spearman_r", "mutation_rate")) {
+  if (!is.null(per_cell) && col %in% names(per_cell))
+    per_cell[[col]] <- suppressWarnings(as.numeric(per_cell[[col]]))
+  if (!is.null(per_family) && col %in% names(per_family))
+    per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]]))
+}
+
+if (!is.null(per_family) && nrow(per_family) > 0) {
+  for (col in c("f1", "precision", "recall", "jaccard", "specificity")) {
+    if (col %in% names(per_family))
+      per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]]))
+  }
+  if (!"multimapper_mode" %in% names(per_family)) per_family$multimapper_mode <- "unique"
+  per_family <- per_family %>%
+    mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")"))
+}
+```
+
+---
+
+This report combines evaluation results across multiple noise levels (simulated nucleotide
+substitution rates). Each `eval_dirs` entry corresponds to one pipeline run with a distinct
+`mutation_rate` in the simulation config. The `mutation_rate` column in the metrics TSVs
+identifies each run.
+
+Noise levels found in the data: `r sort(unique(global$mutation_rate))`
+
+---
+
+## metric degradation by noise level
+
+How each aligner's accuracy changes as the per-base mutation rate increases.
+x-axis = mutation rate, y-axis = metric value, colour = aligner (+ multimapper mode).
+
+```{r degradation-main, fig.height=20, fig.width=14}
+acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall",
+                 "jaccard", "specificity")
+
+deg <- global %>%
+  filter(!is.na(mutation_rate)) %>%
+  select(aligner_mode, feature_set, gran_label, mutation_rate, all_of(acc_metrics)) %>%
+  pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>%
+  filter(!is.na(value)) %>%
+  mutate(metric = factor(metric, levels = acc_metrics))
+
+n_al <- length(unique(deg$aligner_mode))
+pal  <- make_palette(n_al)
+
+ggplot(deg, aes(x = mutation_rate, y = value,
+                colour = aligner_mode, group = aligner_mode)) +
+  geom_point(size = 2.5, alpha = 0.85) +
+  geom_line(linewidth = 0.6) +
+  facet_grid(metric ~ feature_set + gran_label, scales = "free_y") +
+  scale_colour_manual(values = pal, name = "aligner (mode)") +
+  scale_x_continuous(labels = percent_format(accuracy = 0.1)) +
+  labs(x = "mutation rate", y = NULL,
+       title = "metric degradation with increasing noise",
+       subtitle = "rows = metric; columns = feature_set x granularity; x = mutation rate") +
+  theme_bw(base_size = 9) +
+  theme(axis.text.x  = element_text(angle = 40, hjust = 1, size = 7),
+        strip.text   = element_text(size = 7),
+        legend.position = "bottom",
+        legend.text  = element_text(size = 8))
+```
+
+## pearson r and F1 — noise overview
+
+Summary view focusing on the two most interpretable metrics.
+
+```{r overview-two-metrics, fig.height=10, fig.width=14}
+ov <- global %>%
+  filter(!is.na(mutation_rate)) %>%
+  select(aligner_mode, feature_set, gran_label, mutation_rate, pearson_r, f1) %>%
+  pivot_longer(c(pearson_r, f1), names_to = "metric", values_to = "value") %>%
+  filter(!is.na(value))
+
+n_al2 <- length(unique(ov$aligner_mode))
+pal2  <- make_palette(n_al2)
+
+ggplot(ov, aes(x = mutation_rate, y = value,
+               colour = aligner_mode, group = aligner_mode)) +
+  geom_point(size = 3, alpha = 0.85) +
+  geom_line(linewidth = 0.8) +
+  facet_grid(metric ~ feature_set, scales = "free_y") +
+  scale_colour_manual(values = pal2, name = "aligner (mode)") +
+  scale_x_continuous(labels = percent_format(accuracy = 0.1)) +
+  ylim(0, 1) +
+  labs(x = "mutation rate", y = NULL,
+       title = "Pearson r and F1 across noise levels",
+       subtitle = "rows = metric, columns = feature_set; lines connect same aligner across mutation rates") +
+  theme_bw(base_size = 11) +
+  theme(axis.text.x = element_text(angle = 25, hjust = 1),
+        legend.position = "bottom")
+```
+
+## global accuracy metrics per noise level
+
+Full metric grid from the standard report, faceted by mutation rate.
+
+```{r global-faceted, fig.height=22, fig.width=16}
+full_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall",
+                  "jaccard", "specificity", "log1p_rmse")
+
+full_long <- global %>%
+  filter(!is.na(mutation_rate)) %>%
+  select(aligner_mode, feature_set, gran_label, mutation_rate, noise_label,
+         all_of(full_metrics)) %>%
+  pivot_longer(all_of(full_metrics), names_to = "metric", values_to = "value") %>%
+  filter(!is.na(value)) %>%
+  mutate(metric = factor(metric, levels = full_metrics))
+
+n_al3 <- length(unique(full_long$aligner_mode))
+pal3  <- make_palette(n_al3)
+
+ggplot(full_long, aes(x = gran_label, y = value,
+                      colour = aligner_mode, group = aligner_mode)) +
+  geom_point(size = 2, alpha = 0.8) +
+  geom_line(linetype = "dashed", linewidth = 0.5) +
+  facet_grid(metric ~ noise_label + feature_set, scales = "free_y") +
+  scale_colour_manual(values = pal3, name = "aligner (mode)") +
+  labs(x = "granularity", y = NULL,
+       title = "global accuracy metrics",
+       subtitle = "rows = metric; columns = mutation rate x feature_set; x = granularity") +
+  theme_bw(base_size = 8) +
+  theme(aspect.ratio = 1,
+        axis.text.x  = element_text(angle = 40, hjust = 1, size = 7),
+        strip.text   = element_text(size = 6),
+        legend.position = "bottom",
+        legend.text  = element_text(size = 7))
+```
+
+## per-cell correlations by noise level
+
+```{r per-cell-noise, fig.height=10, fig.width=14}
+if (!is.null(per_cell) && nrow(per_cell) > 0 && "mutation_rate" %in% names(per_cell)) {
+  pc <- per_cell %>%
+    mutate(across(c(pearson_r, spearman_r), ~ suppressWarnings(as.numeric(.)))) %>%
+    filter(!is.na(aligner), !is.na(mutation_rate))
+
+  if (!"multimapper_mode" %in% names(pc)) pc$multimapper_mode <- "unique"
+  if (!"feature_set" %in% names(pc))      pc$feature_set      <- "repeats"
+
+  pc <- pc %>%
+    mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")"),
+           gran_label   = factor(granularity,
+                                 levels = c("locus", "gene_id", "family_id", "class_id")),
+           noise_label  = paste0(mutation_rate * 100, "%"))
+
+  pc_long <- pc %>%
+    pivot_longer(c(pearson_r, spearman_r), names_to = "metric", values_to = "r") %>%
+    filter(!is.na(r))
+
+  n_al4 <- length(unique(pc_long$aligner_mode))
+  pal4  <- make_palette(n_al4)
+
+  ggplot(pc_long, aes(x = aligner_mode, y = r, fill = aligner_mode)) +
+    geom_violin(alpha = 0.7, draw_quantiles = c(0.25, 0.5, 0.75)) +
+    geom_jitter(width = 0.1, size = 0.5, alpha = 0.4) +
+    facet_grid(metric + gran_label ~ noise_label + feature_set) +
+    scale_fill_manual(values = pal4) +
+    labs(x = NULL, y = "correlation",
+         title = "per-cell correlations by noise level",
+         subtitle = "rows = metric x granularity; columns = mutation rate x feature_set") +
+    theme_bw(base_size = 8) +
+    theme(axis.text.x  = element_text(angle = 45, hjust = 1, size = 6),
+          legend.position = "none",
+          strip.text   = element_text(size = 6))
+} else {
+  cat("No per-cell metrics with mutation_rate column found.\n")
+}
+```
+
+## per-repeat-class F1 by noise level
+
+```{r per-family-noise, fig.height=14, fig.width=14}
+if (!is.null(per_family) && nrow(per_family) > 0 &&
+    "class_id" %in% names(per_family) && "mutation_rate" %in% names(per_family)) {
+
+  pf <- per_family %>%
+    filter(!is.na(class_id), class_id != "unknown", !is.na(mutation_rate)) %>%
+    mutate(gran_label  = factor(granularity,
+                                levels = c("locus", "gene_id", "family_id", "class_id")),
+           noise_label = paste0(mutation_rate * 100, "%"))
+
+  n_al5 <- length(unique(pf$aligner_mode))
+  pal5  <- make_palette(n_al5)
+
+  p_f1 <- ggplot(pf, aes(x = class_id, y = f1,
+                          colour = aligner_mode, group = aligner_mode)) +
+    geom_line(linewidth = 0.4, linetype = "dashed") +
+    geom_point(size = 1.5, alpha = 0.85) +
+    facet_grid(gran_label ~ noise_label + feature_set) +
+    scale_y_sqrt() +
+    scale_colour_manual(values = pal5, name = "Aligner (mode)") +
+    labs(x = "repeat class", y = "F1",
+         title = "F1 per repeat class by noise level",
+         subtitle = "rows = granularity; columns = mutation rate x feature_set") +
+    theme_bw(base_size = 8) +
+    theme(axis.text.x  = element_text(angle = 40, hjust = 1, size = 7),
+          legend.position = "bottom",
+          legend.text  = element_text(size = 7),
+          strip.text   = element_text(size = 6))
+
+  p_r <- ggplot(pf, aes(x = class_id, y = pearson_r,
+                         colour = aligner_mode, group = aligner_mode)) +
+    geom_line(linewidth = 0.4, linetype = "dashed") +
+    geom_point(size = 1.5, alpha = 0.85) +
+    facet_grid(gran_label ~ noise_label + feature_set) +
+    scale_y_sqrt() +
+    scale_colour_manual(values = pal5, name = "Aligner (mode)") +
+    labs(x = "repeat class", y = "pearson r") +
+    theme_bw(base_size = 8) +
+    theme(axis.text.x  = element_text(angle = 40, hjust = 1, size = 7),
+          legend.position = "none",
+          strip.text   = element_text(size = 6))
+
+  p_f1 / p_r
+} else {
+  cat("No per-family metrics with class_id and mutation_rate found.\n")
+}
+```
+
+---
+*Generated by [noise_sweep_report.Rmd](../workflow/scripts/noise_sweep_report.Rmd)*
diff --git a/workflow/scripts/simulate_reads.py b/workflow/scripts/simulate_reads.py
index d2b8d55..b322655 100644
--- a/workflow/scripts/simulate_reads.py
+++ b/workflow/scripts/simulate_reads.py
@@ -111,14 +111,14 @@ def reverse_complement(seq):
     return seq.translate(table)[::-1]
 
 
-def sample_subseq(seq, read_length, rng):
+def sample_subseq(seq, read_length, rng, mutation_rate=0.001):
     if len(seq) <= read_length:
         return (seq + 'N' * read_length)[:read_length]
     offset = rng.randint(0, len(seq) - read_length)
     read = seq[offset:offset + read_length]
     bases = list(read)
     for i in range(len(bases)):
-        if rng.random() < 0.001:
+        if rng.random() < mutation_rate:
             bases[i] = rng.choice('ACGT')
     return ''.join(bases)
 
@@ -178,7 +178,7 @@ def build_chrom_locus_coords(cell_plan):
     return chrom_loci
 
 
-def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng):
+def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng, mutation_rate=0.001):
     os.makedirs(outdir, exist_ok=True)
     locus_to_cells = build_locus_to_cells(cell_plan)
     chrom_locus_coords = build_chrom_locus_coords(cell_plan)
@@ -200,7 +200,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng):
                     fq = cell_handles[cell_id]
                     read_idx = cell_read_counts[cell_id]
                     for i in range(count):
-                        read = sample_subseq(repeat_seq, read_length, rng)
+                        read = sample_subseq(repeat_seq, read_length, rng, mutation_rate)
                         fq.write(f'@{cell_id}_r{read_idx + i}_{safe_id(locus_id)}\n'
                                  f'{read}\n+\n{make_qual(len(read))}\n')
                     cell_read_counts[cell_id] += count
@@ -215,7 +215,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng):
     return cell_fastq_paths, ground_truth
 
 
-def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng):
+def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng, mutation_rate=0.001):
     os.makedirs(outdir, exist_ok=True)
     locus_to_cells = build_locus_to_cells(cell_plan)
     chrom_locus_coords = build_chrom_locus_coords(cell_plan)
@@ -263,7 +263,7 @@ def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_le
                         used_umis.add(umi)
                         read_id = f'r{total_reads}_{barcode[:8]}_{umi}_{safe_id(locus_id)}'
                         r1_seq = barcode + umi
-                        r2_seq = sample_subseq(repeat_seq, read_length, rng)
+                        r2_seq = sample_subseq(repeat_seq, read_length, rng, mutation_rate)
                         r1f.write(f'@{read_id}\n{r1_seq}\n+\n{make_qual(len(r1_seq))}\n')
                         r2f.write(f'@{read_id}\n{r2_seq}\n+\n{make_qual(len(r2_seq))}\n')
                         total_reads += 1
@@ -323,6 +323,8 @@ def main():
     ap.add_argument('--max-repeats-per-chrom', type=int, default=None,
                     help='Cap intervals per chromosome to limit memory use on full genomes')
     ap.add_argument('--seed', type=int, default=42)
+    ap.add_argument('--mutation-rate', type=float, default=0.001,
+                    help='Per-base substitution probability for simulated reads (default 0.001)')
     args = ap.parse_args()
 
     rng = random.Random(args.seed)
@@ -353,7 +355,7 @@ def main():
     if args.mode == 'smartseq2':
         print(f'Simulating SmartSeq2 reads (streaming FASTA by chrom)', file=sys.stderr)
         cell_fastq_paths, ground_truth = simulate_smartseq2(
-            args.fasta, cell_plan, args.read_length, args.outdir, rng)
+            args.fasta, cell_plan, args.read_length, args.outdir, rng, mutation_rate=args.mutation_rate)
         write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_id')
         manifest_path = os.path.join(args.outdir, 'manifest.tsv')
         write_smartseq2_manifest(cell_fastq_paths, manifest_path)
@@ -363,7 +365,7 @@ def main():
         print(f'Simulating Chromium reads (streaming FASTA by chrom)', file=sys.stderr)
         (r1, r2), ground_truth = simulate_chromium(
             args.fasta, cell_plan, args.read_length,
-            args.cb_length, args.umi_length, args.outdir, rng)
+            args.cb_length, args.umi_length, args.outdir, rng, mutation_rate=args.mutation_rate)
         write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_barcode')
         print(f'R1: {r1}', file=sys.stderr)
         print(f'R2: {r2}', file=sys.stderr)

From 6c6edcc0e615625f634663bdaa76c14ca22b46c3 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 16:16:07 +0100
Subject: [PATCH 11/12] Cache genome/indices across noise runs; make all
 reports by default

---
 Makefile                                          | 15 ++++++++-------
 workflow/configs/simulation_chromium.yaml         |  6 +++---
 .../configs/simulation_chromium_noise_0pct.yaml   |  6 +++---
 .../configs/simulation_chromium_noise_10pct.yaml  |  6 +++---
 .../configs/simulation_chromium_noise_1pct.yaml   |  6 +++---
 .../configs/simulation_chromium_noise_5pct.yaml   |  6 +++---
 workflow/configs/simulation_smartseq2.yaml        |  6 +++---
 .../configs/simulation_smartseq2_noise_0pct.yaml  |  6 +++---
 .../configs/simulation_smartseq2_noise_10pct.yaml |  6 +++---
 .../configs/simulation_smartseq2_noise_1pct.yaml  |  6 +++---
 .../configs/simulation_smartseq2_noise_5pct.yaml  |  6 +++---
 workflow/modules/download_references.snmk         |  2 +-
 workflow/modules/evaluation.snmk                  |  6 ++++--
 workflow/modules/reference.snmk                   |  8 ++++----
 14 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/Makefile b/Makefile
index 5af3948..cc9a316 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,16 @@
 SHELL   := /bin/bash
+.DEFAULT_GOAL := all
 CORES   ?= 10
 WF      := workflow
 RESULTS := results
 
 CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake &&
-SM        := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES)
+SM        := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime
 RSCRIPT   := cd $(WF) && $(CONDA_RUN) Rscript
 
 # noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param)
-NOISE_EDIRS_SS2 := ../$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation
-NOISE_EDIRS_CHR := ../$(RESULTS)/simulation_chromium_noise_0pct/evaluation,../$(RESULTS)/simulation_chromium_noise_1pct/evaluation,../$(RESULTS)/simulation_chromium_noise_5pct/evaluation,../$(RESULTS)/simulation_chromium_noise_10pct/evaluation
+NOISE_EDIRS_SS2 := $(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation
+NOISE_EDIRS_CHR := $(CURDIR)/$(RESULTS)/simulation_chromium_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_10pct/evaluation
 
 # output HTML files (relative to project root)
 NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html
@@ -78,14 +79,14 @@ noise_chromium: noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct nois
 
 $(NOISE_REPORT_SS2):
 	mkdir -p $(RESULTS)
-	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \
-	    output_file = '../$(NOISE_REPORT_SS2)', \
+	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \
+	    output_file = '$(CURDIR)/$(NOISE_REPORT_SS2)', \
 	    params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))"
 
 $(NOISE_REPORT_CHR):
 	mkdir -p $(RESULTS)
-	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \
-	    output_file = '../$(NOISE_REPORT_CHR)', \
+	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \
+	    output_file = '$(CURDIR)/$(NOISE_REPORT_CHR)', \
 	    params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))"
 
 report_noise_smartseq2: $(NOISE_REPORT_SS2)
diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml
index 04ae340..2c36df7 100644
--- a/workflow/configs/simulation_chromium.yaml
+++ b/workflow/configs/simulation_chromium.yaml
@@ -8,9 +8,9 @@ reference:
   # Split repeats into genic/intergenic subsets using gene body overlap (bedtools)
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml
index 1aea43a..7fd2909 100644
--- a/workflow/configs/simulation_chromium_noise_0pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_0pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml
index b8a3b0a..2f359ec 100644
--- a/workflow/configs/simulation_chromium_noise_10pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_10pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml
index 2d52c8a..da11b20 100644
--- a/workflow/configs/simulation_chromium_noise_1pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_1pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml
index 228f030..3f63a64 100644
--- a/workflow/configs/simulation_chromium_noise_5pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_5pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml
index 54beaee..4d3b4d7 100644
--- a/workflow/configs/simulation_smartseq2.yaml
+++ b/workflow/configs/simulation_smartseq2.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
index ba95edb..2480849 100644
--- a/workflow/configs/simulation_smartseq2_noise_0pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
index b7d9d5c..ee5983b 100644
--- a/workflow/configs/simulation_smartseq2_noise_10pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
index f38d904..fee65d1 100644
--- a/workflow/configs/simulation_smartseq2_noise_1pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
index ef69956..8b9ff40 100644
--- a/workflow/configs/simulation_smartseq2_noise_5pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
@@ -7,9 +7,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/modules/download_references.snmk b/workflow/modules/download_references.snmk
index 58b6533..0fe3a19 100644
--- a/workflow/modules/download_references.snmk
+++ b/workflow/modules/download_references.snmk
@@ -69,7 +69,7 @@ genome_fasta_url = urls_cfg.get('genome_fasta', default_genome_fasta_url)
 genes_gtf_url = urls_cfg.get('genes_gtf', default_genes_gtf_url)
 rmsk_flatfile_url = urls_cfg.get('rmsk_flatfile', default_rmsk_flatfile_url)
 
-refs_dir = op.join(config['base'], 'refs')
+refs_dir = op.join(config.get('indices_base', config['base']), 'refs')
 build_rmsk_script = op.join(workflow.basedir, 'scripts', 'build_rmsk_gtf.py')
 
 
diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk
index c0d78b4..9ae34bf 100644
--- a/workflow/modules/evaluation.snmk
+++ b/workflow/modules/evaluation.snmk
@@ -180,7 +180,8 @@ rule render_report:
         report = op.join(eval_dir, 'evaluation_report.html')
     params:
         mutation_rate = sim_mutation_rate,
-        eval_dir = eval_dir,
+        eval_dir = op.abspath(eval_dir),
+        abs_report = op.abspath(op.join(eval_dir, 'evaluation_report.html')),
         rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd')
     log:
         op.join(config['base'], 'logs', 'render_report.log')
@@ -188,7 +189,8 @@ rule render_report:
         """
         Rscript -e "rmarkdown::render(
             '{params.rmd}',
-            output_file = '{output.report}',
+            output_file = '{params.abs_report}',
+            knit_root_dir = getwd(),
             params = list(eval_dir = '{params.eval_dir}')
         )" 2> {log}
         """
diff --git a/workflow/modules/reference.snmk b/workflow/modules/reference.snmk
index cd00f3c..0dae94a 100644
--- a/workflow/modules/reference.snmk
+++ b/workflow/modules/reference.snmk
@@ -66,7 +66,7 @@ def t2g_path(wildcards):
 rule decompress_genome:
     conda: op.join(workflow.basedir, 'envs', 'star.yaml')
     input: config['reference']['genome_fasta']
-    output: temp(op.join(config['base'], 'tmp', 'genome.fa'))
+    output: op.join(config['indices_base'], 'tmp', 'genome.fa')
     params:
         chroms = _chrom_list
     threads: workflow.cores
@@ -84,7 +84,7 @@ rule decompress_genome:
 rule decompress_repeats_gtf:
     conda: op.join(workflow.basedir, 'envs', 'star.yaml')
     input: config['reference']['repeats_gtf']
-    output: temp(op.join(config['base'], 'tmp', 'repeats.gtf'))
+    output: op.join(config['indices_base'], 'tmp', 'repeats.gtf')
     threads: workflow.cores
     shell: "pigz --decompress --keep --stdout --processes {threads} {input} | sed 's/^chr//' > {output}"
 
@@ -92,7 +92,7 @@ rule decompress_repeats_gtf:
 rule decompress_genes_gtf:
     conda: op.join(workflow.basedir, 'envs', 'star.yaml')
     input: config['reference']['genes_gtf']
-    output: temp(op.join(config['base'], 'tmp', 'genes.gtf'))
+    output: op.join(config['indices_base'], 'tmp', 'genes.gtf')
     params:
         chroms = _chrom_list
     threads: workflow.cores
@@ -196,7 +196,7 @@ rule extract_feature_set_fasta:
         gtf = gtf_for_repeat_feature_set
     output:
         fasta = op.join(config['indices_base'], 'indices', genome_tag, '{feature_set}.fa'),
-        bed = temp(op.join(config['base'], 'tmp', '{feature_set}.bed'))
+        bed = temp(op.join(config['indices_base'], 'tmp', '{feature_set}.bed'))
     wildcard_constraints:
         feature_set = 'repeats|genic_repeats|intergenic_repeats'
     log:

From 642c388d7dafc667b1db8e9b4aba0e880d329322 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 23 Mar 2026 17:07:26 +0100
Subject: [PATCH 12/12] Largely increase the amount of simulated cells

---
 .github/workflows/tests.yml                   |  3 +
 README.md                                     | 56 +++++++++++++++----
 .../configs/test_negative_control.yaml        |  6 +-
 workflow/configs/simulation_chromium.yaml     |  4 +-
 .../simulation_chromium_noise_0pct.yaml       |  4 +-
 .../simulation_chromium_noise_10pct.yaml      |  4 +-
 .../simulation_chromium_noise_1pct.yaml       |  4 +-
 .../simulation_chromium_noise_5pct.yaml       |  4 +-
 workflow/configs/simulation_smartseq2.yaml    |  4 +-
 .../simulation_smartseq2_noise_0pct.yaml      |  4 +-
 .../simulation_smartseq2_noise_10pct.yaml     |  4 +-
 .../simulation_smartseq2_noise_1pct.yaml      |  4 +-
 .../simulation_smartseq2_noise_5pct.yaml      |  4 +-
 13 files changed, 71 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2bf6715..847d3e3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -61,6 +61,7 @@ jobs:
           snakemake \
             --configfile configs/simulation_smartseq2.yaml \
             --dry-run \
+            --rerun-triggers mtime \
             --quiet --cores 2
 
       - name: Dry-run main Snakefile with Chromium config
@@ -70,6 +71,7 @@ jobs:
           snakemake \
             --configfile configs/simulation_chromium.yaml \
             --dry-run \
+            --rerun-triggers mtime \
             --quiet --cores 2
 
       - name: Dry-run test Snakefile with negative control config
@@ -80,6 +82,7 @@ jobs:
             -s ../test/workflow/Snakefile_test \
             --configfile ../test/workflow/configs/test_negative_control.yaml \
             --dry-run \
+            --rerun-triggers mtime \
             --quiet --cores 2
 
       - name: Run workflow dry-run pytest tests
diff --git a/README.md b/README.md
index 6bc5ef0..09ef24f 100644
--- a/README.md
+++ b/README.md
@@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl
 
 ## Running
 
+### Via Makefile (recommended)
+
+A `Makefile` at the project root orchestrates all configs and renders reports.
+
+```
+# Run everything (all simulations + noise sweep + reports)
+make
+
+# Individual targets
+make simulation_smartseq2      # SmartSeq2 base simulation
+make simulation_chromium       # Chromium base simulation
+make noise_smartseq2           # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate)
+make noise_chromium            # Chromium noise sweep
+make report_noise_smartseq2    # Render SmartSeq2 noise sweep HTML report
+make report_noise_chromium     # Render Chromium noise sweep HTML report
+make help                      # List all targets
+```
+
+Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda
+environment automatically.
+
+### Manually
+
 ```
 source ~/miniconda3/etc/profile.d/conda.sh
 conda activate snakemake
 cd workflow
-snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10
-snakemake --configfile configs/simulation_chromium.yaml  --use-conda --cores 10
+snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime
+snakemake --configfile configs/simulation_chromium.yaml  --use-conda --cores 10 --rerun-triggers mtime
 ```
 
-The report is written to `{base}/evaluation/evaluation_report.html` as defined in
-the config file.
+The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`.
 
 ## Configuration
 
-Two reference configs are provided under `workflow/configs/`:
+Simulation configs are under `workflow/configs/`:
+
+| File | Technology | Cells | Expressed loci/cell | Mutation rate |
+|---|---|---|---|---|
+| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% |
+| `simulation_chromium.yaml`  | 10x Chromium | 500 | 1000 | 0.1% |
+| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% |
+| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% |
+| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% |
+| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% |
+| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% |
+| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% |
+| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% |
+| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% |
 
-| File | Technology | Cells | Chr subset |
-|---|---|---|---|
-| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 |
-| `simulation_chromium.yaml`  | 10x Chromium | 50 | chr10 |
+Unused real-data configs have been moved to `workflow/configs/old/`.
 
 Key parameters:
 
-- `base`: output directory for all results.
-- `indices_base`: directory for shared aligner indices (can be shared across runs).
+- `base`: run-specific output directory.
+- `indices_base`: shared directory for aligner indices and decompressed references.
+  All noise-sweep configs share the same `indices_base` so indices are built once.
+- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads.
 - `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`).
 - `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`).
 - `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments).
diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml
index 204865f..32f068f 100644
--- a/test/workflow/configs/test_negative_control.yaml
+++ b/test/workflow/configs/test_negative_control.yaml
@@ -14,9 +14,9 @@ reference:
   chromosomes: ["chr10"]
   filter_genic: true
   rmsk_source: ucsc_flatfile
-  genome_fasta: "../results/test_negative_control/refs/GRCh38.dna.primary_assembly.fa.gz"
-  repeats_gtf:  "../results/test_negative_control/refs/hg38_rmsk_TE.gtf.gz"
-  genes_gtf:    "../results/test_negative_control/refs/GRCh38.112.genes.gtf.gz"
+  genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz"
+  repeats_gtf:  "../results/shared/refs/hg38_rmsk_TE.gtf.gz"
+  genes_gtf:    "../results/shared/refs/GRCh38.112.genes.gtf.gz"
 
 mode: simulation
 
diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml
index 2c36df7..5a08495 100644
--- a/workflow/configs/simulation_chromium.yaml
+++ b/workflow/configs/simulation_chromium.yaml
@@ -16,8 +16,8 @@ mode: simulation
 
 simulation:
   technology: chromium
-  n_cells: 50
-  n_expressed_per_cell: 80
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   cb_length: 16
   umi_length: 12
diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml
index 7fd2909..d098ee1 100644
--- a/workflow/configs/simulation_chromium_noise_0pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_0pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: chromium
-  n_cells: 50
-  n_expressed_per_cell: 80
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   cb_length: 16
   umi_length: 12
diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml
index 2f359ec..07b1052 100644
--- a/workflow/configs/simulation_chromium_noise_10pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_10pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: chromium
-  n_cells: 50
-  n_expressed_per_cell: 80
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   cb_length: 16
   umi_length: 12
diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml
index da11b20..7837c70 100644
--- a/workflow/configs/simulation_chromium_noise_1pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_1pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: chromium
-  n_cells: 50
-  n_expressed_per_cell: 80
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   cb_length: 16
   umi_length: 12
diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml
index 3f63a64..fadf484 100644
--- a/workflow/configs/simulation_chromium_noise_5pct.yaml
+++ b/workflow/configs/simulation_chromium_noise_5pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: chromium
-  n_cells: 50
-  n_expressed_per_cell: 80
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   cb_length: 16
   umi_length: 12
diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml
index 4d3b4d7..541b150 100644
--- a/workflow/configs/simulation_smartseq2.yaml
+++ b/workflow/configs/simulation_smartseq2.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: smartseq2
-  n_cells: 20
-  n_expressed_per_cell: 100
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   seed: 42
   mutation_rate: 0.001
diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
index 2480849..fcf2473 100644
--- a/workflow/configs/simulation_smartseq2_noise_0pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: smartseq2
-  n_cells: 20
-  n_expressed_per_cell: 100
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   seed: 42
   mutation_rate: 0.0
diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
index ee5983b..ee49fda 100644
--- a/workflow/configs/simulation_smartseq2_noise_10pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: smartseq2
-  n_cells: 20
-  n_expressed_per_cell: 100
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   seed: 42
   mutation_rate: 0.10
diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
index fee65d1..e19d0cc 100644
--- a/workflow/configs/simulation_smartseq2_noise_1pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: smartseq2
-  n_cells: 20
-  n_expressed_per_cell: 100
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   seed: 42
   mutation_rate: 0.01
diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
index 8b9ff40..2c7756e 100644
--- a/workflow/configs/simulation_smartseq2_noise_5pct.yaml
+++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml
@@ -15,8 +15,8 @@ mode: simulation
 
 simulation:
   technology: smartseq2
-  n_cells: 20
-  n_expressed_per_cell: 100
+  n_cells: 500
+  n_expressed_per_cell: 1000
   read_length: 90
   seed: 42
   mutation_rate: 0.05