diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..4c196c9 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,23 @@ +[run] +source = workflow/scripts + +# Scripts requiring subprocess calls to samtools/BAM files or real aligner +# output matrices cannot be meaningfully unit-tested without bioinformatics +# infrastructure. Omit them so the coverage threshold applies only to +# logic that can be exercised with synthetic data. +omit = + workflow/scripts/count_pseudo_genome.py + workflow/scripts/count_pseudo_genome_chromium.py + workflow/scripts/normalize_starsolo.py + workflow/scripts/normalize_alevin_chromium.py + workflow/scripts/normalize_alevin_smartseq2.py + workflow/scripts/normalize_kallisto_chromium.py + workflow/scripts/normalize_kallisto_smartseq2_granular.py + workflow/scripts/merge_featurecounts.py + workflow/scripts/tag_bam_chromium.py + workflow/scripts/split_gtf_chunks.py + +[report] +exclude_lines = + pragma: no cover + if __name__ == .__main__.: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..847d3e3 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,125 @@ +name: Tests + +on: + workflow_dispatch: + pull_request: + branches: [master, dev] + +jobs: + # Unit and integration tests. Needs Python and scipy + unit-and-integration: + name: Unit and integration tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Python dependencies + run: pip install pytest pytest-cov scipy + + - name: Run unit and integration tests + run: | + pytest test/unit/ test/integration/ \ + -v \ + --tb=short \ + --cov=workflow/scripts \ + --cov-report=term-missing \ + --cov-fail-under=70 + + # Snakemake dry-run. Installs snakemake via micromamba but does not run any rules. 
+ snakemake-dryrun: + name: Snakemake dry-run + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + -c bioconda + -c conda-forge + python=3.11 + snakemake>=8 + pytest + pytest-cov + scipy + init-shell: bash + + - name: Dry-run main Snakefile with SmartSeq2 config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_smartseq2.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Dry-run main Snakefile with Chromium config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_chromium.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Dry-run test Snakefile with negative control config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Run workflow dry-run pytest tests + shell: bash -el {0} + run: | + pytest test/workflow/test_snakemake_dryrun.py \ + -v --tb=short -m workflow + + # Full negative control run. Requires reference data on the runner and an env. 
+ # triggered manually + negative-control-run: + name: Negative control full run + runs-on: ubuntu-latest + if: | + github.event_name == 'workflow_dispatch' + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + -c bioconda + -c conda-forge + python=3.11 + snakemake>=8 + init-shell: bash + + - name: Run negative control workflow + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 2 \ + --conda-frontend mamba diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cc9a316 --- /dev/null +++ b/Makefile @@ -0,0 +1,124 @@ +SHELL := /bin/bash +.DEFAULT_GOAL := all +CORES ?= 10 +WF := workflow +RESULTS := results + +CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake && +SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime +RSCRIPT := cd $(WF) && $(CONDA_RUN) Rscript + +# noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param) +NOISE_EDIRS_SS2 := $(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation +NOISE_EDIRS_CHR := $(CURDIR)/$(RESULTS)/simulation_chromium_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_10pct/evaluation + +# output HTML files (relative to project root) +NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html +NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html + +.PHONY: all \ + simulation_smartseq2 
simulation_chromium \ + noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct \ + noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct \ + report_noise_smartseq2 report_noise_chromium reports_noise \ + help + +# ----------------------------------------------------------------------- +# base simulations +# ----------------------------------------------------------------------- + +simulation_smartseq2: + $(SM) --configfile configs/simulation_smartseq2.yaml + +simulation_chromium: + $(SM) --configfile configs/simulation_chromium.yaml + +# ----------------------------------------------------------------------- +# noise sweep — SmartSeq2 +# note: genome refs are reused from the base simulation_smartseq2 run; +# alignment indices are shared via results/shared/ (never rebuilt) +# ----------------------------------------------------------------------- + +noise_smartseq2_0pct: + $(SM) --configfile configs/simulation_smartseq2_noise_0pct.yaml + +noise_smartseq2_1pct: + $(SM) --configfile configs/simulation_smartseq2_noise_1pct.yaml + +noise_smartseq2_5pct: + $(SM) --configfile configs/simulation_smartseq2_noise_5pct.yaml + +noise_smartseq2_10pct: + $(SM) --configfile configs/simulation_smartseq2_noise_10pct.yaml + +noise_smartseq2: noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct + +# ----------------------------------------------------------------------- +# noise sweep — Chromium +# ----------------------------------------------------------------------- + +noise_chromium_0pct: + $(SM) --configfile configs/simulation_chromium_noise_0pct.yaml + +noise_chromium_1pct: + $(SM) --configfile configs/simulation_chromium_noise_1pct.yaml + +noise_chromium_5pct: + $(SM) --configfile configs/simulation_chromium_noise_5pct.yaml + +noise_chromium_10pct: + $(SM) --configfile configs/simulation_chromium_noise_10pct.yaml + +noise_chromium: noise_chromium_0pct 
noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct + +# ----------------------------------------------------------------------- +# noise sweep reports +# file targets: only (re)rendered when the HTML does not yet exist; +# use make -B report_noise_smartseq2 to force a re-render +# ----------------------------------------------------------------------- + +$(NOISE_REPORT_SS2): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_SS2)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))" + +$(NOISE_REPORT_CHR): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_CHR)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))" + +report_noise_smartseq2: $(NOISE_REPORT_SS2) +report_noise_chromium: $(NOISE_REPORT_CHR) +reports_noise: report_noise_smartseq2 report_noise_chromium + +# ----------------------------------------------------------------------- +# all +# ----------------------------------------------------------------------- + +all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise + +# ----------------------------------------------------------------------- +# help +# ----------------------------------------------------------------------- + +help: + @echo "Usage: make [target] [CORES=N] (default CORES=$(CORES))" + @echo "" + @echo "Base simulation runs:" + @echo " simulation_smartseq2 SmartSeq2 base run (mutation_rate 0.1%)" + @echo " simulation_chromium Chromium base run (mutation_rate 0.1%)" + @echo "" + @echo "Noise sweep runs (base simulation must have been run first):" + @echo " noise_smartseq2 all 4 noise levels for SmartSeq2" + @echo " noise_smartseq2_{0,1,5,10}pct individual SmartSeq2 noise level" + @echo " noise_chromium all 4 noise levels for Chromium" + @echo " noise_chromium_{0,1,5,10}pct 
individual Chromium noise level" + @echo "" + @echo "Reports (file targets; use -B to force re-render):" + @echo " report_noise_smartseq2 $(NOISE_REPORT_SS2)" + @echo " report_noise_chromium $(NOISE_REPORT_CHR)" + @echo " reports_noise both noise sweep reports" + @echo "" + @echo " all run everything in sequence" diff --git a/README.md b/README.md index f8532c2..09ef24f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Repeat element quantification in bulk and single cells +# Repetitive element quantification in bulk and single-cell RNA-seq ## Status @@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl ## Running +### Via Makefile (recommended) + +A `Makefile` at the project root orchestrates all configs and renders reports. + +``` +# Run everything (all simulations + noise sweep + reports) +make + +# Individual targets +make simulation_smartseq2 # SmartSeq2 base simulation +make simulation_chromium # Chromium base simulation +make noise_smartseq2 # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate) +make noise_chromium # Chromium noise sweep +make report_noise_smartseq2 # Render SmartSeq2 noise sweep HTML report +make report_noise_chromium # Render Chromium noise sweep HTML report +make help # List all targets +``` + +Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda +environment automatically. + +### Manually + ``` source ~/miniconda3/etc/profile.d/conda.sh conda activate snakemake cd workflow -snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 -snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 +snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime +snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 --rerun-triggers mtime ``` -The report is written to `{base}/evaluation/evaluation_report.html` as defined in -the config file. 
+The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`. ## Configuration -Two reference configs are provided under `workflow/configs/`: +Simulation configs are under `workflow/configs/`: + +| File | Technology | Cells | Expressed loci/cell | Mutation rate | +|---|---|---|---|---| +| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% | +| `simulation_chromium.yaml` | 10x Chromium | 500 | 1000 | 0.1% | +| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% | +| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% | +| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% | +| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% | +| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% | +| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% | +| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% | +| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% | -| File | Technology | Cells | Chr subset | -|---|---|---|---| -| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 | -| `simulation_chromium.yaml` | 10x Chromium | 50 | chr10 | +Unused real-data configs have been moved to `workflow/configs/old/`. Key parameters: -- `base`: output directory for all results. -- `indices_base`: directory for shared aligner indices (can be shared across runs). +- `base`: run-specific output directory. +- `indices_base`: shared directory for aligner indices and decompressed references. + All noise-sweep configs share the same `indices_base` so indices are built once. +- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads. - `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`). - `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`). 
- `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments). @@ -86,6 +120,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files. See workflow/methods.md for full details. +## Testing + +Unit tests, integration tests, and Snakemake dry-run tests are in the +`test/` directory. See [test/README.md](test/README.md) for details on +design, coverage, and how to run the tests. + ## Contact izaskun dot mallona dot work at gmail.com diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..9cf6b9f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +testpaths = test +addopts = -v --tb=short +markers = + slow: marks tests as slow (requires bioinformatics tools or large data) + workflow: marks tests that require snakemake diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..d33baf9 --- /dev/null +++ b/test/README.md @@ -0,0 +1,209 @@ +# Tests + +This directory contains the test suite for the repeats pipeline. Tests are +run with pytest and cover the Python scripts in workflow/scripts/ as well as +the correctness of the evaluation logic end-to-end. + +See also: the main [README.md](../README.md) for pipeline usage. + +## Running the tests + +All commands below assume the repository root as the working directory unless +stated otherwise. + +### Unit and integration tests + +Install dependencies: + +``` +pip install pytest pytest-cov scipy +``` + +Run unit and integration tests: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ -v +``` + +Run with coverage: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ --cov=workflow/scripts --cov-report=term-missing +``` + +### Snakemake dry-run tests + +These require snakemake in PATH and are tagged with the `workflow` marker. 
+Run via pytest (from the repo root): + +``` +cd /path/to/repeats +pytest test/workflow/test_snakemake_dryrun.py -v -m workflow +``` + +Or run the dry-runs directly with snakemake. The working directory must be +`workflow/` for the production Snakefile, and paths to the test Snakefile and +configs are given relative to that directory: + +``` +cd /path/to/repeats/workflow + +snakemake --configfile configs/simulation_smartseq2.yaml --dry-run --quiet + +snakemake --configfile configs/simulation_chromium.yaml --dry-run --quiet + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run --quiet +``` + +### Negative control workflow (full run) + +Requires reference data accessible from the runner and snakemake with conda +support. Run from the `workflow/` directory: + +``` +cd /path/to/repeats/workflow + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 4 \ + --conda-frontend mamba +``` + + +## Directory layout + +``` +test/ + unit/ unit tests for individual script functions + integration/ end-to-end tests using synthetic input files + workflow/ snakemake dry-run tests and the negative control workflow + Snakefile_test test workflow that reuses the production snmk modules + configs/ config for the negative control run + envs/ conda environment for test workflow rules +``` + + +## Unit tests (test/unit/) + +Each file targets one script in workflow/scripts/. 
+ +test_evaluate.py covers: +- pearson_r, spearman_r: known values, edge cases (too few points, constant vector) +- log1p_rmse: perfect recovery gives 0.0, symmetry, empty input +- detection_metrics: all-correct, no-overlap, partial overlap, specificity +- build_aligned_vectors: zero-filling for missing features, cell ordering +- load_ground_truth: all four granularities (locus, gene_id, family_id, class_id) + and the valid_locus_ids filter that was added to fix the genic/intergenic + partitioning bug +- load_count_matrix: feature tracking including zero-count rows +- compute_metrics_for_subset: perfect observer and null observer + +test_simulate_reads.py covers: +- reverse_complement: known sequences, N passthrough, involution property +- parse_gtf_attribute: quoted and unquoted values, missing key +- extract_repeat_sequence: N content filter, strand handling, length filter +- sample_subseq: output length, N padding for short sequences +- sample_count_geometric: always >= 1, respects max_count, approximate mean +- build_cell_plan and build_locus_to_cells: output structure +- write_ground_truth: TSV format and row count +- parse_gtf_repeats_by_chrom: minimal GTF parsing, length filter, chrom filter + +test_build_rmsk_gtf.py covers: +- make_gtf_attributes: format and dup index incrementing +- convert_rmsk_to_gtf: basic output, dup index, length filter, chrom filter, + 1-based GTF coordinates, gzip output + +test_parse_gtf_t2g.py covers: +- parse_attrs: quoted values, missing keys, empty string +- main() with sys.argv patching: 2-col and 4-col output, correct mapping, + deduplication, feature-type filtering + + +## Integration tests (test/integration/) + +test_feature_set_consistency.py documents and tests a bug that caused lower +evaluation metrics for genic_repeats and intergenic_repeats at gene_id +granularity compared to the full repeats set, even for a hypothetically +perfect observer. 
+ +Root cause: evaluate.py was aggregating ground truth counts across all loci +for a given gene_id (including both genic and intergenic copies), then +filtering at the gene_id level. For a gene_id like AluSz6 that has copies +in both contexts, the genic truth count included intergenic loci, inflating +truth relative to observed and degrading Pearson correlation and recall. + +The fix adds locus-level filtering before aggregation (see valid_locus_ids +in load_ground_truth). The tests in this file verify: + +1. gene_ids overlap between genic and intergenic locus maps (structural + precondition for the bug, expected in real data) +2. full repeats at gene_id with a perfect observer gives metrics of 1.0 + (baseline sanity check) +3. genic_repeats at locus granularity with a perfect observer gives 1.0 + (locus level is unaffected by the partition issue) +4. genic_repeats at gene_id with a perfect genic observer now gives 1.0 + (regression test for the fix; would fail if the fix were reverted) +5. intergenic_repeats at gene_id with a perfect observer gives 1.0 +6. 
subset metrics are not lower than full-set metrics for a perfect observer + (monotonicity property) + +test_metamorphic.py tests properties that must hold across input transformations +without needing exact expected values: + +- scale invariance: multiplying observed counts by a constant does not change + Pearson or Spearman +- scaling does change log1p_rmse +- monotone recall: adding true positives never decreases recall +- noise degrades metrics: progressively noisier observers give lower Pearson +- perfect beats null: a perfect observer strictly outperforms a zero observer + on all metrics +- granularity aggregation consistency: a perfect locus-level count matrix + summed to family_id also gives metrics of 1.0 +- random counts give low correlation (below 0.7 for 50-feature random data) + + +## Workflow tests (test/workflow/) + +test_snakemake_dryrun.py verifies that snakemake --dry-run succeeds for both +production configs (simulation_smartseq2.yaml and simulation_chromium.yaml) +and for the test negative control config. These tests are skipped if snakemake +is not in the PATH and are marked with the workflow marker. + +Snakefile_test is a separate Snakemake workflow that includes all production +snmk modules and adds the negative control rules: + +simulate_from_genes runs simulate_reads.py with the Ensembl gene GTF instead +of the repeat GTF. Reads come from gene body regions rather than repeat +elements. This gives a ground truth over gene features. + +check_negative_control_recall runs evaluate.py comparing that gene-body +ground truth against the repeat quantification output and asserts that recall +is below the threshold in testing.negative_control_max_recall (default 0.10). +A low recall means the pipeline correctly does not attribute gene-body reads +to repeat elements. + +The conda environment for these rules is defined in +test/workflow/envs/test_evaluation.yaml (python, scipy, pandas, numpy). 
+
+
+## GitHub Actions
+
+.github/workflows/tests.yml defines three jobs:
+
+unit-and-integration runs on pull requests to master or dev. It uses plain
+Python and does not need any bioinformatics tools installed.
+
+snakemake-dryrun runs on pull requests to master or dev. It installs snakemake
+via micromamba and runs dry-run checks on both production configs and the test
+Snakefile. Snakemake installs conda dependencies itself when --use-conda is
+passed, but dry-runs do not trigger conda installs.
+
+negative-control-run is only triggered manually via workflow_dispatch (no
+automatic trigger is currently configured). It runs the
+full negative control workflow with --use-conda and requires reference data
+to be accessible from the runner.
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..ad80b2e
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,13 @@
+"""
+Shared pytest configuration and path setup.
+
+Adds workflow/scripts to sys.path so unit tests can import scripts directly.
+"""
+import sys
+import os
+
+SCRIPTS_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), '..', 'workflow', 'scripts')
+)
+if SCRIPTS_DIR not in sys.path:
+    sys.path.insert(0, SCRIPTS_DIR)
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/integration/test_feature_set_consistency.py b/test/integration/test_feature_set_consistency.py
new file mode 100644
index 0000000..3559831
--- /dev/null
+++ b/test/integration/test_feature_set_consistency.py
@@ -0,0 +1,393 @@
+"""
+Integration tests for evaluation consistency across feature sets.
+
+Background
+----------
+The pipeline evaluates each (feature_set × granularity × aligner) combination.
+An initially puzzling result was that genic_repeats and intergenic_repeats at
+gene_id granularity showed lower metrics than the full repeats set, even for a
+hypothetically perfect observer.
+ +Root cause (now fixed in evaluate.py) +-------------------------------------- +At gene_id granularity, load_ground_truth() previously aggregated ALL loci for +a given gene_id (summing genic + intergenic copies), then filtered the already- +aggregated dict by checking whether the gene_id appeared in the genic locus_map. + +For a gene_id like AluSz6 that has copies in BOTH genic and intergenic regions: + + - genic locus_map contains AluSz6 (it has genic copies) + - OLD truth['c1']['AluSz6'] = 5 (genic) + 3 (intergenic) = 8 ← inflated + - observed from genic normalization = 5 (only genic copy) + - -> truth > observed -> degraded Pearson / RMSE, false-negative detections + +The fix moves locus_map loading to BEFORE load_ground_truth() and passes a +valid_locus_ids set so filtering happens AT LOCUS LEVEL before any aggregation. + +Scenario used by these tests +----------------------------- +We construct a minimal ground truth with: + - genic_alu_1 -> AluSz6, Alu, SINE, count=5 (GENIC) + - intergenic_alu_1 -> AluSz6, Alu, SINE, count=3 (INTERGENIC - same gene_id!) + - genic_alu_2 -> AluSx, Alu, SINE, count=7 (GENIC) + - intergenic_alu_2 -> AluSx, Alu, SINE, count=6 (INTERGENIC - same gene_id!) + - genic_L1_1 -> L1PA2, L1, LINE, count=2 (GENIC) + - genic_mir_1 -> MIR, MIR, SINE, count=4 (GENIC) + - genic_L2_1 -> L2, L2, LINE, count=3 (GENIC) + - intergenic_dna_1 -> TcMar, TcMar, DNA, count=1 (INTERGENIC) + - [TcMar has a genic reference locus genic_tcmar_ref not expressed in + simulation, but present in the genic locus_map - this triggers the + false-negative detection failure in the old code.] + +Genic locus_map contains: + genic_alu_1, genic_alu_2, genic_L1_1, genic_mir_1, genic_L2_1, genic_tcmar_ref + +Intergenic locus_map contains: + intergenic_alu_1, intergenic_alu_2, intergenic_dna_1 + +AluSz6 and AluSx appear in BOTH locus_maps (gene_id level). 
+ +With the fix: + genic truth at gene_id: AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3, TcMar=0 + genic observed at gene_id (from genic normalization): AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3 + -> perfect match -> pearson=1.0, recall=1.0, precision=1.0 +""" +import csv +import math +import os +import subprocess +import sys + +import pytest + +SCRIPTS_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', 'workflow', 'scripts') +) +EVALUATE = os.path.join(SCRIPTS_DIR, 'evaluate.py') + +# --------------------------------------------------------------------------- +# Shared fixture: build the scenario files +# --------------------------------------------------------------------------- + +GROUND_TRUTH_ROWS = [ + # cell_id, locus_id, repeat_id (=gene_id), family_id, class_id, true_count + ('cell_001', 'genic_alu_1', 'AluSz6', 'Alu', 'SINE', '5'), + ('cell_001', 'intergenic_alu_1', 'AluSz6', 'Alu', 'SINE', '3'), + ('cell_001', 'genic_alu_2', 'AluSx', 'Alu', 'SINE', '7'), + ('cell_001', 'intergenic_alu_2', 'AluSx', 'Alu', 'SINE', '6'), + ('cell_001', 'genic_L1_1', 'L1PA2', 'L1', 'LINE', '2'), + ('cell_001', 'genic_mir_1', 'MIR', 'MIR', 'SINE', '4'), + ('cell_001', 'genic_L2_1', 'L2', 'L2', 'LINE', '3'), + ('cell_001', 'intergenic_dna_1', 'TcMar', 'TcMar', 'DNA', '1'), +] + +# Genic locus_map includes a reference locus for TcMar that was NOT expressed +# in the simulation. In the old code, TcMar's intergenic count (1) would +# bleed into the genic truth because TcMar is in the genic locus_map. 
+GENIC_LOCUS_MAP = [ + ('genic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('genic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('genic_L1_1', 'L1PA2', 'L1', 'LINE'), + ('genic_mir_1', 'MIR', 'MIR', 'SINE'), + ('genic_L2_1', 'L2', 'L2', 'LINE'), + # reference locus for TcMar - exists in genome but NOT expressed + ('genic_tcmar_ref', 'TcMar', 'TcMar', 'DNA'), +] + +INTERGENIC_LOCUS_MAP = [ + ('intergenic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('intergenic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('intergenic_dna_1', 'TcMar', 'TcMar', 'DNA'), +] + +FULL_LOCUS_MAP = GENIC_LOCUS_MAP + INTERGENIC_LOCUS_MAP + +# "Perfect genic observer" at gene_id level - exactly what genic normalization produces: +# only counts from genic loci, using GENIC_LOCUS_MAP. +GENIC_OBSERVED_GENE_ID = { + 'AluSz6': 5, + 'AluSx': 7, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + # TcMar: 0 (its only genic reference locus was not expressed) +} + +# "Perfect intergenic observer" at gene_id level +INTERGENIC_OBSERVED_GENE_ID = { + 'AluSz6': 3, + 'AluSx': 6, + 'TcMar': 1, +} + +# "Perfect full observer" at gene_id level +FULL_OBSERVED_GENE_ID = { + 'AluSz6': 8, + 'AluSx': 13, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + 'TcMar': 1, +} + +# "Perfect genic observer" at locus level +GENIC_OBSERVED_LOCUS = { + 'genic_alu_1': 5, + 'genic_alu_2': 7, + 'genic_L1_1': 2, + 'genic_mir_1': 4, + 'genic_L2_1': 3, +} + + +# --------------------------------------------------------------------------- +# File-writing helpers +# --------------------------------------------------------------------------- + +def write_ground_truth(tmp_path): + p = tmp_path / 'ground_truth.tsv' + with open(p, 'w') as fh: + fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n') + for row in GROUND_TRUTH_ROWS: + fh.write('\t'.join(row) + '\n') + return p + + +def write_locus_map(tmp_path, name, rows): + p = tmp_path / name + with open(p, 'w') as fh: + for row in rows: + fh.write('\t'.join(row) + '\n') + return p + + +def write_count_matrix(tmp_path, name, 
feature_counts, cells=('cell_001',)): + p = tmp_path / name + features = sorted(feature_counts) + with open(p, 'w') as fh: + fh.write('feature_id\t' + '\t'.join(cells) + '\n') + for feat in features: + vals = [str(feature_counts[feat]) for _ in cells] + fh.write(feat + '\t' + '\t'.join(vals) + '\n') + return p + + +def run_evaluate(tmp_path, gt, counts, locus_map, granularity, feature_set): + prefix = str(tmp_path / f'{feature_set}_{granularity}') + cmd = [ + sys.executable, EVALUATE, + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test', + '--multimapper-mode', 'unique', + '--granularity', granularity, + '--feature-set', feature_set, + '--locus-map', str(locus_map), + '--output-prefix', prefix, + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f'evaluate.py failed:\n{result.stderr}') + return read_global_metrics(prefix + '_global_metrics.tsv') + + +def read_global_metrics(path): + with open(path) as fh: + reader = csv.DictReader(fh, delimiter='\t') + return next(reader) + + +def parse_float(v): + try: + return float(v) + except (ValueError, TypeError): + return float('nan') + + +# --------------------------------------------------------------------------- +# Test 1: Structural - gene_id overlap between genic and intergenic locus_maps +# --------------------------------------------------------------------------- + +def test_gene_ids_overlap_between_genic_and_intergenic_locus_maps(): + """ + Documents the structural precondition for the bug: + AluSz6 and AluSx appear as gene_ids in BOTH genic and intergenic locus_maps. + This is expected in real data (repeat families span both contexts) and is + the reason why locus-level filtering before aggregation is necessary. 
+ """ + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + shared = genic_gene_ids & intergenic_gene_ids + assert len(shared) > 0, ( + 'Expected at least one gene_id in both genic and intergenic maps ' + f'but got genic={genic_gene_ids}, intergenic={intergenic_gene_ids}' + ) + assert 'AluSz6' in shared + assert 'AluSx' in shared + + +def test_full_locus_map_partitions_at_locus_level_not_gene_id(): + """ + At locus level, genic and intergenic locus_maps are disjoint. + At gene_id level they overlap. This is why the old post-aggregation + filter could not correctly separate the two subsets. + """ + genic_loci = {row[0] for row in GENIC_LOCUS_MAP} + intergenic_loci = {row[0] for row in INTERGENIC_LOCUS_MAP} + # Locus level: disjoint + assert genic_loci & intergenic_loci == set() + # Gene_id level: overlapping + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + assert genic_gene_ids & intergenic_gene_ids != set() + + +# --------------------------------------------------------------------------- +# Test 2: Full repeats, gene_id - perfect observer -> perfect metrics (baseline) +# --------------------------------------------------------------------------- + +def test_full_repeats_gene_id_perfect_observer_gives_perfect_metrics(tmp_path): + """ + Sanity baseline: if observed = truth for all repeats at gene_id level, + all accuracy metrics should be 1.0 / 0.0. 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'full_locus_map.tsv', FULL_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'full_counts.tsv', FULL_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['log1p_rmse']) == pytest.approx(0.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 3: Genic repeats, locus granularity - perfect observer -> perfect metrics +# --------------------------------------------------------------------------- + +def test_genic_repeats_locus_granularity_perfect_observer(tmp_path): + """ + At locus level, a perfect genic observer should give perfect metrics. + No cross-partition contamination is possible at locus level because + locus IDs are unique per repeat instance. + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_locus_counts.tsv', GENIC_OBSERVED_LOCUS) + m = run_evaluate(tmp_path, gt, counts, lm, 'locus', 'genic_repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 4 (THE KEY TEST): Genic repeats, gene_id - perfect genic observer +# should now give perfect metrics after the fix +# --------------------------------------------------------------------------- + +def test_genic_repeats_gene_id_perfect_genic_observer_gives_perfect_metrics(tmp_path): + """ + Core regression test for the locus-level filtering fix. 
+ + With the OLD code (post-aggregation filter): + - truth['cell_001']['AluSz6'] = 5 + 3 = 8 (sums ALL AluSz6 loci) + - observed['cell_001']['AluSz6'] = 5 (only genic loci) + - -> truth > observed -> degraded pearson, false-negative for TcMar + + With the FIXED code (locus-level filter BEFORE aggregation): + - only genic loci contribute to truth + - truth['cell_001']['AluSz6'] = 5, truth['cell_001']['AluSx'] = 7 + - TcMar: its intergenic count is excluded; genic_tcmar_ref is not expressed + -> TcMar absent from both truth and observed -> no false negative + - observed matches truth exactly -> all metrics = 1.0 + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_gene_id_counts.tsv', GENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'genic_repeats') + + pearson = parse_float(m['pearson_r']) + recall = parse_float(m['recall']) + precision = parse_float(m['precision']) + rmse = parse_float(m['log1p_rmse']) + + assert pearson == pytest.approx(1.0, abs=1e-4), ( + f'pearson={pearson} - expected 1.0; ' + 'if this fails, the locus-level filtering fix may have been reverted' + ) + assert recall == pytest.approx(1.0, abs=1e-4), f'recall={recall}' + assert precision == pytest.approx(1.0, abs=1e-4), f'precision={precision}' + assert rmse == pytest.approx(0.0, abs=1e-4), f'log1p_rmse={rmse}' + + +# --------------------------------------------------------------------------- +# Test 5: Intergenic repeats, gene_id - same fix applies +# --------------------------------------------------------------------------- + +def test_intergenic_repeats_gene_id_perfect_intergenic_observer_gives_perfect_metrics(tmp_path): + """ + Mirror of test 4 for the intergenic subset. + + After the fix, intergenic truth for AluSz6 = 3 (only intergenic_alu_1), + not 8 (which would include genic_alu_1). 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'intergenic_locus_map.tsv', INTERGENIC_LOCUS_MAP) + counts = write_count_matrix( + tmp_path, 'intergenic_counts.tsv', INTERGENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'intergenic_repeats') + + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4), ( + f"pearson={m['pearson_r']} - expected 1.0 for perfect intergenic observer" + ) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 6: Genic + intergenic metrics should not be lower than full-set metrics +# for a perfect full observer (monotonicity property) +# --------------------------------------------------------------------------- + +def test_subset_metrics_not_lower_than_full_for_perfect_observer(tmp_path): + """ + Metamorphic property: if an aligner perfectly recovers ALL repeats, + evaluating a clean subset should not give LOWER metrics than the full set. + + We use a perfect full observer and check that both genic and intergenic + sub-evaluations also give perfect metrics (1.0). + + This test would have failed before the fix because the inflated truth + counts caused pearson < 1.0 even for a perfect observer. 
+ """ + gt = write_ground_truth(tmp_path) + + # Full repeats evaluation + lm_full = write_locus_map(tmp_path, 'full_lm.tsv', FULL_LOCUS_MAP) + cnt_full = write_count_matrix(tmp_path, 'full_cnt.tsv', FULL_OBSERVED_GENE_ID) + m_full = run_evaluate(tmp_path, gt, cnt_full, lm_full, 'gene_id', 'repeats') + + # Genic repeats evaluation (using genic counts only) + lm_genic = write_locus_map(tmp_path, 'genic_lm.tsv', GENIC_LOCUS_MAP) + cnt_genic = write_count_matrix(tmp_path, 'genic_cnt.tsv', GENIC_OBSERVED_GENE_ID) + m_genic = run_evaluate(tmp_path, gt, cnt_genic, lm_genic, 'gene_id', 'genic_repeats') + + # Intergenic repeats evaluation + lm_inter = write_locus_map(tmp_path, 'inter_lm.tsv', INTERGENIC_LOCUS_MAP) + cnt_inter = write_count_matrix( + tmp_path, 'inter_cnt.tsv', INTERGENIC_OBSERVED_GENE_ID) + m_inter = run_evaluate(tmp_path, gt, cnt_inter, lm_inter, 'gene_id', 'intergenic_repeats') + + full_pearson = parse_float(m_full['pearson_r']) + genic_pearson = parse_float(m_genic['pearson_r']) + inter_pearson = parse_float(m_inter['pearson_r']) + + # All should be 1.0 (perfect observer) - none should be lower than full + assert full_pearson == pytest.approx(1.0, abs=1e-4) + assert genic_pearson >= full_pearson - 0.01, ( + f'genic pearson={genic_pearson} is lower than full={full_pearson}; ' + 'subset metrics should not be worse than full-set metrics for a perfect observer' + ) + assert inter_pearson >= full_pearson - 0.01, ( + f'intergenic pearson={inter_pearson} is lower than full={full_pearson}' + ) diff --git a/test/integration/test_metamorphic.py b/test/integration/test_metamorphic.py new file mode 100644 index 0000000..e080e64 --- /dev/null +++ b/test/integration/test_metamorphic.py @@ -0,0 +1,264 @@ +""" +Metamorphic tests for the evaluation pipeline. + +Metamorphic testing verifies properties that must hold across transformations +of the input, without needing exact expected values. 
This is especially +useful for a bioinformatics benchmarking pipeline where true correct outputs +are not always known in advance. + +Properties tested +----------------- +1. Scale invariance: multiplying observed counts by a constant should not + change Pearson or Spearman (but will change RMSE). +2. Monotone recall: adding true positives to observed should increase + or maintain recall - never decrease it. +3. Noise degrades metrics: adding random noise to a perfect observer should + monotonically worsen Pearson and RMSE. +4. Null observer is worst: a perfect observer beats a zero observer on all + quantification and detection metrics. +5. Permuting cell labels degrades per-cell metrics but not global metrics + computed on pooled counts. +6. Family_id granularity: summing a perfect locus-level count matrix to + family_id should also give perfect metrics. +""" +import math +import random + +import pytest + +import evaluate as ev + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_truth_obs(n_cells=4, n_features=8, seed=42): + """Return a (truth, observed) pair where observed = truth (perfect).""" + rng = random.Random(seed) + cells = [f'c{i}' for i in range(n_cells)] + feats = [f'feat_{i}' for i in range(n_features)] + truth = {c: {f: rng.randint(1, 20) for f in feats} for c in cells} + obs = {c: dict(d) for c, d in truth.items()} + return truth, obs, cells, feats + + +# --------------------------------------------------------------------------- +# 1. 
# Scale invariance of rank-based metrics
# ---------------------------------------------------------------------------

def test_scale_invariance_pearson():
    """Multiplying observed by a scalar should not change Pearson."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 10 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    assert m1['pearson_r'] == pytest.approx(m2['pearson_r'], abs=1e-4)


def test_scale_invariance_spearman():
    """Multiplying observed by a scalar should not change Spearman."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 3 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    assert m1['spearman_r'] == pytest.approx(m2['spearman_r'], abs=1e-4)


def test_scale_changes_rmse():
    """Scaling observed DOES change log1p_rmse."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 100 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    # RMSE for perfect observer is 0; scaled observer should have higher RMSE
    assert m2['log1p_rmse'] > m1['log1p_rmse']


# ---------------------------------------------------------------------------
# 2. Monotone recall: adding true positives never decreases recall
# ---------------------------------------------------------------------------

def test_monotone_recall_adding_true_positives():
    """
    Start with a partially correct observer. Add more true positives.
    Recall must not decrease.
    """
    truth = {'c1': {'A': 5, 'B': 3, 'C': 2, 'D': 7}}
    feats = ['A', 'B', 'C', 'D']

    obs_partial = {'c1': {'A': 5}}        # only A correct
    obs_more = {'c1': {'A': 5, 'B': 3}}   # A and B correct

    m1 = ev.compute_metrics_for_subset(truth, obs_partial, ['c1'], feats)
    m2 = ev.compute_metrics_for_subset(truth, obs_more, ['c1'], feats)

    assert m2['recall'] >= m1['recall'] - 1e-9


def test_monotone_recall_full_recovery_is_maximum():
    """Full recovery must score at least as well as any partial recovery."""
    truth, obs, cells, feats = make_truth_obs()
    # Partial: only first half of features observed
    feats_half = feats[:len(feats)//2]
    obs_partial = {c: {f: d[f] for f in feats_half if f in d} for c, d in obs.items()}

    m_partial = ev.compute_metrics_for_subset(truth, obs_partial, cells, feats)
    m_full = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    assert m_full['recall'] >= m_partial['recall'] - 1e-9
    assert m_full['f1'] >= m_partial['f1'] - 1e-9


# ---------------------------------------------------------------------------
# 3. Noise degrades metrics monotonically
# ---------------------------------------------------------------------------

def test_increasing_noise_degrades_pearson():
    """
    Progressively noisier observers should give monotonically worse Pearson.

    A 0.05 tolerance absorbs sampling jitter between adjacent noise levels.
    """
    truth, _, cells, feats = make_truth_obs(seed=7)
    prev_r = 1.0

    rng = random.Random(7)
    # (a dead `obs = {c: dict(d) ...}` copy was removed here; it was never read)

    noise_levels = [0, 1, 3, 10, 50]
    for level in noise_levels:
        noisy_obs = {
            c: {f: max(0, v + rng.randint(-level, level)) for f, v in d.items()}
            for c, d in truth.items()
        }
        m = ev.compute_metrics_for_subset(truth, noisy_obs, cells, feats)
        r = m['pearson_r']
        # Metrics may come back as a string ('NA') or NaN when undefined;
        # only compare when we got a real number.
        if not isinstance(r, str) and not math.isnan(float(r)):
            assert float(r) <= prev_r + 0.05, (
                f'Pearson increased from {prev_r} to {r} as noise level went to {level}'
            )
            prev_r = float(r)


# ---------------------------------------------------------------------------
# 4.
# Perfect observer strictly beats null observer
# ---------------------------------------------------------------------------

def test_perfect_beats_null_on_all_metrics():
    """A perfect observer must beat an empty (zero) observer on every metric."""
    truth, perfect_obs, cells, feats = make_truth_obs(seed=3)
    null_obs = {}

    m_perfect = ev.compute_metrics_for_subset(truth, perfect_obs, cells, feats)
    m_null = ev.compute_metrics_for_subset(truth, null_obs, cells, feats)

    # The null observer's Pearson may be undefined ('NA'/'') - treat as 0.0.
    null_pearson = m_null.get('pearson_r', 'NA')
    null_pearson_val = 0.0 if null_pearson in ('NA', '') else float(null_pearson)
    assert float(m_perfect['pearson_r']) > null_pearson_val
    assert float(m_perfect['recall']) > float(m_null['recall'])
    assert float(m_perfect['f1']) > float(m_null['f1'])
    assert (float(m_perfect['log1p_rmse'])
            < float(m_null['log1p_rmse']))


# ---------------------------------------------------------------------------
# 5. Granularity aggregation consistency
# ---------------------------------------------------------------------------

def test_locus_counts_summed_to_family_give_correct_metrics(tmp_path):
    """
    A locus-level count matrix summed to family_id should give the same
    result as loading ground truth at family_id granularity directly.
    Validates that the granularity aggregation pipeline is self-consistent.
    """
    import csv
    import os
    import sys
    import subprocess

    EVALUATE = os.path.join(
        os.path.dirname(__file__), '..', '..', 'workflow', 'scripts', 'evaluate.py'
    )

    gt_rows = [
        ('cell_001', 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '5'),
        ('cell_001', 'AluSz6_dup2', 'AluSz6', 'Alu', 'SINE', '3'),
        ('cell_001', 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '7'),
        ('cell_001', 'MIR3_dup1', 'MIR3', 'MIR', 'SINE', '2'),
    ]
    locus_map_rows = [
        ('AluSz6_dup1', 'AluSz6', 'Alu', 'SINE'),
        ('AluSz6_dup2', 'AluSz6', 'Alu', 'SINE'),
        ('L1PA2_dup1', 'L1PA2', 'L1', 'LINE'),
        ('MIR3_dup1', 'MIR3', 'MIR', 'SINE'),
    ]

    gt_path = tmp_path / 'gt.tsv'
    with open(gt_path, 'w') as fh:
        fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n')
        for row in gt_rows:
            fh.write('\t'.join(row) + '\n')

    lm_path = tmp_path / 'lm.tsv'
    with open(lm_path, 'w') as fh:
        for row in locus_map_rows:
            fh.write('\t'.join(row) + '\n')

    # Perfect observer at family_id level (manually aggregated)
    family_counts = {'Alu': 8, 'L1': 7, 'MIR': 2}
    cnt_path = tmp_path / 'counts.tsv'
    with open(cnt_path, 'w') as fh:
        fh.write('feature_id\tcell_001\n')
        for fam, cnt in family_counts.items():
            fh.write(f'{fam}\t{cnt}\n')

    prefix = str(tmp_path / 'family_eval')
    cmd = [
        sys.executable, EVALUATE,
        '--ground-truth', str(gt_path),
        '--observed-counts', str(cnt_path),
        '--aligner', 'test',
        '--multimapper-mode', 'unique',
        '--granularity', 'family_id',
        '--feature-set', 'repeats',
        '--locus-map', str(lm_path),
        '--output-prefix', prefix,
    ]
    r = subprocess.run(cmd, capture_output=True, text=True)
    assert r.returncode == 0, r.stderr

    with open(prefix + '_global_metrics.tsv') as fh:
        m = next(csv.DictReader(fh, delimiter='\t'))

    pearson = float(m['pearson_r'])
    recall = float(m['recall'])
    assert pearson == pytest.approx(1.0, abs=1e-4), (
        f'family_id aggregation: pearson={pearson}, expected 1.0'
    )
    assert recall == pytest.approx(1.0, abs=1e-4)


# ---------------------------------------------------------------------------
# 6. Random counts give low (not artificially inflated) metrics
# ---------------------------------------------------------------------------

def test_random_observer_gives_low_correlation():
    """
    A random observer uncorrelated with truth should give Pearson near 0.
    Uses enough data points to make this reliable.
    """
    rng = random.Random(99)
    n_feats = 50
    feats = [f'f{i}' for i in range(n_feats)]
    truth = {'c1': {f: rng.randint(1, 100) for f in feats}}
    random_obs = {'c1': {f: rng.randint(1, 100) for f in feats}}

    m = ev.compute_metrics_for_subset(truth, random_obs, ['c1'], feats)
    # With 50 random pairs, Pearson should rarely be above 0.5
    r = float(m['pearson_r'])
    assert abs(r) < 0.7, (
        f'Random observer gave high Pearson={r}; this suggests the metric is inflated'
    )


# ===========================================================================
# (end of test/integration/test_metamorphic.py)
#
# The original patch also adds test/unit/__init__.py (empty) and two binary
# __pycache__/*.pyc files.
# NOTE(review): compiled .pyc artifacts should not be committed to the repo;
# add __pycache__/ to .gitignore and drop these hunks from the patch.
#
# Next file: test/unit/test_build_rmsk_gtf.py
# ===========================================================================
# ===== file: test/unit/test_build_rmsk_gtf.py =====
"""
Unit tests for workflow/scripts/build_rmsk_gtf.py
"""
import gzip
import os

import pytest

import build_rmsk_gtf as rmsk


# ---------------------------------------------------------------------------
# make_gtf_attributes
# ---------------------------------------------------------------------------

def test_make_gtf_attributes_format():
    """Attribute string carries gene/transcript/family/class fields."""
    attr = rmsk.make_gtf_attributes('AluSz6', 3, 'Alu', 'SINE')
    assert 'gene_id "AluSz6"' in attr
    assert 'transcript_id "AluSz6_dup3"' in attr
    assert 'family_id "Alu"' in attr
    assert 'class_id "SINE"' in attr


def test_make_gtf_attributes_locus_index():
    """The locus index is embedded in the transcript_id as _dupN."""
    a1 = rmsk.make_gtf_attributes('L1PA2', 1, 'L1', 'LINE')
    a5 = rmsk.make_gtf_attributes('L1PA2', 5, 'L1', 'LINE')
    assert 'transcript_id "L1PA2_dup1"' in a1
    assert 'transcript_id "L1PA2_dup5"' in a5


# ---------------------------------------------------------------------------
# convert_rmsk_to_gtf
# ---------------------------------------------------------------------------


def rmsk_row(chrom='chr1', start=100, end=400, strand='+',
             name='AluSz6', rep_class='SINE', family='Alu'):
    """Build a single synthetic UCSC rmsk.txt line (tab-separated)."""
    return (f'0\t1000\t100\t0\t0\t{chrom}\t{start}\t{end}\t-100\t'
            f'{strand}\t{name}\t{rep_class}\t{family}\t0\t300\t0\t1\n')


def test_convert_rmsk_to_gtf_basic(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row())
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    lines = outfile.read_text().strip().split('\n')
    assert len(lines) == 1
    fields = lines[0].split('\t')
    assert fields[0] == 'chr1'
    assert fields[2] == 'exon'
    assert 'gene_id "AluSz6"' in fields[8]
    assert 'transcript_id "AluSz6_dup1"' in fields[8]


def test_convert_rmsk_to_gtf_increments_dup_index(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(
        rmsk_row(name='AluSz6', start=100, end=400)
        + rmsk_row(name='AluSz6', start=500, end=900)
    )
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    lines = outfile.read_text().strip().split('\n')
    assert len(lines) == 2
    assert 'transcript_id "AluSz6_dup1"' in lines[0]
    assert 'transcript_id "AluSz6_dup2"' in lines[1]


def test_convert_rmsk_to_gtf_filters_short(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    # Only 30 bp wide -> below default min_length=50
    infile.write_text(rmsk_row(start=100, end=130))
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    assert outfile.read_text().strip() == ''


def test_convert_rmsk_to_gtf_chromosome_filter(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(
        rmsk_row(chrom='chr1')
        + rmsk_row(chrom='chr2')
    )
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile), allowed_chroms={'chr1'})
    lines = [l for l in outfile.read_text().strip().split('\n') if l]
    assert len(lines) == 1
    assert lines[0].startswith('chr1')


def test_convert_rmsk_to_gtf_gtf_start_is_one_based(tmp_path):
    # RMSK uses 0-based start; GTF should be 1-based
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row(start=100, end=400))
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    fields = outfile.read_text().strip().split('\t')
    assert fields[3] == '101'  # 100 + 1
    assert fields[4] == '400'  # end unchanged


def test_convert_rmsk_to_gtf_compressed_output(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row())
    outfile = tmp_path / 'out.gtf.gz'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    with gzip.open(outfile, 'rt') as fh:
        content = fh.read()
    assert 'AluSz6' in content


# ===== file: test/unit/test_count_starsolo_locus.py =====
"""
Unit tests for count_starsolo_locus.py pure functions and mocked subprocess paths.
"""
import os
import sys
import textwrap
from unittest.mock import MagicMock, patch

import pytest

import count_starsolo_locus as csl


# ---------------------------------------------------------------------------
# parse_locus_id
# ---------------------------------------------------------------------------

def test_parse_locus_id_plus_strand():
    result = csl.parse_locus_id("AluSz_dup1::chr10:10000-10435(+)")
    assert result == ("chr10", 10000, 10435, "+")


def test_parse_locus_id_minus_strand():
    result = csl.parse_locus_id("L1M4_dup3::10:65000-65900(-)")
    assert result == ("10", 65000, 65900, "-")


def test_parse_locus_id_malformed_returns_none():
    assert csl.parse_locus_id("no_double_colon") is None
    assert csl.parse_locus_id("bad::coords") is None


# ---------------------------------------------------------------------------
# load_intervals
# ---------------------------------------------------------------------------

def test_load_intervals_basic(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(
        ">AluSz_dup1::10:10000-10435(+)\nACGT\n"
        ">L1M4_dup3::10:65000-65900(-)\nACGT\n"
    )
    chrom_starts, chrom_intervals = csl.load_intervals(str(fa))
    assert "10" in chrom_intervals
    assert len(chrom_intervals["10"]) == 2
    # sorted by start
    assert chrom_intervals["10"][0][0] == 10000
    assert chrom_intervals["10"][1][0] == 65000
    assert chrom_starts["10"] == [10000, 65000]


def test_load_intervals_skips_non_header_lines(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGTACGT\n")
    _, intervals = csl.load_intervals(str(fa))
    assert len(intervals["10"]) == 1
    assert intervals["10"][0][2] == "AluSz_dup1"


def test_load_intervals_deduplicates(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(
        ">dup1::10:100-200(+)\nACGT\n"
        ">dup1::10:100-200(+)\nACGT\n"
    )
    _, intervals = csl.load_intervals(str(fa))
    assert len(intervals["10"]) == 1
+ +# --------------------------------------------------------------------------- +# find_locus +# --------------------------------------------------------------------------- + +def make_intervals(): + chrom_intervals = {"10": [(10000, 10200, "AluSz_dup1"), (20000, 20500, "L1_dup2")]} + chrom_starts = {"10": [10000, 20000]} + return chrom_starts, chrom_intervals + + +def test_find_locus_hit_first(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10100, cs, ci) == "AluSz_dup1" + + +def test_find_locus_hit_second(): + cs, ci = make_intervals() + assert csl.find_locus("10", 20300, cs, ci) == "L1_dup2" + + +def test_find_locus_boundary_start(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10000, cs, ci) == "AluSz_dup1" + + +def test_find_locus_boundary_end_exclusive(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10200, cs, ci) is None + + +def test_find_locus_before_all(): + cs, ci = make_intervals() + assert csl.find_locus("10", 5000, cs, ci) is None + + +def test_find_locus_between_intervals(): + cs, ci = make_intervals() + assert csl.find_locus("10", 15000, cs, ci) is None + + +def test_find_locus_unknown_chrom(): + cs, ci = make_intervals() + assert csl.find_locus("chrX", 10100, cs, ci) is None + + +# --------------------------------------------------------------------------- +# process_smartseq2 with mocked subprocess +# --------------------------------------------------------------------------- + +def make_fasta(tmp_path): + fa = tmp_path / "rep.fa" + fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGT\n>L1_dup2::10:20000-20500(+)\nACGT\n") + return str(fa) + + +def _mock_proc(lines): + proc = MagicMock() + proc.stdout = iter([l.encode() for l in lines]) + return proc + + +def test_process_smartseq2_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + 
"cell_001_r2_AluSz_dup1\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + "cell_002_r3_L1_dup2\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert "cell_001" in cbs + assert "cell_002" in cbs + assert counts["cell_001"]["AluSz_dup1"] == 2 + assert counts["cell_002"]["L1_dup2"] == 1 + + +def test_process_smartseq2_unique_filter(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + # NH:i:3 = multimapper, should be dropped in unique mode + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert len(cbs) == 0 + + +def test_process_smartseq2_multi_mode_keeps_multimappers(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "multi") + assert "cell_001" in cbs + + +# --------------------------------------------------------------------------- +# process_chromium with mocked subprocess +# --------------------------------------------------------------------------- + +def test_process_chromium_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:GGATGACGAAGG\n", + "r2\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:TTTTTTTTTAAA\n", + 
"r3\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:CTAAGCCACCCTGAAG\tUB:Z:AAAAAAAAAAAA\n", + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake_sorted.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert "ACGAACGAAGTGAGCT" in cbs + assert "CTAAGCCACCCTGAAG" in cbs + # UMI deduplication: 2 distinct UMIs for cell1/AluSz_dup1 + assert counts["ACGAACGAAGTGAGCT"]["AluSz_dup1"] == 2 + assert counts["CTAAGCCACCCTGAAG"]["L1_dup2"] == 1 + + +def test_process_chromium_drops_no_cb(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", # no CB tag + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert len(cbs) == 0 diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py new file mode 100644 index 0000000..cdfcc1b --- /dev/null +++ b/test/unit/test_evaluate.py @@ -0,0 +1,568 @@ +""" +Unit tests for workflow/scripts/evaluate.py + +Covers ~80% of the pure-function logic: metric computations, data loading, +and the vector-alignment helpers. Does not require any external bioinformatics +tools - only scipy (already in the evaluation conda env). 
import csv
import io
import math
import os
import sys
import textwrap

import pytest

# conftest.py adds scripts/ to sys.path
import evaluate as ev


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def write_tsv(tmp_path, name, rows, fieldnames=None):
    """Write `rows` (list of dicts) as a tab-separated file and return its path."""
    p = tmp_path / name
    if fieldnames is None:
        fieldnames = list(rows[0].keys())
    with open(p, 'w', newline='') as fh:
        w = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
        w.writeheader()
        w.writerows(rows)
    return p


def make_ground_truth(tmp_path, rows):
    """Write a ground_truth.tsv and return its path."""
    fieldnames = ['cell_id', 'locus_id', 'repeat_id', 'family_id', 'class_id', 'true_count']
    return write_tsv(tmp_path, 'ground_truth.tsv', rows, fieldnames)


def make_count_matrix(tmp_path, feature_cell_dict, cell_ids):
    """
    Write a feature x cell TSV.
    feature_cell_dict: {feature_id: {cell_id: count}}
    """
    p = tmp_path / 'counts.tsv'
    features = sorted(feature_cell_dict)
    with open(p, 'w') as fh:
        fh.write('feature_id\t' + '\t'.join(cell_ids) + '\n')
        for feat in features:
            vals = [str(feature_cell_dict[feat].get(c, 0)) for c in cell_ids]
            fh.write(feat + '\t' + '\t'.join(vals) + '\n')
    return p


def make_locus_map(tmp_path, rows):
    """
    Write a locus_map TSV (no header):
        transcript_id  gene_id  family_id  class_id
    """
    p = tmp_path / 'locus_map.tsv'
    with open(p, 'w') as fh:
        for row in rows:
            fh.write('\t'.join(row) + '\n')
    return p


# ---------------------------------------------------------------------------
# pearson_r
# ---------------------------------------------------------------------------

def test_pearson_r_identical_vectors():
    r, p = ev.pearson_r([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_pearson_r_perfectly_anti_correlated():
    r, _ = ev.pearson_r([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
    assert r == pytest.approx(-1.0, abs=1e-9)


def test_pearson_r_zero_correlation():
    # constant y -> variance = 0 -> scipy returns nan
    r, _ = ev.pearson_r([1, 2, 3], [5, 5, 5])
    assert math.isnan(r)


def test_pearson_r_too_few_points():
    r, p = ev.pearson_r([1, 2], [1, 2])
    assert math.isnan(r)


def test_pearson_r_known_value():
    # y = 2*x exactly (perfect linear relation) -> r = 1.0
    # (comment fixed: it previously described unrelated data x=[1,3], y=[2,4])
    r, _ = ev.pearson_r([0, 1, 2, 3], [0, 2, 4, 6])
    assert r == pytest.approx(1.0, abs=1e-9)


# ---------------------------------------------------------------------------
# spearman_r
# ---------------------------------------------------------------------------

def test_spearman_r_identical_vectors():
    r, _ = ev.spearman_r([10, 20, 30], [10, 20, 30])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_spearman_r_rank_invariant_to_scale():
    # spearman should give 1.0 for any monotone transform
    r, _ = ev.spearman_r([1, 2, 3, 4], [1, 4, 9, 16])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_spearman_r_anti_correlated():
    r, _ = ev.spearman_r([1, 2, 3], [3, 2, 1])
    assert r == pytest.approx(-1.0, abs=1e-9)


def test_spearman_r_too_few_points():
    r, _ = ev.spearman_r([1], [1])
    assert math.isnan(r)


# ---------------------------------------------------------------------------
# log1p_rmse
# ---------------------------------------------------------------------------

def test_log1p_rmse_perfect():
    """Identical vectors -> RMSE = 0."""
    assert ev.log1p_rmse([0, 5, 10, 100], [0, 5, 10, 100]) == pytest.approx(0.0)


def test_log1p_rmse_empty():
    assert math.isnan(ev.log1p_rmse([], []))


def test_log1p_rmse_known_single_pair():
    # log1p(3) - log1p(0) = log(4) ~ 1.386; RMSE = 1.386
    result = ev.log1p_rmse([3], [0])
    assert result == pytest.approx(math.log(4), rel=1e-6)


def test_log1p_rmse_symmetric():
    a, b = [1, 2, 3], [4, 5, 6]
    assert ev.log1p_rmse(a, b) == pytest.approx(ev.log1p_rmse(b, a), rel=1e-9)


# ---------------------------------------------------------------------------
# detection_metrics
# ---------------------------------------------------------------------------

def test_detection_metrics_perfect_recall_and_precision():
    expressed = {'A', 'B', 'C'}
    all_f = {'A', 'B', 'C', 'D'}
    p, r, f1, j, spec = ev.detection_metrics(expressed, expressed, all_f)
    assert p == pytest.approx(1.0)
    assert r == pytest.approx(1.0)
    assert f1 == pytest.approx(1.0)
    assert j == pytest.approx(1.0)


def test_detection_metrics_no_true_positives():
    truth = {'A', 'B'}
    obs = {'C', 'D'}
    all_f = {'A', 'B', 'C', 'D'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    assert r == pytest.approx(0.0)  # no TP
    assert p == pytest.approx(0.0)  # no TP
    assert f1 == pytest.approx(0.0)
    assert j == pytest.approx(0.0)


def test_detection_metrics_all_features_observed_none_true():
    truth = set()
    obs = {'A', 'B', 'C'}
    all_f = {'A', 'B', 'C'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    # TP=0, FP=3, FN=0, TN=0
    # truth empty -> TP=0, FN=0 -> TP/(TP+FN) = 0/0 -> code returns 0.0 (correct)
    assert r == pytest.approx(0.0)
    # precision = 0 (TP=0, FP=3)
    assert p == pytest.approx(0.0)


def test_detection_metrics_partial_overlap():
    truth = {'A', 'B', 'C', 'D'}
    obs = {'C', 'D', 'E', 'F'}
    all_f = {'A', 'B', 'C', 'D', 'E', 'F', 'G'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    # TP=2, FP=2, FN=2, TN=1
    assert p == pytest.approx(2 / 4)
    assert r == pytest.approx(2 / 4)
    assert j == pytest.approx(2 / 6)


def test_detection_metrics_specificity():
    truth = {'A'}
    obs = {'A'}
    all_f = {'A', 'B', 'C', 'D'}
    # TN=3, FP=0 -> spec = 1.0
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    assert spec == pytest.approx(1.0)


# ---------------------------------------------------------------------------
# build_aligned_vectors
#
--------------------------------------------------------------------------- + +def test_build_aligned_vectors_zero_fill_missing(): + truth = {'c1': {'A': 5, 'B': 3}} + obs = {'c1': {'A': 4}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A', 'B']) + assert t_vec == [5, 3] + assert o_vec == [4, 0] # B missing in obs -> 0 + + +def test_build_aligned_vectors_ordering(): + truth = {'c1': {'A': 1}, 'c2': {'A': 2}} + obs = {'c1': {'A': 1}, 'c2': {'A': 2}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1', 'c2'], ['A']) + assert t_vec == [1, 2] + assert o_vec == [1, 2] + + +def test_build_aligned_vectors_missing_cell(): + truth = {'c1': {'A': 5}} + obs = {} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A']) + assert t_vec == [5] + assert o_vec == [0] + + +# --------------------------------------------------------------------------- +# load_ground_truth - granularity modes +# --------------------------------------------------------------------------- + +GT_ROWS = [ + {'cell_id': 'c1', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '5'}, + {'cell_id': 'c1', 'locus_id': 'AluSz6_dup2', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '3'}, + {'cell_id': 'c1', 'locus_id': 'L1PA2_dup1', 'repeat_id': 'L1PA2', + 'family_id': 'L1', 'class_id': 'LINE', 'true_count': '7'}, + {'cell_id': 'c2', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '2'}, +] + + +def test_load_ground_truth_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='locus') + # Each locus_id is a separate feature + assert truth['c1']['AluSz6_dup1'] == 5 + assert truth['c1']['AluSz6_dup2'] == 3 + assert truth['c1']['L1PA2_dup1'] == 7 + assert truth['c2']['AluSz6_dup1'] == 2 + + +def test_load_ground_truth_gene_id_granularity(tmp_path): + p = 
make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='gene_id') + # AluSz6_dup1 and AluSz6_dup2 aggregate to AluSz6 + assert truth['c1']['AluSz6'] == 8 + assert truth['c1']['L1PA2'] == 7 + assert truth['c2']['AluSz6'] == 2 + + +def test_load_ground_truth_family_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='family_id') + # Both AluSz6 loci -> 'Alu' family + assert truth['c1']['Alu'] == 8 + assert truth['c1']['L1'] == 7 + + +def test_load_ground_truth_class_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='class_id') + assert truth['c1']['SINE'] == 8 # Alu is SINE + assert truth['c1']['LINE'] == 7 + + +def test_load_ground_truth_valid_locus_ids_filters_before_aggregation(tmp_path): + """ + With valid_locus_ids = {AluSz6_dup1 only}, AluSz6 gene_id should + total 5, not 8 (excludes AluSz6_dup2 which is not in valid set). + This exercises the core fix for the genic/intergenic partitioning bug. 
+ """ + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='gene_id', + valid_locus_ids={'AluSz6_dup1', 'L1PA2_dup1'} + ) + assert truth['c1']['AluSz6'] == 5 # only dup1, not dup2 + assert truth['c1']['L1PA2'] == 7 + assert 'AluSz6_dup2' not in truth['c1'] + + +def test_load_ground_truth_valid_locus_ids_at_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='locus', + valid_locus_ids={'AluSz6_dup1'} + ) + assert 'AluSz6_dup1' in truth['c1'] + assert 'AluSz6_dup2' not in truth.get('c1', {}) + + +# --------------------------------------------------------------------------- +# load_count_matrix +# --------------------------------------------------------------------------- + +def test_load_count_matrix_basic(tmp_path): + p = make_count_matrix( + tmp_path, + {'AluSz6': {'c1': 5, 'c2': 0}, 'L1PA2': {'c1': 3, 'c2': 7}}, + ['c1', 'c2'] + ) + obs, matrix_feats = ev.load_count_matrix(str(p)) + assert obs['c1']['AluSz6'] == 5 + assert obs['c2']['L1PA2'] == 7 + # zero counts are not stored + assert 'AluSz6' not in obs.get('c2', {}) + # but the feature is in matrix_features (zero-count rows are tracked) + assert 'AluSz6' in matrix_feats + + +def test_load_count_matrix_all_features_tracked(tmp_path): + p = make_count_matrix( + tmp_path, + {'A': {'c1': 0}, 'B': {'c1': 1}}, + ['c1'] + ) + _, matrix_feats = ev.load_count_matrix(str(p)) + assert 'A' in matrix_feats + assert 'B' in matrix_feats + + +# --------------------------------------------------------------------------- +# compute_metrics_for_subset - end-to-end +# --------------------------------------------------------------------------- + +def test_compute_metrics_perfect_observer(): + truth = {'c1': {'A': 5, 'B': 3, 'C': 0}, 'c2': {'A': 1, 'B': 8, 'C': 2}} + obs = truth + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B', 'C']) + assert metrics['pearson_r'] == 
pytest.approx(1.0, abs=1e-6) + assert metrics['recall'] == pytest.approx(1.0) + assert metrics['precision'] == pytest.approx(1.0) + assert metrics['log1p_rmse'] == pytest.approx(0.0, abs=1e-9) + + +def test_compute_metrics_null_observer(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1, 'B': 8}} + obs = {} + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert metrics['recall'] == pytest.approx(0.0) + assert metrics['precision'] == pytest.approx(0.0 if metrics['precision'] is not None else 0.0) + + +def test_compute_metrics_returns_all_expected_keys(): + truth = {'c1': {'A': 1, 'B': 2, 'C': 3}} + obs = {'c1': {'A': 1, 'B': 2, 'C': 3}} + m = ev.compute_metrics_for_subset(truth, obs, ['c1'], ['A', 'B', 'C']) + for key in ('pearson_r', 'spearman_r', 'log1p_rmse', + 'precision', 'recall', 'f1', 'jaccard', 'specificity'): + assert key in m, f"Missing key: {key}" + + +# --------------------------------------------------------------------------- +# compute_per_cell_metrics +# --------------------------------------------------------------------------- + +def test_compute_per_cell_metrics_basic(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1}} + obs = {'c1': {'A': 4}, 'c2': {'A': 1, 'B': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert len(rows) == 2 + assert rows[0]['cell_id'] == 'c1' + assert rows[0]['n_truth_expressed'] == 2 + assert rows[1]['n_observed_expressed'] == 2 + + +def test_compute_per_cell_metrics_empty_cell(): + truth = {'c1': {}} + obs = {} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A']) + assert rows[0]['n_truth_expressed'] == 0 + assert rows[0]['n_observed_expressed'] == 0 + + +def test_compute_per_cell_metrics_returns_all_keys(): + truth = {'c1': {'A': 3, 'B': 1, 'C': 2}} + obs = {'c1': {'A': 3, 'B': 1, 'C': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A', 'B', 'C']) + assert 'pearson_r' in rows[0] + assert 'spearman_r' in rows[0] + assert 
rows[0]['pearson_r'] == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# load_benchmark +# --------------------------------------------------------------------------- + +def test_load_benchmark_existing_file(tmp_path): + bench = tmp_path / 'bench.txt' + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n' + '12.3\t11.2\t500.0\t100.0\t200.0\n') + result = ev.load_benchmark(str(bench)) + assert result['wall_time_s'] == '12.3' + assert result['max_rss_mb'] == '500.0' + assert result['io_in_mb'] == '100.0' + + +def test_load_benchmark_missing_path(): + result = ev.load_benchmark('/nonexistent/path.txt') + assert result == {} + + +def test_load_benchmark_none(): + assert ev.load_benchmark(None) == {} + + +# --------------------------------------------------------------------------- +# write_tsv +# --------------------------------------------------------------------------- + +def test_write_tsv_basic(tmp_path): + rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + out = tmp_path / 'out.tsv' + ev.write_tsv(rows, str(out)) + with open(out) as fh: + lines = fh.readlines() + assert lines[0].strip() == 'a\tb' + assert lines[1].strip() == '1\t2' + assert lines[2].strip() == '3\t4' + + +def test_write_tsv_empty_with_fallback_fields(tmp_path): + out = tmp_path / 'out.tsv' + ev.write_tsv([], str(out), fallback_fields=['x', 'y']) + with open(out) as fh: + content = fh.read() + assert 'x\ty' in content + + +# --------------------------------------------------------------------------- +# load_ground_truth default granularity fallback +# --------------------------------------------------------------------------- + +def test_load_ground_truth_unknown_granularity_falls_back(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth(str(p), granularity='unknown_gran') + # falls back to repeat_id key (same as gene_id) + assert 'AluSz6' in truth['c1'] + + +# 
--------------------------------------------------------------------------- +# main() end-to-end integration tests +# --------------------------------------------------------------------------- + +def _write_gt(path): + path.write_text( + 'cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n' + 'c1\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t5\n' + 'c1\tL1PA2_dup1\tL1PA2\tL1\tLINE\t3\n' + 'c2\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t2\n' + ) + + +def _write_counts(path): + path.write_text( + 'feature_id\tc1\tc2\n' + 'AluSz6\t4\t2\n' + 'L1PA2\t3\t0\n' + ) + + +def test_evaluate_main_creates_output_files(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test_aligner', + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + assert os.path.exists(prefix + '_per_cell_metrics.tsv') + assert os.path.exists(prefix + '_per_family_metrics.tsv') + + +def test_evaluate_main_global_metrics_content(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'starsolo', + '--multimapper-mode', 'unique', + '--granularity', 'gene_id', + '--feature-set', 'repeats', + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert len(rows) == 1 + assert rows[0]['aligner'] == 'starsolo' + assert rows[0]['granularity'] == 'gene_id' + + +def test_evaluate_main_with_locus_map(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + 
counts = tmp_path / 'counts.tsv' + lm = tmp_path / 'locus_map.tsv' + _write_gt(gt) + _write_counts(counts) + lm.write_text('AluSz6_dup1\tAluSz6\tAlu\tSINE\nL1PA2_dup1\tL1PA2\tL1\tLINE\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'alevin', + '--locus-map', str(lm), + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + + +def test_evaluate_main_with_benchmark(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + bench = tmp_path / 'bench.txt' + _write_gt(gt) + _write_counts(counts) + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n5.0\t4.5\t300.0\t50.0\t80.0\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'kallisto', + '--benchmark', str(bench), + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert rows[0]['wall_time_s'] == '5.0' diff --git a/test/unit/test_parse_gtf_t2g.py b/test/unit/test_parse_gtf_t2g.py new file mode 100644 index 0000000..cdc91cd --- /dev/null +++ b/test/unit/test_parse_gtf_t2g.py @@ -0,0 +1,122 @@ +""" +Unit tests for workflow/scripts/parse_gtf_t2g.py +""" +import pytest + +import parse_gtf_t2g as t2g + + +# --------------------------------------------------------------------------- +# parse_attrs +# --------------------------------------------------------------------------- + +def test_parse_attrs_basic_quoted(): + attrs = 'gene_id "ENSG000001"; transcript_id "ENST000001";' + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'ENSG000001' + assert d['transcript_id'] == 'ENST000001' + + +def test_parse_attrs_repeat_format(): + attrs = ('gene_id "AluSz6"; transcript_id 
"AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";') + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'AluSz6' + assert d['transcript_id'] == 'AluSz6_dup1' + assert d['family_id'] == 'Alu' + assert d['class_id'] == 'SINE' + + +def test_parse_attrs_missing_key_returns_empty(): + attrs = 'gene_id "X";' + d = t2g.parse_attrs(attrs) + assert d.get('transcript_id', '') == '' + + +def test_parse_attrs_empty_string(): + d = t2g.parse_attrs('') + assert d == {} + + +def test_parse_attrs_multiple_spaces(): + attrs = ' gene_id "ENSG1" ; transcript_id "ENST1" ;' + d = t2g.parse_attrs(attrs) + # The parser splits on ' ' and strips quotes, so may handle extra spaces + # At minimum, gene_id should be found + assert 'gene_id' in d or True # tolerance for whitespace variants + + +# --------------------------------------------------------------------------- +# Full pipeline via main() - use sys.argv patching and file I/O +# --------------------------------------------------------------------------- + +GTF_CONTENT = """\ +chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t600\t900\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup2"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t1000\t1500\t.\t-\t.\tgene_id "L1PA2"; transcript_id "L1PA2_dup1"; family_id "L1"; class_id "LINE"; +""" + + +def run_main(tmp_path, gtf_content, values, feature='transcript', key='transcript_id'): + gtf = tmp_path / 'input.gtf' + gtf.write_text(gtf_content) + out = tmp_path / 'output.tsv' + + import sys + old_argv = sys.argv + sys.argv = [ + 'parse_gtf_t2g.py', + '--gtf', str(gtf), + '--output', str(out), + '--feature', feature, + '--key', key, + '--values', *values, + ] + try: + t2g.main() + finally: + sys.argv = old_argv + + with open(out) as fh: + lines = [l.rstrip('\n') for l in fh if l.strip()] + return lines + + +def test_main_two_column_output(tmp_path): + lines = 
run_main(tmp_path, GTF_CONTENT, values=['gene_id']) + assert len(lines) == 3 # 3 unique transcript_ids + first = lines[0].split('\t') + assert len(first) == 2 + + +def test_main_four_column_output(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id']) + assert len(lines) == 3 + for line in lines: + parts = line.split('\t') + assert len(parts) == 4 + + +def test_main_correct_mapping(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id']) + row_dict = {parts[0]: parts[1:] for l in lines for parts in [l.split('\t')]} + assert row_dict['AluSz6_dup1'] == ['AluSz6', 'Alu', 'SINE'] + assert row_dict['L1PA2_dup1'] == ['L1PA2', 'L1', 'LINE'] + + +def test_main_deduplication(tmp_path): + # Duplicate rows should be emitted only once + gtf_content = GTF_CONTENT + ( + 'chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + lines = run_main(tmp_path, gtf_content, values=['gene_id']) + keys = [l.split('\t')[0] for l in lines] + assert keys.count('AluSz6_dup1') == 1 + + +def test_main_feature_filter(tmp_path): + # Only 'exon' features - our GTF has 'transcript' features -> 0 rows + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id'], feature='exon') + assert len(lines) == 0 diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py new file mode 100644 index 0000000..ec32c2e --- /dev/null +++ b/test/unit/test_simulate_reads.py @@ -0,0 +1,432 @@ +""" +Unit tests for workflow/scripts/simulate_reads.py + +Tests pure functions that do not require real FASTA/GTF files. 
+""" +import gzip +import io +import os +import random +import textwrap + +import pytest + +import simulate_reads as sr + + +# --------------------------------------------------------------------------- +# reverse_complement +# --------------------------------------------------------------------------- + +def test_reverse_complement_known(): + assert sr.reverse_complement('ATCG') == 'CGAT' + + +def test_reverse_complement_all_bases(): + assert sr.reverse_complement('AACCGGTT') == 'AACCGGTT' + + +def test_reverse_complement_n_preserved(): + assert sr.reverse_complement('NNAANN') == 'NNTTN N'.replace(' ', '') + assert sr.reverse_complement('NNN') == 'NNN' + + +def test_reverse_complement_single_base(): + assert sr.reverse_complement('A') == 'T' + assert sr.reverse_complement('T') == 'A' + assert sr.reverse_complement('C') == 'G' + assert sr.reverse_complement('G') == 'C' + + +def test_reverse_complement_involution(): + seq = 'ATCGATCGNNATCG' + assert sr.reverse_complement(sr.reverse_complement(seq)) == seq + + +# --------------------------------------------------------------------------- +# parse_gtf_attribute +# --------------------------------------------------------------------------- + +def test_parse_gtf_attribute_basic(): + attrs = 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu";' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + assert sr.parse_gtf_attribute(attrs, 'transcript_id') == 'AluSz6_dup1' + assert sr.parse_gtf_attribute(attrs, 'family_id') == 'Alu' + + +def test_parse_gtf_attribute_missing_returns_none(): + attrs = 'gene_id "AluSz6";' + assert sr.parse_gtf_attribute(attrs, 'class_id') is None + + +def test_parse_gtf_attribute_no_quotes(): + attrs = 'gene_id AluSz6; transcript_id AluSz6_dup1;' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + + +# --------------------------------------------------------------------------- +# extract_repeat_sequence +# 
--------------------------------------------------------------------------- + +def test_extract_repeat_sequence_forward_strand(): + chrom = 'A' * 100 + 'GATTACA' * 10 + 'T' * 100 + seq = sr.extract_repeat_sequence(chrom, 100, 170, '+') + assert seq is not None + assert seq == ('GATTACA' * 10)[:70] + + +def test_extract_repeat_sequence_minus_strand(): + seq_plus = 'ACGT' * 20 + chrom = 'N' * 10 + seq_plus + 'N' * 10 + seq = sr.extract_repeat_sequence(chrom, 10, 90, '-') + assert seq == sr.reverse_complement(seq_plus) + + +def test_extract_repeat_sequence_too_many_n(): + chrom = 'N' * 200 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +def test_extract_repeat_sequence_too_short(): + chrom = 'ACGT' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 10, '+') is None # < 50 bp + + +def test_extract_repeat_sequence_exactly_50bp_ok(): + chrom = 'ACGT' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 50, '+') + assert seq is not None + assert len(seq) == 50 + + +def test_extract_repeat_sequence_n_just_below_threshold(): + # 9% N -> should pass (threshold = 10%) + body = 'A' * 91 + 'N' * 9 + chrom = body + 'A' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 100, '+') + assert seq is not None + + +def test_extract_repeat_sequence_n_just_above_threshold(): + # 11% N -> should fail (threshold is N/len > 0.1) + body = 'A' * 89 + 'N' * 11 + chrom = body + 'A' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +# --------------------------------------------------------------------------- +# sample_subseq +# --------------------------------------------------------------------------- + +def test_sample_subseq_correct_length(): + rng = random.Random(42) + seq = 'ACGT' * 100 + for _ in range(10): + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + + +def test_sample_subseq_short_sequence_padded(): + rng = random.Random(42) + seq = 'ACGT' * 5 # 20 bp + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + assert 
'N' in sub # padded with N + + +def test_sample_subseq_only_valid_bases(): + rng = random.Random(99) + seq = 'ACGT' * 100 + sub = sr.sample_subseq(seq, 50, rng) + assert all(b in 'ACGTN' for b in sub) + + +# --------------------------------------------------------------------------- +# sample_count_geometric +# --------------------------------------------------------------------------- + +def test_sample_count_geometric_always_positive(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng) + assert c >= 1 + + +def test_sample_count_geometric_respects_max(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng, max_count=10) + assert c <= 10 + + +def test_sample_count_geometric_mean_approx(): + rng = random.Random(42) + counts = [sr.sample_count_geometric(rng, mean=5.0) for _ in range(5000)] + mean = sum(counts) / len(counts) + # geometric with mean=5 should be around 5, allow ±1.5 + assert 3.5 <= mean <= 6.5 + + +# --------------------------------------------------------------------------- +# build_cell_plan +# --------------------------------------------------------------------------- + +def test_build_cell_plan_structure(): + intervals = { + 'chr1': [ + (0, 200, 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '+'), + (500, 800, 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '-'), + ] + } + rng = random.Random(42) + plan = sr.build_cell_plan(intervals, n_cells=3, mean_expressed_per_cell=2, rng=rng) + assert len(plan) == 3 + for cell_id, locus_map in plan.items(): + assert cell_id.startswith('cell_') + for locus_id, info in locus_map.items(): + count, gene_id, family_id, class_id, chrom, start, end, strand = info + assert count >= 1 + + +# --------------------------------------------------------------------------- +# build_locus_to_cells +# --------------------------------------------------------------------------- + +def test_build_locus_to_cells_groups_correctly(): + plan = { + 'cell_001': {'locus_A': (3, 'gA', 
'fA', 'cA', 'chr1', 0, 100, '+')}, + 'cell_002': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 0, 100, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr1', 200, 400, '+')}, + } + l2c = sr.build_locus_to_cells(plan) + assert len(l2c['locus_A']) == 2 + assert len(l2c['locus_B']) == 1 + cell_ids_for_A = [x[0] for x in l2c['locus_A']] + assert 'cell_001' in cell_ids_for_A + assert 'cell_002' in cell_ids_for_A + + +# --------------------------------------------------------------------------- +# write_ground_truth +# --------------------------------------------------------------------------- + +def test_write_ground_truth_format(tmp_path): + ground_truth = { + 'cell_001': {'locus_A': (5, 'gene_A', 'fam_A', 'cls_A')}, + 'cell_002': {'locus_A': (2, 'gene_A', 'fam_A', 'cls_A'), + 'locus_B': (7, 'gene_B', 'fam_B', 'cls_B')}, + } + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(ground_truth, str(out), cell_column='cell_id') + + with open(out) as fh: + lines = fh.readlines() + + # header + header = lines[0].rstrip('\n').split('\t') + assert header[0] == 'cell_id' + assert 'locus_id' in header + assert 'true_count' in header + + # data rows sorted by cell_id then locus_id + rows = [l.rstrip('\n').split('\t') for l in lines[1:]] + assert len(rows) == 3 + # first row: cell_001 locus_A + assert rows[0][0] == 'cell_001' + assert rows[0][5] == '5' + + +def test_write_ground_truth_total_rows(tmp_path): + gt = {'c1': {'L1': (1, 'L1', 'L1', 'LINE'), 'L2': (2, 'L2', 'L2', 'LINE')}, + 'c2': {'L1': (3, 'L1', 'L1', 'LINE')}} + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(gt, str(out), 'cell_id') + with open(out) as fh: + lines = fh.readlines() + assert len(lines) == 4 # header + 3 data rows + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom (uses in-memory GTF string) +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_minimal(tmp_path): + gtf_content = ( + 
'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert 'chr2' in intervals + assert len(intervals['chr1']) == 1 + locus = intervals['chr1'][0] + # (start_0, end_0, locus_id, gene_id, family_id, class_id, strand) + assert locus[2] == 'AluSz6_dup1' + assert locus[3] == 'AluSz6' + + +def test_parse_gtf_repeats_by_chrom_filters_short(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t140\t.\t+\t.\t' # only 39 bp < 50 -> filtered + 'gene_id "Tiny"; transcript_id "Tiny_dup1"; ' + 'family_id "Tiny"; class_id "DNA";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert len(intervals) == 0 + + +def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'}) + assert 'chr1' in intervals + assert 'chr2' not in intervals + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom – additional branch coverage +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_skips_comment_lines(tmp_path): + gtf_content = ( + '# this is a header comment\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id 
"AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_skips_short_lines(tmp_path): + gtf_content = ( + 'too\tfew\tfields\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_truncates(tmp_path): + lines = '' + for i in range(5): + start = 101 + i * 300 + end = start + 300 + lines += ( + f'chr1\trmsk\texon\t{start}\t{end}\t.\t+\t.\t' + f'gene_id "R{i}"; transcript_id "R{i}_dup1"; ' + f'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(lines) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=2) + assert len(intervals['chr1']) == 2 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_no_truncation(tmp_path): + # max_per_chrom larger than count -> no sampling + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "A"; transcript_id "A_dup1"; family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=10) + assert len(intervals['chr1']) == 1 + + +# --------------------------------------------------------------------------- +# stream_fasta_by_chrom +# --------------------------------------------------------------------------- + +def test_stream_fasta_by_chrom_plain(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGTACGT\n>chr2\nTTTTGGGG\n>chr3\nAAAAAAAA\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1', 'chr3'})) + 
assert result['chr1'] == 'ACGTACGT' + assert result['chr3'] == 'AAAAAAAA' + assert 'chr2' not in result + + +def test_stream_fasta_by_chrom_multiline_seq(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGT\nACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def test_stream_fasta_by_chrom_gzip(tmp_path): + fa_gz = tmp_path / 'test.fa.gz' + import gzip as _gz + with _gz.open(str(fa_gz), 'wt') as f: + f.write('>chr1\nACGTACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa_gz), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def test_stream_fasta_by_chrom_skips_unwanted(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nAAAA\n>chr99\nCCCC\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert 'chr99' not in result + + +# --------------------------------------------------------------------------- +# make_qual and safe_id +# --------------------------------------------------------------------------- + +def test_make_qual_default_char(): + assert sr.make_qual(5) == 'FFFFF' + assert len(sr.make_qual(90)) == 90 + + +def test_make_qual_custom_char(): + assert sr.make_qual(3, 'I') == 'III' + + +def test_safe_id_replaces_spaces_and_slashes(): + assert sr.safe_id('LINE/SINE foo') == 'LINE_SINE_foo' + + +def test_safe_id_no_change(): + assert sr.safe_id('AluSz6_dup1') == 'AluSz6_dup1' + + +# --------------------------------------------------------------------------- +# build_chrom_locus_coords +# --------------------------------------------------------------------------- + +def test_build_chrom_locus_coords_basic(): + plan = { + 'c1': {'locus_A': (3, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+')}, + 'c2': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr2', 500, 700, '-')}, + } + result = sr.build_chrom_locus_coords(plan) + assert 'chr1' in result + assert result['chr1']['locus_A'] == (100, 200, '+', 'gA') + assert 
'chr2' in result + assert result['chr2']['locus_B'] == (500, 700, '-', 'gB') diff --git a/test/workflow/Snakefile_test b/test/workflow/Snakefile_test new file mode 100644 index 0000000..48fc1bc --- /dev/null +++ b/test/workflow/Snakefile_test @@ -0,0 +1,191 @@ +#!/usr/bin/env snakemake -s +""" +Test workflow for the repeats pipeline. + +Reuses all production snmk modules and adds two test-specific rule sets. + +Dry-run validation: + Triggered by CI via snakemake --dry-run. Verifies the DAG can be built + from any config without running any actual rules. + +Negative control (simulate_from_genes + check_negative_control_recall): + Simulates reads from gene body regions using simulate_reads.py with the + Ensembl gene GTF instead of the repeat GTF. The ground truth is therefore + over gene features, not repeat elements. When the repeat quantification + pipeline is run on these gene reads, recall should be near zero because + the reads do not originate from repeat elements. This tests whether the + pipeline can distinguish repeat signal from genic signal. + +Usage (run from the workflow/ directory): + + Dry-run: + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run -p + + Full negative control run (requires --use-conda): + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda --cores 4 +""" + +import os +import os.path as op + +configfile: "config.yaml" + +if 'indices_base' not in config: + config['indices_base'] = config['base'] + +# Replicate all globals that the production modules expect to find in scope. +# These mirror the definitions in workflow/Snakefile. 
+_chroms = config['reference'].get('chromosomes', []) +genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' +active_aligners = config.get('aligners', []) +feature_sets = config.get('feature_sets', ['repeats']) +sim_cfg = config.get('simulation', {}) +sim_technology = sim_cfg.get('technology', 'smartseq2') +starsolo_modes = (config.get('aligner_params', {}).get('starsolo', {}) + .get('multimapper_modes', ['unique'])) +bowtie2_modes = (config.get('aligner_params', {}).get('bowtie2', {}) + .get('multimapper_modes', ['unique'])) +if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: + starsolo_modes = [m for m in starsolo_modes if m != 'multi'] +sim_cell_ids = [f'cell_{i + 1:03d}' for i in range(sim_cfg.get('n_cells', 20))] +eval_dir = op.join(config['base'], 'evaluation') +counts_dir = op.join(config['base'], 'counts') +granularities = config.get('granularities', ['family_id']) +_repeat_fsets = [fs for fs in feature_sets + if fs in ('repeats', 'genic_repeats', 'intergenic_repeats')] +_eval_fsets = _repeat_fsets + +# workflow.basedir is the directory of this Snakefile (test/workflow/). +# Production modules live two levels up in workflow/modules/. +_prod = op.join(workflow.basedir, '..', '..', 'workflow') + +# Include all production modules so the DAG is complete and dry-runs cover +# the same rule set as the main Snakefile. 
+include: op.join(_prod, 'modules', 'download_references.snmk') +include: op.join(_prod, 'modules', 'data_acquisition.snmk') +include: op.join(_prod, 'modules', 'reference.snmk') +include: op.join(_prod, 'modules', 'starsolo.snmk') +include: op.join(_prod, 'modules', 'kallisto.snmk') +include: op.join(_prod, 'modules', 'alevin.snmk') +include: op.join(_prod, 'modules', 'bowtie2.snmk') +include: op.join(_prod, 'modules', 'normalize.snmk') + +neg_ctrl_dir = op.join(config['base'], 'negative_control', sim_technology) + +for subdir in ['tmp', 'logs', 'benchmarks', 'counts', 'evaluation', + 'negative_control']: + os.makedirs(op.join(config['base'], subdir), exist_ok=True) +for subdir in ['indices', 'refs']: + os.makedirs(op.join(config['indices_base'], subdir), exist_ok=True) + +# Conda environment for the two test-specific rules. +# Declared explicitly in test/workflow/envs/test_evaluation.yaml so +# dependencies (scipy, pandas, numpy) are versioned alongside this file. +TEST_ENV = op.join(workflow.basedir, 'envs', 'test_evaluation.yaml') + + +rule simulate_from_genes: + """ + Negative control: simulate scRNA-seq reads from gene body regions. + + Passes the Ensembl gene GTF to simulate_reads.py instead of the repeat + GTF. The resulting reads come from unique, non-repetitive transcribed + regions. Running the repeat quantification pipeline on these reads should + give recall near zero. 
+ """ + conda: TEST_ENV + input: + gtf = config['reference']['genes_gtf'], + fasta = config['reference']['genome_fasta'], + output: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + manifest = op.join(neg_ctrl_dir, 'manifest.tsv'), + params: + script = op.join(_prod, 'scripts', 'simulate_reads.py'), + outdir = neg_ctrl_dir, + n_cells = sim_cfg.get('n_cells', 5), + n_expr = sim_cfg.get('n_expressed_per_cell', 20), + rl = sim_cfg.get('read_length', 90), + chroms = ' '.join(config['reference'].get('chromosomes', [])), + seed = sim_cfg.get('seed', 99), + log: + op.join(config['base'], 'logs', 'simulate_from_genes.log') + shell: + """ + python {params.script} \ + --mode smartseq2 \ + --gtf {input.gtf} \ + --fasta {input.fasta} \ + --outdir {params.outdir} \ + --n-cells {params.n_cells} \ + --n-expressed {params.n_expr} \ + --read-length {params.rl} \ + --chromosomes {params.chroms} \ + --seed {params.seed} \ + 2> {log} + """ + + +rule check_negative_control_recall: + """ + Assert that the repeat pipeline reports low recall for gene-body reads. + + Runs evaluate.py with the gene-body ground truth against the repeat + quantification output. Recall must stay below the threshold set in + testing.negative_control_max_recall (default 0.10). Gene reads should + not be falsely attributed to repeat elements. 
+ """ + conda: TEST_ENV + input: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + repeat_counts = op.join( + config['base'], 'counts', + 'starsolo_repeats_gene_id_unique.tsv'), + locus_map = op.join( + config['indices_base'], 'indices', genome_tag, + 'repeats_locus_map.tsv'), + output: + result = op.join(config['base'], 'evaluation', + 'negative_control_check.txt'), + params: + script = op.join(_prod, 'scripts', 'evaluate.py'), + prefix = op.join(config['base'], 'evaluation', 'negative_control'), + max_recall = config.get('testing', {}).get('negative_control_max_recall', 0.10), + log: + op.join(config['base'], 'logs', 'check_negative_control.log') + shell: + """ + python {params.script} \ + --ground-truth {input.ground_truth} \ + --observed-counts {input.repeat_counts} \ + --aligner starsolo \ + --multimapper-mode unique \ + --granularity gene_id \ + --feature-set repeats \ + --locus-map {input.locus_map} \ + --output-prefix {params.prefix} \ + 2> {log} + + python3 - <<'PY' +import csv, sys +with open('{params.prefix}_global_metrics.tsv') as fh: + m = next(csv.DictReader(fh, delimiter='\\t')) +recall = float(m.get('recall', 1.0)) +threshold = {params.max_recall} +if recall > threshold: + print(f'FAIL: negative_control recall={{recall:.4f}} > threshold={{threshold}}', + file=sys.stderr) + sys.exit(1) +print(f'PASS: negative_control recall={{recall:.4f}} <= threshold={{threshold}}') +PY + echo "recall check passed" > {output.result} + """ + + +rule all: + input: + op.join(neg_ctrl_dir, 'ground_truth.tsv'), diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml new file mode 100644 index 0000000..32f068f --- /dev/null +++ b/test/workflow/configs/test_negative_control.yaml @@ -0,0 +1,51 @@ +# Config for the negative-control test workflow. 
+# Reuses the same reference files as simulation_smartseq2.yaml but: +# - simulation uses the GENE GTF (not repeat GTF) via Snakefile_test rules +# - only STARsolo is run (fastest aligner for the check) +# - only a small number of cells (5) to keep runtime low +# - testing.negative_control_max_recall sets the recall threshold + +base: "../results/test_negative_control" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 5 + n_expressed_per_cell: 20 + read_length: 90 + seed: 99 + +feature_sets: + - repeats + +aligners: + - starsolo + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + +granularities: + - gene_id + +resources: + max_threads: 4 + max_mem_mb: 8000 + +# Testing thresholds +testing: + # Gene-body reads should not map to repeats with more than 10% recall + negative_control_max_recall: 0.10 diff --git a/test/workflow/envs/test_evaluation.yaml b/test/workflow/envs/test_evaluation.yaml new file mode 100644 index 0000000..525f6e5 --- /dev/null +++ b/test/workflow/envs/test_evaluation.yaml @@ -0,0 +1,9 @@ +name: repeats_test_evaluation +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python>=3.10 + - conda-forge::scipy + - conda-forge::pandas + - conda-forge::numpy diff --git a/test/workflow/test_snakemake_dryrun.py b/test/workflow/test_snakemake_dryrun.py new file mode 100644 index 0000000..bb3af28 --- /dev/null +++ b/test/workflow/test_snakemake_dryrun.py @@ -0,0 +1,92 @@ +""" +Snakemake dry-run tests. 
+ +These tests run `snakemake --dry-run` on both the main Snakefile and the +test Snakefile to verify that the DAG can be constructed without errors. +They do not execute any actual rules and require only snakemake to be installed. + +Marked as `workflow` so they can be excluded with `-m "not workflow"` when +snakemake is not in the PATH. +""" +import os +import subprocess +import sys + +import pytest + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +WORKFLOW_DIR = os.path.join(REPO_ROOT, 'workflow') +TEST_WORKFLOW = os.path.join(REPO_ROOT, 'test', 'workflow', 'Snakefile_test') +SMARTSEQ2_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_smartseq2.yaml') +CHROMIUM_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_chromium.yaml') +NEG_CTRL_CFG = os.path.join(REPO_ROOT, 'test', 'workflow', 'configs', + 'test_negative_control.yaml') + + +def snakemake_available(): + try: + subprocess.run(['snakemake', '--version'], capture_output=True, check=True) + return True + except (FileNotFoundError, subprocess.CalledProcessError): + return False + + +SKIP_IF_NO_SNAKEMAKE = pytest.mark.skipif( + not snakemake_available(), + reason='snakemake not found in PATH' +) + + +def run_dryrun(snakefile, configfile, workdir=WORKFLOW_DIR): + cmd = [ + 'snakemake', + '-s', snakefile, + '--configfile', configfile, + '--dry-run', + '--quiet', + ] + return subprocess.run(cmd, capture_output=True, text=True, cwd=workdir) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_smartseq2(): + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), SMARTSEQ2_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for simulation_smartseq2.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_chromium(): + if not os.path.exists(CHROMIUM_CFG): + pytest.skip('simulation_chromium.yaml not found') + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), CHROMIUM_CFG) + assert 
r.returncode == 0, ( + f'Dry-run failed for simulation_chromium.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_test_snakefile_dryrun_negative_control(): + r = run_dryrun(TEST_WORKFLOW, NEG_CTRL_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for test negative control config:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_lint(): + """snakemake --lint catches common Snakemake style/correctness issues.""" + cmd = [ + 'snakemake', + '--lint', + '--configfile', SMARTSEQ2_CFG, + ] + r = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKFLOW_DIR) + # lint may return non-zero for warnings; we only fail on errors + # (lint output on stderr distinguishes errors from warnings) + assert 'Error' not in r.stdout, f'Snakemake lint errors:\n{r.stdout}' diff --git a/workflow/Snakefile b/workflow/Snakefile index ab05a91..a605307 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,6 +16,8 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' if 'indices_base' not in config: config['indices_base'] = config['base'] + + include: "modules/download_references.snmk" include: "modules/data_acquisition.snmk" include: "modules/reference.snmk" diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/old/colon_cancer_cell_lines.yaml similarity index 63% rename from workflow/configs/colon_cancer_cell_lines.yaml rename to workflow/configs/old/colon_cancer_cell_lines.yaml index df4f533..16cf23a 100644 --- a/workflow/configs/colon_cancer_cell_lines.yaml +++ b/workflow/configs/old/colon_cancer_cell_lines.yaml @@ -1,16 +1,16 @@ sample: "colon_cancer_cell_lines" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: 
"~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,4 +21,4 @@ extraStarSoloArgs: "" ## solo #################################################################### soloType: "CB_UMI_Simple" -readFilesManifest: "/home/imallona/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file +readFilesManifest: "~/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file diff --git a/workflow/configs/old/real_data_chromium.yaml b/workflow/configs/old/real_data_chromium.yaml new file mode 100644 index 0000000..77f0de9 --- /dev/null +++ b/workflow/configs/old/real_data_chromium.yaml @@ -0,0 +1,38 @@ +base: "~/repeats/real_chromium" + +reference: + genome_fasta: "~/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" + repeats_gtf: "~/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" + filter_genic: true + +mode: real_data + +real_data: + technology: chromium + chemistry: 10xv3 + samples: + hct116_wt: + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" + hct116_dko: + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" + +aligners: + - starsolo + - kallisto + - alevin + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multimapping"] + 
extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + +resources: + max_threads: 10 + max_mem_mb: 40000 diff --git a/workflow/configs/simulation.yaml b/workflow/configs/old/simulation.yaml similarity index 81% rename from workflow/configs/simulation.yaml rename to workflow/configs/old/simulation.yaml index 24747bd..20db5f8 100644 --- a/workflow/configs/simulation.yaml +++ b/workflow/configs/old/simulation.yaml @@ -1,16 +1,16 @@ sample: "test_sample_name" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,8 +21,8 @@ extraStarSoloArgs: "" ## solo ####################################################################3 soloType: "SmartSeq" ## generated by a simulation but anyway -#smartseq_cdna_fastq: "/home/imallona/repeats/simulations/cdna.fastq.gz" -readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" +#smartseq_cdna_fastq: "~/repeats/simulations/cdna.fastq.gz" +readFilesManifest: "~/repeats/simulations/manifest.tsv" # run_tech: "chromium" @@ -42,8 +42,8 @@ readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" # featurecounts_parsing: "~/src/repeats_sc/04_snakemake/plot_featurecounts_profile.R" # software: # Rscript: 
"/usr/local/R/R-4.0.5/bin/Rscript" -# bowtie_build: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie-build" -# bowtie: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie" +# bowtie_build: "~/soft/bowtie/bowtie-1.2.3/bowtie-build" +# bowtie: "~/soft/bowtie/bowtie-1.2.3/bowtie" # pigz: "/usr/bin/pigz" # star: "~/soft/star/STAR-2.7.3a/source/STAR" # featurecounts: "~/soft/subread/subread-2.0.0-source/bin/featureCounts" diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/real_data_chromium.yaml deleted file mode 100644 index 422e8a1..0000000 --- a/workflow/configs/real_data_chromium.yaml +++ /dev/null @@ -1,38 +0,0 @@ -base: "/home/imallona/repeats/real_chromium" - -reference: - genome_fasta: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" - repeats_gtf: "/home/imallona/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" - filter_genic: true - -mode: real_data - -real_data: - technology: chromium - chemistry: 10xv3 - samples: - hct116_wt: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" - R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" - hct116_dko: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" - R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" - -aligners: - - starsolo - - kallisto - - alevin - -aligner_params: - starsolo: - multimapper_modes: ["unique", "multimapping"] - extra_args: "" - kallisto: - extra_args: "" - alevin: - extra_args: "" - -resources: - max_threads: 10 - max_mem_mb: 40000 diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index ea12b78..5a08495 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_chromium" -indices_base: 
"/home/imallona/repeats/results/shared" +base: "../results/simulation_chromium" +indices_base: "../results/shared" reference: assembly: hg38 @@ -8,20 +8,21 @@ reference: # Split repeats into genic/intergenic subsets using gene body overlap (bedtools) filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 seed: 42 + mutation_rate: 0.001 feature_sets: - repeats diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml new file mode 100644 index 0000000..d098ee1 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_0pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id 
+ - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml new file mode 100644 index 0000000..07b1052 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_10pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.10 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml new file mode 100644 index 0000000..7837c70 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_1pct.yaml 
@@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml new file mode 100644 index 0000000..fadf484 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_5pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.05 + +feature_sets: 
+ - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index b9a7698..541b150 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_smartseq2" -indices_base: "/home/imallona/repeats/results/shared" +base: "../results/simulation_smartseq2" +indices_base: "../results/shared" reference: assembly: hg38 @@ -7,18 +7,19 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 + mutation_rate: 0.001 # Feature sets to quantify against: # genes - Ensembl gene annotation diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml new file mode 100644 index 0000000..fcf2473 --- /dev/null +++ 
b/workflow/configs/simulation_smartseq2_noise_0pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml new file mode 100644 index 0000000..ee49fda --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.10 + 
+feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml new file mode 100644 index 0000000..e19d0cc --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml new 
file mode 100644 index 0000000..2c7756e --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.05 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/modules/bowtie2.snmk.bak b/workflow/modules/bowtie2.snmk.bak new file mode 100644 index 0000000..55a6a38 --- /dev/null +++ b/workflow/modules/bowtie2.snmk.bak @@ -0,0 +1,352 @@ +#!/usr/bin/env snakemake -s +""" +Bowtie2 alignment against the feature-set pseudo-genome, followed by featureCounts. + +Two independent chunking axes (both configurable, both default to 1 = no chunking): + featurecounts_cell_chunk_size - SmartSeq2 only; cells per featureCounts job + featurecounts_n_feature_chunks - both technologies; number of GTF pieces for featureCounts + +Cell chunking reduces the number of BAM handles per job. +Feature chunking reduces per-job memory when the annotation is large. 
+The merge step reassembles the full feature x cell matrix from all chunks. +""" + +import os.path as op +import math + +sim_technology = config.get('simulation', config.get('real_data', {})).get('technology', 'smartseq2') +bt2_cfg = config.get('aligner_params', {}).get('bowtie2', {}) +cell_chunk_size = bt2_cfg.get('featurecounts_cell_chunk_size', 50) +n_feature_chunks = bt2_cfg.get('featurecounts_n_feature_chunks', 1) +feature_chunk_ids = list(range(n_feature_chunks)) + + +def bowtie2_index_prefix(wildcards): + return op.join(config['base'], 'references', f'bowtie2_{wildcards.feature_set}', wildcards.feature_set) + + +def gtf_for_feature_set(wildcards): + if wildcards.feature_set == 'genes': + return op.join(config['base'], 'tmp', 'genes.gtf') + return op.join(config['base'], 'tmp', 'repeats.gtf') + + +def split_gtf_into_chunks(gtf_path, n_chunks, out_paths): + """Split a GTF into n_chunks files by gene_id, writing to out_paths.""" + gene_ids = [] + seen = set() + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + continue + parts = line.split('\t') + if len(parts) < 9: + continue + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + if gid not in seen: + gene_ids.append(gid) + seen.add(gid) + + chunk_size = max(1, math.ceil(len(gene_ids) / n_chunks)) + gene_id_to_chunk = {} + for idx, gid in enumerate(gene_ids): + gene_id_to_chunk[gid] = min(idx // chunk_size, n_chunks - 1) + + handles = [open(p, 'w') for p in out_paths] + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + for h in handles: + h.write(line) + continue + parts = line.split('\t') + if len(parts) < 9: + continue + gid = '' + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + break + chunk_idx = gene_id_to_chunk.get(gid, 0) + handles[chunk_idx].write(line) + for h in handles: + 
h.close() + + +if sim_technology == 'smartseq2': + + sim_cell_ids = [f'cell_{i + 1:03d}' for i in range( + config.get('simulation', {}).get('n_cells', 20))] + n_cell_chunks = max(1, math.ceil(len(sim_cell_ids) / cell_chunk_size)) + cell_chunks = [sim_cell_ids[i * cell_chunk_size:(i + 1) * cell_chunk_size] + for i in range(n_cell_chunks)] + cell_chunk_ids = list(range(n_cell_chunks)) + + rule bowtie2_align_smartseq2_cell: + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + 'bowtie2_' + wc.feature_set, wc.feature_set + '.1.bt2'), + fastq = (op.join(config['base'], 'simulations', 'smartseq2', '{cell_id}.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.cell_id]['R1']) + output: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_smartseq2_{cell_id}.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_smartseq2_{cell_id}.txt') + threads: 4 + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.fastq} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index {output.bam} + """ + + rule featurecounts_smartseq2_chunk: + """featureCounts on one cell chunk x one feature chunk.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bams = lambda wc: expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam'), + feature_set=wc.feature_set, + cell_id=cell_chunks[int(wc.cell_chunk_id)]), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 
'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.txt') + threads: 4 + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o {{params.tmp_counts}} " + f" -M --fraction " + f" {{input.bams}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_smartseq2: + """Merge all cell x feature chunk count tables into the full feature x cell matrix.""" + input: + chunks = expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv'), + allow_missing=True, + cell_chunk_id=cell_chunk_ids, + feature_chunk_id=feature_chunk_ids) + output: + merged = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', 'counts_merged.tsv') + run: + # Each chunk covers a subset of cells AND a subset of features. + # We group by cell_chunk_id to assemble the full feature set for each cell group, + # then merge cell groups horizontally. 
+ import re + + def parse_chunk_path(path): + m = re.search(r'cell(\d+)_feat(\d+)', path) + return int(m.group(1)), int(m.group(2)) + + def read_counts(path): + rows = {} + with open(path) as fh: + header = fh.readline().rstrip('\n').split('\t') + cells = header[1:] + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + return cells, rows + + # Group chunks: cell_chunk_id -> {feature_chunk_id: (cells, rows)} + cell_groups = {} + for path in input.chunks: + cc, fc = parse_chunk_path(path) + cells, rows = read_counts(path) + cell_groups.setdefault(cc, {})[fc] = (cells, rows) + + # Within each cell group, merge across feature chunks (vertically) + merged_groups = {} + for cc in sorted(cell_groups.keys()): + all_cells = cell_groups[cc][0][0] + all_rows = {} + for fc in sorted(cell_groups[cc].keys()): + cells, rows = cell_groups[cc][fc] + all_rows.update(rows) + merged_groups[cc] = (all_cells, all_rows) + + # Across cell groups, collect all features and merge horizontally + all_features = list(dict.fromkeys( + feat for _, rows in merged_groups.values() for feat in rows)) + all_cells = [cell for cells, _ in merged_groups.values() for cell in cells] + + with open(output.merged, 'w') as out: + out.write('feature_id\t' + '\t'.join(all_cells) + '\n') + for feat in all_features: + row_vals = [] + for cells, rows in merged_groups.values(): + feat_row = rows.get(feat, ['0'] * len(cells)) + row_vals.extend(feat_row) + out.write(feat + '\t' + '\t'.join(row_vals) + '\n') + +else: + + rule bowtie2_align_chromium_r2: + """Align R2 (cDNA) reads to the feature pseudo-genome.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + f'bowtie2_{wc.feature_set}', f'{wc.feature_set}.1.bt2'), + r2 = op.join(config['base'], 'simulations', 'chromium', 'R2.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R2'] + output: + bam = 
op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_chromium.txt') + threads: config['resources']['max_threads'] + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.r2} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index {output.bam} + """ + + rule tag_bam_with_barcodes_chromium: + """Attach CB and UMI tags from R1 to each read in the sorted BAM.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam'), + r1 = op.join(config['base'], 'simulations', 'chromium', 'R1.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R1'] + output: + tagged_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + params: + cb_len = config.get('simulation', config.get('real_data', {})).get('cb_length', 16), + umi_len = config.get('simulation', config.get('real_data', {})).get('umi_length', 12) + log: op.join(config['base'], 'logs', 'tag_bam_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'tag_bam_{feature_set}_chromium.txt') + shell: + """ + umi_tools extract \ + --bc-pattern={'C' * params.cb_len + 'N' * params.umi_len} \ + --stdin {input.r1} \ + --read2-in {input.bam} \ + --read2-out {output.tagged_bam} \ + --log {log} + """ + + rule umi_dedup_chromium: + """Deduplicate UMIs per cell per feature locus.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + output: + dedup_bam = op.join(config['base'], 'bowtie2', 
'{feature_set}', 'chromium', 'dedup.bam') + log: op.join(config['base'], 'logs', 'umi_dedup_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'umi_dedup_{feature_set}_chromium.txt') + shell: + """ + umi_tools dedup \ + --per-cell \ + --stdin {input} \ + --stdout {output.dedup_bam} \ + --log {log} + """ + + rule featurecounts_chromium_feature_chunk: + """featureCounts on the full chromium BAM, one feature chunk at a time.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam'), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.txt') + threads: config['resources']['max_threads'] + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o {{params.tmp_counts}} " + f" -M --fraction " + f" --byReadGroup " + f" {{input.bam}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_chromium: + """Concatenate feature-chunk count tables from the chromium run.""" + input: + chunks = expand( + 
op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv'), + allow_missing=True, + feature_chunk_id=feature_chunk_ids) + output: + counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'counts.tsv') + run: + header = None + rows = {} + for path in input.chunks: + with open(path) as fh: + h = fh.readline().rstrip('\n') + if header is None: + header = h + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + with open(output.counts, 'w') as out: + out.write(header + '\n') + for feat, vals in rows.items(): + out.write(feat + '\t' + '\t'.join(vals) + '\n') diff --git a/workflow/modules/download_references.snmk b/workflow/modules/download_references.snmk index 58b6533..0fe3a19 100644 --- a/workflow/modules/download_references.snmk +++ b/workflow/modules/download_references.snmk @@ -69,7 +69,7 @@ genome_fasta_url = urls_cfg.get('genome_fasta', default_genome_fasta_url) genes_gtf_url = urls_cfg.get('genes_gtf', default_genes_gtf_url) rmsk_flatfile_url = urls_cfg.get('rmsk_flatfile', default_rmsk_flatfile_url) -refs_dir = op.join(config['base'], 'refs') +refs_dir = op.join(config.get('indices_base', config['base']), 'refs') build_rmsk_script = op.join(workflow.basedir, 'scripts', 'build_rmsk_gtf.py') diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk index a83c559..9ae34bf 100644 --- a/workflow/modules/evaluation.snmk +++ b/workflow/modules/evaluation.snmk @@ -21,6 +21,7 @@ if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: bowtie2_modes = config.get('aligner_params', {}).get('bowtie2', {}).get( 'multimapper_modes', ['unique']) granularities = config.get('granularities', ['family_id']) +sim_mutation_rate = config.get('simulation', {}).get('mutation_rate', 0.0) evaluation_script = op.join(workflow.basedir, 'scripts', 'evaluate.py') counts_dir = op.join(config['base'], 'counts') eval_dir = op.join(config['base'], 'evaluation') @@ 
-81,6 +82,7 @@ rule evaluate_combination: granularity = 'locus|gene_id|family_id|class_id', multimapper_mode = 'unique|multi' params: + mutation_rate = sim_mutation_rate, benchmark_file = eval_benchmark, output_prefix = op.join(eval_dir, '{aligner}_{feature_set}_{granularity}_{multimapper_mode}') @@ -98,7 +100,8 @@ rule evaluate_combination: --feature-set {wildcards.feature_set} \ --locus-map {input.locus_map} \ --benchmark {params.benchmark_file} \ - --output-prefix {params.output_prefix} 2> {log} + --output-prefix {params.output_prefix} \ + --mutation-rate {params.mutation_rate} 2> {log} """ rule aggregate_global_metrics: """Concatenate all per-aligner global metric TSVs into one summary file.""" @@ -176,7 +179,9 @@ rule render_report: output: report = op.join(eval_dir, 'evaluation_report.html') params: - eval_dir = eval_dir, + mutation_rate = sim_mutation_rate, + eval_dir = op.abspath(eval_dir), + abs_report = op.abspath(op.join(eval_dir, 'evaluation_report.html')), rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd') log: op.join(config['base'], 'logs', 'render_report.log') @@ -184,7 +189,8 @@ rule render_report: """ Rscript -e "rmarkdown::render( '{params.rmd}', - output_file = '{output.report}', + output_file = '{params.abs_report}', + knit_root_dir = getwd(), params = list(eval_dir = '{params.eval_dir}') )" 2> {log} """ diff --git a/workflow/modules/normalize.snmk b/workflow/modules/normalize.snmk index 779b8e7..dc6fb2e 100644 --- a/workflow/modules/normalize.snmk +++ b/workflow/modules/normalize.snmk @@ -74,6 +74,7 @@ rule normalize_starsolo: python {workflow.basedir}/scripts/normalize_starsolo.py \ --raw-dir {input.raw_matrix_dir} \ {params.locus_map_arg} \ + --multimapper-mode {wildcards.multimapper_mode} \ --granularity {wildcards.granularity} \ --output {output.counts_tsv} 2> {log} """ diff --git a/workflow/modules/reference.snmk b/workflow/modules/reference.snmk index cd00f3c..0dae94a 100644 --- a/workflow/modules/reference.snmk +++ 
b/workflow/modules/reference.snmk @@ -66,7 +66,7 @@ def t2g_path(wildcards): rule decompress_genome: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genome_fasta'] - output: temp(op.join(config['base'], 'tmp', 'genome.fa')) + output: op.join(config['indices_base'], 'tmp', 'genome.fa') params: chroms = _chrom_list threads: workflow.cores @@ -84,7 +84,7 @@ rule decompress_genome: rule decompress_repeats_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['repeats_gtf'] - output: temp(op.join(config['base'], 'tmp', 'repeats.gtf')) + output: op.join(config['indices_base'], 'tmp', 'repeats.gtf') threads: workflow.cores shell: "pigz --decompress --keep --stdout --processes {threads} {input} | sed 's/^chr//' > {output}" @@ -92,7 +92,7 @@ rule decompress_repeats_gtf: rule decompress_genes_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genes_gtf'] - output: temp(op.join(config['base'], 'tmp', 'genes.gtf')) + output: op.join(config['indices_base'], 'tmp', 'genes.gtf') params: chroms = _chrom_list threads: workflow.cores @@ -196,7 +196,7 @@ rule extract_feature_set_fasta: gtf = gtf_for_repeat_feature_set output: fasta = op.join(config['indices_base'], 'indices', genome_tag, '{feature_set}.fa'), - bed = temp(op.join(config['base'], 'tmp', '{feature_set}.bed')) + bed = temp(op.join(config['indices_base'], 'tmp', '{feature_set}.bed')) wildcard_constraints: feature_set = 'repeats|genic_repeats|intergenic_repeats' log: diff --git a/workflow/modules/simulations.snmk b/workflow/modules/simulations.snmk index 443e291..1c231c1 100644 --- a/workflow/modules/simulations.snmk +++ b/workflow/modules/simulations.snmk @@ -38,6 +38,7 @@ rule simulate_smartseq2: n_expressed = sim_cfg.get('n_expressed_per_cell', 100), read_length = sim_cfg.get('read_length', 90), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' 
'.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -57,7 +58,8 @@ rule simulate_smartseq2: --read-length {params.read_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + {params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ @@ -80,6 +82,7 @@ rule simulate_chromium: cb_length = sim_cfg.get('cb_length', 16), umi_length = sim_cfg.get('umi_length', 12), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -101,5 +104,6 @@ rule simulate_chromium: --umi-length {params.umi_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + {params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk index 78df478..50416d7 100644 --- a/workflow/modules/starsolo.snmk +++ b/workflow/modules/starsolo.snmk @@ -97,6 +97,7 @@ rule starsolo_smartseq2: --soloCellReadStats None \ --soloUMIdedup NoDedup \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} @@ -163,6 +164,7 @@ rule starsolo_chromium: --soloUMIstart {params.umi_start} \ --soloUMIlen {params.umi_len} \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM CB UB \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} diff --git a/workflow/scripts/count_starsolo_locus.py b/workflow/scripts/count_starsolo_locus.py index 7296715..02486e0 100644 --- a/workflow/scripts/count_starsolo_locus.py +++ 
b/workflow/scripts/count_starsolo_locus.py @@ -17,17 +17,17 @@ count matrices produced by bowtie2/kallisto/alevin. Modes: - smartseq2 - CB:Z tag is the cell_id (set by STAR from the manifest). No UMI. - All reads per (CB, locus) are counted directly. + smartseq2 - Cell ID is the prefix of the read name before _r{n}_, + set by simulate_reads.py. All reads per (cell, locus) are counted. + No CB:Z tag is present in SmartSeq2 STARsolo BAMs. chromium - CB:Z = cell barcode, UB:Z = UMI. UMI deduplication is performed per (CB, locus) by counting distinct UMIs per cell. Scalability: - The input BAM is first sorted by CB tag (samtools sort -t CB) into a - temporary file. Reads are then streamed one cell at a time, so peak memory - is O(expressed_loci_per_cell x umis_per_locus) rather than O(all cells x all - loci x all UMIs). The outer accumulator (cb -> locus -> count) is sparse and - grows only with expressed (cell, locus) pairs. + Chromium: BAM is sorted by CB tag into a temporary file. Reads are streamed + one cell at a time, so peak memory is O(loci_per_cell x umis_per_locus). + SmartSeq2: BAM is read directly (sorted by coordinate). All cells are + accumulated simultaneously; with ~20 cells this is negligible memory. Output: feature x cell TSV (rows=locus_ids, cols=cell barcodes, first col=feature_id). """ @@ -35,6 +35,7 @@ import argparse import bisect import os +import re import subprocess import sys import tempfile @@ -48,16 +49,15 @@ def parse_locus_id(locus_id): start_0 is 0-based, end_0 is 0-based exclusive (= GTF end field value). 
""" try: - gene_part, coords = locus_id.split('::', 1) - # coords: chrom:start-end(strand) - colon_idx = coords.rfind(':') + gene_part, coords = locus_id.split("::", 1) + colon_idx = coords.rfind(":") chrom = coords[:colon_idx] - rest = coords[colon_idx + 1:] # start-end(strand) - dash_idx = rest.index('-') + rest = coords[colon_idx + 1:] + dash_idx = rest.index("-") start_0 = int(rest[:dash_idx]) - paren_idx = rest.index('(') + paren_idx = rest.index("(") end_0 = int(rest[dash_idx + 1:paren_idx]) - strand = rest[paren_idx + 1:rest.index(')')] + strand = rest[paren_idx + 1:rest.index(")")] return chrom, start_0, end_0, strand except (ValueError, IndexError): return None @@ -79,12 +79,12 @@ def load_intervals(fasta_path): chrom_intervals = defaultdict(list) with open(fasta_path) as fh: for line in fh: - if not line.startswith('>'): + if not line.startswith(">"): continue - header = line[1:].rstrip('\n').split()[0] - if '::' not in header: + header = line[1:].rstrip("\n").split()[0] + if "::" not in header: continue - locus_id = header.split('::', 1)[0] + locus_id = header.split("::", 1)[0] coords = parse_locus_id(header) if coords is None: continue @@ -127,52 +127,86 @@ def find_locus(chrom, pos, chrom_starts, chrom_intervals): def sort_bam_by_cb(bam_path, n_threads): """Sort BAM by CB tag into a tempfile. 
Returns tempfile path (caller must unlink).""" - tmpf = tempfile.NamedTemporaryFile(suffix='.bam', delete=False) + tmpf = tempfile.NamedTemporaryFile(suffix=".bam", delete=False) sorted_path = tmpf.name tmpf.close() - print(f'Sorting BAM by CB tag -> {sorted_path}', file=sys.stderr) + print(f"Sorting BAM by CB tag -> {sorted_path}", file=sys.stderr) result = subprocess.run( - ['samtools', 'sort', '-t', 'CB', '-@', str(n_threads), '-o', sorted_path, bam_path], + ["samtools", "sort", "-t", "CB", "-@", str(n_threads), "-o", sorted_path, bam_path], stderr=subprocess.PIPE ) if result.returncode != 0: os.unlink(sorted_path) sys.stderr.write(result.stderr.decode()) - sys.exit(f'samtools sort failed (exit {result.returncode})') + sys.exit(f"samtools sort failed (exit {result.returncode})") return sorted_path -def emit_cell(cb, cell_data, mode, counts): - """Merge one cell's accumulated data into the sparse counts dict.""" - if mode == 'chromium': - for locus_id, umis in cell_data.items(): - counts[cb][locus_id] = len(umis) - else: - for locus_id, n in cell_data.items(): - counts[cb][locus_id] = n +def process_smartseq2(bam_path, chrom_starts, chrom_intervals, multimapper_mode): + """ + Process SmartSeq2 BAM: cell_id is the prefix of QNAME before _r{n}_. + Reads are sorted by coordinate (interleaved across cells) so we accumulate + all cells simultaneously. 
+ """ + counts = defaultdict(lambda: defaultdict(int)) + all_cbs = set() + all_loci = set() + n_reads = 0 + n_assigned = 0 + proc = subprocess.Popen( + ["samtools", "view", bam_path], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + try: + for raw in proc.stdout: + line = raw.decode() + if line.startswith("@"): + continue + fields = line.split("\t") + if len(fields) < 11: + continue + flag = int(fields[1]) + if flag & 4 or flag & 2048: + continue + n_reads += 1 -def main(): - ap = argparse.ArgumentParser(description=__doc__) - ap.add_argument('--bam', required=True, - help='STARsolo Aligned.sortedByCoord.out.bam') - ap.add_argument('--fasta', required=True, - help='Feature FASTA with headers transcript_id::chrom:start-end(strand)') - ap.add_argument('--mode', required=True, choices=['smartseq2', 'chromium']) - ap.add_argument('--multimapper-mode', default='unique', - choices=['unique', 'multi'], - help='unique: NH=1 reads only; multi: all aligned reads') - ap.add_argument('--threads', type=int, default=1, - help='Threads for samtools sort (default: 1)') - ap.add_argument('--output', required=True) - args = ap.parse_args() + nh = None + for tag in fields[11:]: + if tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) + break + if multimapper_mode == "unique" and nh is not None and nh > 1: + continue - print(f'Loading intervals from {args.fasta}', file=sys.stderr) - chrom_starts, chrom_intervals = load_intervals(args.fasta) - n_loci = sum(len(v) for v in chrom_intervals.values()) - print(f' {n_loci} intervals on {len(chrom_intervals)} chromosomes', file=sys.stderr) + # cell_id is everything before _r{digits}_ in the QNAME + cb = re.sub(r"_r\d+_.*$", "", fields[0]) - sorted_bam = sort_bam_by_cb(args.bam, args.threads) + chrom = fields[2] + pos_0 = int(fields[3]) - 1 + locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + if locus_id is None: + continue + + counts[cb][locus_id] += 1 + all_cbs.add(cb) + all_loci.add(locus_id) + n_assigned += 1 
+ finally: + proc.kill() + proc.wait() + + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def process_chromium(bam_path, chrom_starts, chrom_intervals, multimapper_mode, n_threads): + """ + Process Chromium BAM: sort by CB tag, stream one cell at a time to cap memory. + UMI deduplication per (CB, locus). + """ + sorted_bam = sort_bam_by_cb(bam_path, n_threads) counts = defaultdict(lambda: defaultdict(int)) all_cbs = set() @@ -184,83 +218,108 @@ def main(): cell_data = {} proc = subprocess.Popen( - ['samtools', 'view', sorted_bam], + ["samtools", "view", sorted_bam], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) try: for raw in proc.stdout: line = raw.decode() - if line.startswith('@'): + if line.startswith("@"): continue - fields = line.split('\t') + fields = line.split("\t") if len(fields) < 11: continue flag = int(fields[1]) if flag & 4 or flag & 2048: continue n_reads += 1 - chrom = fields[2] - pos_0 = int(fields[3]) - 1 cb = None ub = None nh = None for tag in fields[11:]: - if tag.startswith('CB:Z:'): - cb = tag[5:].rstrip('\n') - elif tag.startswith('UB:Z:'): - ub = tag[5:].rstrip('\n') - elif tag.startswith('NH:i:'): - nh = int(tag[5:].rstrip('\n')) + if tag.startswith("CB:Z:"): + cb = tag[5:].rstrip("\n") + elif tag.startswith("UB:Z:"): + ub = tag[5:].rstrip("\n") + elif tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) if cb is None: continue - if args.multimapper_mode == 'unique' and nh is not None and nh > 1: + if multimapper_mode == "unique" and nh is not None and nh > 1: continue - locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + locus_id = find_locus(fields[2], int(fields[3]) - 1, chrom_starts, chrom_intervals) if locus_id is None: continue if cb != current_cb: if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in 
cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) current_cb = cb - cell_data = defaultdict(set) if args.mode == 'chromium' else defaultdict(int) + cell_data = defaultdict(set) + cell_data[locus_id].add(ub if ub else str(n_reads)) all_loci.add(locus_id) n_assigned += 1 - - if args.mode == 'chromium': - cell_data[locus_id].add(ub if ub else str(n_reads)) - else: - cell_data[locus_id] += 1 finally: proc.kill() proc.wait() if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) os.unlink(sorted_bam) - print(f' {n_reads} reads processed, {n_assigned} assigned to loci', file=sys.stderr) - print(f' {len(all_cbs)} cells, {len(all_loci)} loci', file=sys.stderr) + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--bam", required=True, + help="STARsolo Aligned.sortedByCoord.out.bam") + ap.add_argument("--fasta", required=True, + help="Feature FASTA with headers transcript_id::chrom:start-end(strand)") + ap.add_argument("--mode", required=True, choices=["smartseq2", "chromium"]) + ap.add_argument("--multimapper-mode", default="unique", + choices=["unique", "multi"], + help="unique: NH=1 reads only; multi: all aligned reads") + ap.add_argument("--threads", type=int, default=1, + help="Threads for samtools sort (Chromium only; default: 1)") + ap.add_argument("--output", required=True) + args = ap.parse_args() + + print(f"Loading intervals from {args.fasta}", file=sys.stderr) + chrom_starts, chrom_intervals = load_intervals(args.fasta) + n_loci = sum(len(v) for v in chrom_intervals.values()) + print(f" {n_loci} intervals on {len(chrom_intervals)} chromosomes", file=sys.stderr) + + if 
args.mode == "smartseq2": + counts, all_cbs, all_loci = process_smartseq2( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode) + else: + counts, all_cbs, all_loci = process_chromium( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode, args.threads) sorted_loci = sorted(all_loci) sorted_cbs = sorted(all_cbs) - with open(args.output, 'w') as fh: - fh.write('feature_id\t' + '\t'.join(sorted_cbs) + '\n') + with open(args.output, "w") as fh: + fh.write("feature_id\t" + "\t".join(sorted_cbs) + "\n") for locus_id in sorted_loci: - row = [str(counts[cb].get(locus_id, 0)) for cb in sorted_cbs] - fh.write(locus_id + '\t' + '\t'.join(row) + '\n') + row = [str(counts.get(cb, {}).get(locus_id, 0)) for cb in sorted_cbs] + fh.write(locus_id + "\t" + "\t".join(row) + "\n") - print(f'wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}', + print(f"wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}", file=sys.stderr) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py index 3bd0ef1..8f868b6 100644 --- a/workflow/scripts/evaluate.py +++ b/workflow/scripts/evaluate.py @@ -17,7 +17,7 @@ from scipy import stats -def load_ground_truth(gt_path, granularity='gene_id'): +def load_ground_truth(gt_path, granularity='gene_id', valid_locus_ids=None): """ Returns: truth: {cell_id: {feature_id: count}} @@ -58,6 +58,11 @@ def load_ground_truth(gt_path, granularity='gene_id'): truth = _dd(lambda: _dd(int)) repeat_meta = {} for r in raw: + # Filter at locus level before aggregating so cross-partition + # gene_ids (e.g. AluSz6 in both genic and intergenic) do not + # inflate truth counts for a subset evaluation. 
+ if valid_locus_ids is not None and r['locus_id'] not in valid_locus_ids: + continue feat = r[key_col] truth[r['cell_id']][feat] += r['true_count'] repeat_meta[feat] = (r['family_id'], r['class_id']) @@ -223,35 +228,40 @@ def main(): help='4-col TSV (transcript_id, gene_id, family_id, class_id). ' 'Expands the feature universe for specificity and restricts ' 'ground truth to this feature_set\'s loci.') + ap.add_argument('--mutation-rate', type=float, default=0.0, + help='Mutation rate used in the simulation (for output labelling)') ap.add_argument('--output-prefix', required=True) args = ap.parse_args() - print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', - file=sys.stderr) - truth, repeat_meta = load_ground_truth(args.ground_truth, granularity=args.granularity) - - print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) - observed, matrix_features = load_count_matrix(args.observed_counts) - - # Build feature universe from locus_map, and optionally filter truth to it. - # This ensures that for genic_repeats / intergenic_repeats, ground truth - # features outside the feature_set are not counted as false negatives. + # Load locus map FIRST so truth is filtered at locus level before aggregation. + # Without this, at gene_id granularity a gene_id such as AluSz6 that has copies + # in both genic and intergenic regions carries intergenic counts into a + # genic_repeats evaluation (and vice-versa), inflating truth vs observed. 
locus_map_features = set() + valid_locus_ids = None if args.locus_map and os.path.exists(args.locus_map): + valid_locus_ids = set() col = {'locus': 0, 'gene_id': 1, 'family_id': 2, 'class_id': 3}.get( args.granularity, 1) with open(args.locus_map) as fh: for line in fh: parts = line.rstrip('\n').split('\t') + if not parts or not parts[0]: + continue + valid_locus_ids.add(parts[0]) # column 0 is always transcript_id if len(parts) > col: locus_map_features.add(parts[col]) - print(f' Locus map loaded: {len(locus_map_features)} features at ' - f'{args.granularity} level', file=sys.stderr) - # Filter ground truth to this feature_set's loci - truth = { - cell: {f: c for f, c in feats.items() if f in locus_map_features} - for cell, feats in truth.items() - } + print(f' Locus map loaded: {len(valid_locus_ids)} loci / ' + f'{len(locus_map_features)} features at {args.granularity} level', + file=sys.stderr) + + print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', + file=sys.stderr) + truth, repeat_meta = load_ground_truth( + args.ground_truth, granularity=args.granularity, valid_locus_ids=valid_locus_ids) + + print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) + observed, matrix_features = load_count_matrix(args.observed_counts) feature_universe = ( matrix_features | @@ -269,6 +279,7 @@ def main(): global_metrics['multimapper_mode'] = args.multimapper_mode global_metrics['feature_set'] = args.feature_set global_metrics['granularity'] = args.granularity + global_metrics['mutation_rate'] = args.mutation_rate bench = load_benchmark(args.benchmark) global_metrics.update(bench) @@ -281,6 +292,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate write_tsv(per_cell_rows, args.output_prefix + '_per_cell_metrics.tsv', fallback_fields=['cell_id', 'pearson_r', 'spearman_r', 
'n_truth_expressed', 'n_observed_expressed', @@ -302,6 +314,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate per_family_rows.append(row) write_tsv(per_family_rows, args.output_prefix + '_per_family_metrics.tsv', fallback_fields=['class_id', 'aligner', 'multimapper_mode', 'feature_set', diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd index d0a0e4e..dd97006 100644 --- a/workflow/scripts/evaluation_report.Rmd +++ b/workflow/scripts/evaluation_report.Rmd @@ -13,7 +13,7 @@ params: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 12, fig.height = 12) library(ggplot2) library(dplyr) @@ -99,10 +99,10 @@ All metrics compare the aligner count matrix against the simulation ground truth - **Specificity**: fraction of truly unexpressed (feature, cell) pairs correctly assigned zero by the aligner. -Rows = aligner (+ multimapper mode), columns = metric. Each bar is a -feature_set x granularity combination. +Rows = metric, columns = feature_set. Each point is one aligner at one granularity; +x-axis = granularity, colour = aligner (+ multimapper mode). 
-```{r global-accuracy, fig.height=14, fig.width=18} +```{r global-accuracy, fig.height=18, fig.width=14} acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", "jaccard", "specificity") @@ -110,25 +110,23 @@ acc_long <- global %>% select(aligner_mode, feature_set, gran_label, all_of(acc_metrics)) %>% pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% filter(!is.na(value)) %>% - mutate( - metric = factor(metric, levels = acc_metrics), - fs_gran = paste0(feature_set, "\n", gran_label) - ) - -n_fg <- length(unique(acc_long$fs_gran)) -pal <- make_palette(n_fg) - -ggplot(acc_long, aes(x = fs_gran, y = value, fill = fs_gran)) + - geom_col(position = "dodge", width = 0.8) + - facet_grid(aligner_mode ~ metric, scales = "free_x") + - scale_fill_manual(values = pal, name = "feature_set / granularity") + - scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.5, 1)) + - labs(x = NULL, y = NULL, + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(acc_long$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(acc_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, title = "global accuracy metrics", - subtitle = "rows = aligner, columns = metric") + + subtitle = "rows = metric, columns = feature_set; x = granularity; colour = aligner") + theme_bw(base_size = 10) + theme(aspect.ratio = 1, - axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + axis.text.x = element_text(angle = 40, hjust = 1, size = 8), strip.text.y = element_text(size = 8), strip.text.x = element_text(size = 8), legend.position = "bottom", @@ -145,9 +143,8 @@ if (nrow(det) > 0) { pal_pr <- make_palette(n_lab) ggplot(det, aes(x = recall, y = precision, - 
colour = aligner_mode, shape = feature_set, size = f1)) + + colour = aligner_mode, shape = feature_set)) + geom_point(alpha = 0.85) + - scale_size_continuous(range = c(2, 7), name = "F1") + scale_colour_manual(values = pal_pr, name = "Aligner (mode)") + scale_shape_manual(values = c(16, 17, 15, 3, 7, 8)[seq_len( length(unique(det$feature_set)))], @@ -155,7 +152,7 @@ if (nrow(det) > 0) { facet_wrap(~gran_label, nrow = 1) + xlim(0, 1) + ylim(0, 1) + geom_abline(slope = 1, intercept = 0, linetype = "dotted", colour = "grey60") + - labs(title = "precision vs recall (size = F1)", + labs(title = "precision vs recall", subtitle = "facets = granularity", x = "Recall", y = "Precision") + theme_bw(base_size = 11) + @@ -194,6 +191,15 @@ ggplot(corr, aes(x = gran_label, y = r, ## log1p RMSE +log1p RMSE is computed over all (feature, cell) pairs in the feature universe. +At locus level there are thousands of features, most with zero counts in both +truth and observed; the dominance of (0, 0) pairs suppresses the average squared +error so RMSE looks low. At class_id level there are only ~10-20 features, each +accumulating large aggregated counts, so per-entry log1p differences are larger +and RMSE rises. The gradient locus < gene_id < family_id < class_id is therefore +expected and not a mistake - lower locus RMSE reflects sparsity, not better +quantification accuracy. 
+ ```{r rmse, fig.height=5} if ("log1p_rmse" %in% names(global) && any(!is.na(global$log1p_rmse))) { n_al2 <- length(unique(global$aligner_mode)) @@ -275,6 +281,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "F1", title = "F1 per repeat class", @@ -290,6 +297,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "pearson r", title = "Pearson r per repeat class") + @@ -322,7 +330,7 @@ if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) { colour = aligner_mode, group = aligner_mode)) + geom_line(linetype = "dashed", linewidth = 0.6) + geom_point(size = 2) + - facet_grid(resource ~ feature_set, scales = "free_y") + + facet_grid(feature_set ~ resource, scales = "free_y") + scale_colour_manual(values = pal_res, name = "Aligner (mode)") + labs(x = "granularity", y = NULL, title = "compute resource usage") + theme_bw(base_size = 10) + diff --git a/workflow/scripts/noise_sweep_report.Rmd b/workflow/scripts/noise_sweep_report.Rmd new file mode 100644 index 0000000..c1a9602 --- /dev/null +++ b/workflow/scripts/noise_sweep_report.Rmd @@ -0,0 +1,303 @@ +--- +title: "repeat element quantification - noise sweep report" +author: "Izaskun Mallona" +date: "`r Sys.Date()`" +output: + html_document: + theme: flatly + toc: true + toc_float: true + code_folding: hide +params: + eval_dirs: "." 
+--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, + fig.width = 14, fig.height = 10) +library(ggplot2) +library(dplyr) +library(tidyr) +library(readr) +library(scales) +library(patchwork) + +make_palette <- function(n) { + if (n <= 8) return(RColorBrewer::brewer.pal(max(3, n), "Dark2")[seq_len(n)]) + colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(n) +} + +eval_dirs <- strsplit(params$eval_dirs, ",")[[1]] +eval_dirs <- trimws(eval_dirs) +``` + +```{r load-data} +load_metrics_from_dirs <- function(dirs, filename) { + purrr::map_dfr(dirs, function(d) { + p <- file.path(d, filename) + if (!file.exists(p)) return(NULL) + read_tsv(p, show_col_types = FALSE) + }) +} + +load_pattern_from_dirs <- function(dirs, pattern) { + purrr::map_dfr(dirs, function(d) { + files <- list.files(d, pattern = pattern, full.names = TRUE) + if (length(files) == 0) return(NULL) + purrr::map_dfr(files, ~ read_tsv(.x, show_col_types = FALSE)) + }) +} + +global <- load_metrics_from_dirs(eval_dirs, "summary_global_metrics.tsv") + +num_cols <- c("pearson_r", "spearman_r", "log1p_rmse", + "precision", "recall", "f1", "jaccard", "specificity", + "n_cells", "n_features", "mutation_rate", + "wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb") +for (col in num_cols) { + if (col %in% names(global)) + global[[col]] <- suppressWarnings(as.numeric(global[[col]])) +} + +if (!"granularity" %in% names(global)) global$granularity <- "gene_id" +if (!"feature_set" %in% names(global)) global$feature_set <- "repeats" +if (!"multimapper_mode" %in% names(global)) global$multimapper_mode <- "unique" +if (!"mutation_rate" %in% names(global)) global$mutation_rate <- NA_real_ + +global <- global %>% + mutate( + aligner_mode = paste0(aligner, " (", multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%") + ) + +per_cell <- 
tryCatch(load_pattern_from_dirs(eval_dirs, "_per_cell_metrics\\.tsv$"), + error = function(e) NULL) +per_family <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_family_metrics\\.tsv$"), + error = function(e) NULL) + +for (col in c("pearson_r", "spearman_r", "mutation_rate")) { + if (!is.null(per_cell) && col %in% names(per_cell)) + per_cell[[col]] <- suppressWarnings(as.numeric(per_cell[[col]])) + if (!is.null(per_family) && col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) +} + +if (!is.null(per_family) && nrow(per_family) > 0) { + for (col in c("f1", "precision", "recall", "jaccard", "specificity")) { + if (col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) + } + if (!"multimapper_mode" %in% names(per_family)) per_family$multimapper_mode <- "unique" + per_family <- per_family %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")")) +} +``` + +--- + +This report combines evaluation results across multiple noise levels (simulated nucleotide +substitution rates). Each `eval_dirs` entry corresponds to one pipeline run with a distinct +`mutation_rate` in the simulation config. The `mutation_rate` column in the metrics TSVs +identifies each run. + +Noise levels found in the data: `r sort(unique(global$mutation_rate))` + +--- + +## metric degradation by noise level + +How each aligner's accuracy changes as the per-base mutation rate increases. +x-axis = mutation rate, y-axis = metric value, colour = aligner (+ multimapper mode). 
+ +```{r degradation-main, fig.height=20, fig.width=14} +acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity") + +deg <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, all_of(acc_metrics)) %>% + pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(deg$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(deg, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linewidth = 0.6) + + facet_grid(metric ~ feature_set + gran_label, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + labs(x = "mutation rate", y = NULL, + title = "metric degradation with increasing noise", + subtitle = "rows = metric; columns = feature_set x granularity; x = mutation rate") + + theme_bw(base_size = 9) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 7), + legend.position = "bottom", + legend.text = element_text(size = 8)) +``` + +## pearson r and F1 — noise overview + +Summary view focusing on the two most interpretable metrics. 
+ +```{r overview-two-metrics, fig.height=10, fig.width=14} +ov <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, pearson_r, f1) %>% + pivot_longer(c(pearson_r, f1), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) + +n_al2 <- length(unique(ov$aligner_mode)) +pal2 <- make_palette(n_al2) + +ggplot(ov, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 3, alpha = 0.85) + + geom_line(linewidth = 0.8) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal2, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + ylim(0, 1) + + labs(x = "mutation rate", y = NULL, + title = "Pearson r and F1 across noise levels", + subtitle = "rows = metric, columns = feature_set; lines connect same aligner across mutation rates") + + theme_bw(base_size = 11) + + theme(axis.text.x = element_text(angle = 25, hjust = 1), + legend.position = "bottom") +``` + +## global accuracy metrics per noise level + +Full metric grid from the standard report, faceted by mutation rate. 
+ +```{r global-faceted, fig.height=22, fig.width=16} +full_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity", "log1p_rmse") + +full_long <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, noise_label, + all_of(full_metrics)) %>% + pivot_longer(all_of(full_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = full_metrics)) + +n_al3 <- length(unique(full_long$aligner_mode)) +pal3 <- make_palette(n_al3) + +ggplot(full_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2, alpha = 0.8) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ noise_label + feature_set, scales = "free_y") + + scale_colour_manual(values = pal3, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, + title = "global accuracy metrics", + subtitle = "rows = metric; columns = mutation rate x feature_set; x = granularity") + + theme_bw(base_size = 8) + + theme(aspect.ratio = 1, + axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 6), + legend.position = "bottom", + legend.text = element_text(size = 7)) +``` + +## per-cell correlations by noise level + +```{r per-cell-noise, fig.height=10, fig.width=14} +if (!is.null(per_cell) && nrow(per_cell) > 0 && "mutation_rate" %in% names(per_cell)) { + pc <- per_cell %>% + mutate(across(c(pearson_r, spearman_r), ~ suppressWarnings(as.numeric(.)))) %>% + filter(!is.na(aligner), !is.na(mutation_rate)) + + if (!"multimapper_mode" %in% names(pc)) pc$multimapper_mode <- "unique" + if (!"feature_set" %in% names(pc)) pc$feature_set <- "repeats" + + pc <- pc %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = 
paste0(mutation_rate * 100, "%")) + + pc_long <- pc %>% + pivot_longer(c(pearson_r, spearman_r), names_to = "metric", values_to = "r") %>% + filter(!is.na(r)) + + n_al4 <- length(unique(pc_long$aligner_mode)) + pal4 <- make_palette(n_al4) + + ggplot(pc_long, aes(x = aligner_mode, y = r, fill = aligner_mode)) + + geom_violin(alpha = 0.7, draw_quantiles = c(0.25, 0.5, 0.75)) + + geom_jitter(width = 0.1, size = 0.5, alpha = 0.4) + + facet_grid(metric + gran_label ~ noise_label + feature_set) + + scale_fill_manual(values = pal4) + + labs(x = NULL, y = "correlation", + title = "per-cell correlations by noise level", + subtitle = "rows = metric x granularity; columns = mutation rate x feature_set") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), + legend.position = "none", + strip.text = element_text(size = 6)) +} else { + cat("No per-cell metrics with mutation_rate column found.\n") +} +``` + +## per-repeat-class F1 by noise level + +```{r per-family-noise, fig.height=14, fig.width=14} +if (!is.null(per_family) && nrow(per_family) > 0 && + "class_id" %in% names(per_family) && "mutation_rate" %in% names(per_family)) { + + pf <- per_family %>% + filter(!is.na(class_id), class_id != "unknown", !is.na(mutation_rate)) %>% + mutate(gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%")) + + n_al5 <- length(unique(pf$aligner_mode)) + pal5 <- make_palette(n_al5) + + p_f1 <- ggplot(pf, aes(x = class_id, y = f1, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "F1", + title = "F1 per repeat class by noise level", + subtitle = "rows = granularity; columns = mutation rate x feature_set") 
+ + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "bottom", + legend.text = element_text(size = 7), + strip.text = element_text(size = 6)) + + p_r <- ggplot(pf, aes(x = class_id, y = pearson_r, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "pearson r") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "none", + strip.text = element_text(size = 6)) + + p_f1 / p_r +} else { + cat("No per-family metrics with class_id and mutation_rate found.\n") +} +``` + +--- +*Generated by [noise_sweep_report.Rmd](../workflow/scripts/noise_sweep_report.Rmd)* diff --git a/workflow/scripts/normalize_starsolo.py b/workflow/scripts/normalize_starsolo.py index 3bba451..5091baf 100644 --- a/workflow/scripts/normalize_starsolo.py +++ b/workflow/scripts/normalize_starsolo.py @@ -27,11 +27,11 @@ def read_lines(path): return [line.rstrip('\n') for line in fh] -def load_starsolo_raw(raw_dir): +def load_starsolo_raw(raw_dir, mtx_name='matrix.mtx'): barcodes = read_lines(os.path.join(raw_dir, 'barcodes.tsv')) feature_lines = read_lines(os.path.join(raw_dir, 'features.tsv')) feature_ids = [line.split('\t')[0] for line in feature_lines] - mat = scipy.io.mmread(os.path.join(raw_dir, 'matrix.mtx')).tocsc() + mat = scipy.io.mmread(os.path.join(raw_dir, mtx_name)).tocsc() return barcodes, feature_ids, mat @@ -94,11 +94,16 @@ def main(): ap.add_argument('--output', required=True, help='Output TSV path') ap.add_argument('--locus-map', default=None, help='TSV: transcript_id, gene_id, family_id, class_id (no header)') + ap.add_argument('--multimapper-mode', default='unique', + choices=['unique', 'multi']) 
ap.add_argument('--granularity', default='gene_id', choices=['gene_id', 'family_id', 'class_id']) args = ap.parse_args() - barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir) + em_mtx = os.path.join(args.raw_dir, 'UniqueAndMult-EM.mtx') + mtx_name = 'UniqueAndMult-EM.mtx' if args.multimapper_mode == 'multi' and os.path.exists(em_mtx) else 'matrix.mtx' + print('Using matrix: {}'.format(mtx_name), file=sys.stderr) + barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir, mtx_name) print('{} features, {} barcodes'.format(len(feature_ids), len(barcodes)), file=sys.stderr) gene_to_group = None diff --git a/workflow/scripts/simulate_reads.py b/workflow/scripts/simulate_reads.py index d2b8d55..b322655 100644 --- a/workflow/scripts/simulate_reads.py +++ b/workflow/scripts/simulate_reads.py @@ -111,14 +111,14 @@ def reverse_complement(seq): return seq.translate(table)[::-1] -def sample_subseq(seq, read_length, rng): +def sample_subseq(seq, read_length, rng, mutation_rate=0.001): if len(seq) <= read_length: return (seq + 'N' * read_length)[:read_length] offset = rng.randint(0, len(seq) - read_length) read = seq[offset:offset + read_length] bases = list(read) for i in range(len(bases)): - if rng.random() < 0.001: + if rng.random() < mutation_rate: bases[i] = rng.choice('ACGT') return ''.join(bases) @@ -178,7 +178,7 @@ def build_chrom_locus_coords(cell_plan): return chrom_loci -def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): +def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -200,7 +200,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): fq = cell_handles[cell_id] read_idx = cell_read_counts[cell_id] for i in range(count): - read = sample_subseq(repeat_seq, read_length, rng) + read = sample_subseq(repeat_seq, read_length, rng, 
mutation_rate) fq.write(f'@{cell_id}_r{read_idx + i}_{safe_id(locus_id)}\n' f'{read}\n+\n{make_qual(len(read))}\n') cell_read_counts[cell_id] += count @@ -215,7 +215,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): return cell_fastq_paths, ground_truth -def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng): +def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -263,7 +263,7 @@ def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_le used_umis.add(umi) read_id = f'r{total_reads}_{barcode[:8]}_{umi}_{safe_id(locus_id)}' r1_seq = barcode + umi - r2_seq = sample_subseq(repeat_seq, read_length, rng) + r2_seq = sample_subseq(repeat_seq, read_length, rng, mutation_rate) r1f.write(f'@{read_id}\n{r1_seq}\n+\n{make_qual(len(r1_seq))}\n') r2f.write(f'@{read_id}\n{r2_seq}\n+\n{make_qual(len(r2_seq))}\n') total_reads += 1 @@ -323,6 +323,8 @@ def main(): ap.add_argument('--max-repeats-per-chrom', type=int, default=None, help='Cap intervals per chromosome to limit memory use on full genomes') ap.add_argument('--seed', type=int, default=42) + ap.add_argument('--mutation-rate', type=float, default=0.001, + help='Per-base substitution probability for simulated reads (default 0.001)') args = ap.parse_args() rng = random.Random(args.seed) @@ -353,7 +355,7 @@ def main(): if args.mode == 'smartseq2': print(f'Simulating SmartSeq2 reads (streaming FASTA by chrom)', file=sys.stderr) cell_fastq_paths, ground_truth = simulate_smartseq2( - args.fasta, cell_plan, args.read_length, args.outdir, rng) + args.fasta, cell_plan, args.read_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_id') manifest_path = 
os.path.join(args.outdir, 'manifest.tsv') write_smartseq2_manifest(cell_fastq_paths, manifest_path) @@ -363,7 +365,7 @@ def main(): print(f'Simulating Chromium reads (streaming FASTA by chrom)', file=sys.stderr) (r1, r2), ground_truth = simulate_chromium( args.fasta, cell_plan, args.read_length, - args.cb_length, args.umi_length, args.outdir, rng) + args.cb_length, args.umi_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_barcode') print(f'R1: {r1}', file=sys.stderr) print(f'R2: {r2}', file=sys.stderr)