imallona · imallona · Mar 23, 2026 · Mar 22, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,23 @@
+[run]
+source = workflow/scripts
+
+# Scripts requiring subprocess calls to samtools/BAM files or real aligner
+# output matrices cannot be meaningfully unit-tested without bioinformatics
+# infrastructure. Omit them so the coverage threshold applies only to
+# logic that can be exercised with synthetic data.
+omit =
+    workflow/scripts/count_pseudo_genome.py
+    workflow/scripts/count_pseudo_genome_chromium.py
+    workflow/scripts/normalize_starsolo.py
+    workflow/scripts/normalize_alevin_chromium.py
+    workflow/scripts/normalize_alevin_smartseq2.py
+    workflow/scripts/normalize_kallisto_chromium.py
+    workflow/scripts/normalize_kallisto_smartseq2_granular.py
+    workflow/scripts/merge_featurecounts.py
+    workflow/scripts/tag_bam_chromium.py
+    workflow/scripts/split_gtf_chunks.py
+
+[report]
+exclude_lines =
+    pragma: no cover
+    if __name__ == .__main__.:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,125 @@
+name: Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [master, dev]
+
+jobs:
+  # Unit and integration tests. Installs only pytest, pytest-cov and scipy via pip; no conda env is created.
+  unit-and-integration:
+    name: Unit and integration tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Python dependencies
+        run: pip install pytest pytest-cov scipy
+
+      - name: Run unit and integration tests  # step fails when coverage of workflow/scripts is below 70%
+        run: |
+          pytest test/unit/ test/integration/ \
+            -v \
+            --tb=short \
+            --cov=workflow/scripts \
+            --cov-report=term-missing \
+            --cov-fail-under=70
+
+  # Snakemake dry-runs for three configs, then the pytest tests marked "workflow". Snakemake is installed via micromamba; --dry-run executes no rules.
+  snakemake-dryrun:
+    name: Snakemake dry-run
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: latest
+          environment-name: snakemake
+          create-args: >-
+            -c bioconda
+            -c conda-forge
+            python=3.11
+            snakemake>=8
+            pytest
+            pytest-cov
+            scipy
+          init-shell: bash
+
+      - name: Dry-run main Snakefile with SmartSeq2 config
+        shell: bash -el {0}  # -e: stop on first error; -l: login shell, presumably so the init-shell setup activates the env (confirm in setup-micromamba docs)
+        working-directory: workflow
+        run: |
+          snakemake \
+            --configfile configs/simulation_smartseq2.yaml \
+            --dry-run \
+            --rerun-triggers mtime \
+            --quiet --cores 2
+
+      - name: Dry-run main Snakefile with Chromium config
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            --configfile configs/simulation_chromium.yaml \
+            --dry-run \
+            --rerun-triggers mtime \
+            --quiet --cores 2
+
+      - name: Dry-run test Snakefile with negative control config
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            -s ../test/workflow/Snakefile_test \
+            --configfile ../test/workflow/configs/test_negative_control.yaml \
+            --dry-run \
+            --rerun-triggers mtime \
+            --quiet --cores 2
+
+      - name: Run workflow dry-run pytest tests  # runs from the repository root (no working-directory), selecting tests marked "workflow"
+        shell: bash -el {0}
+        run: |
+          pytest test/workflow/test_snakemake_dryrun.py \
+            -v --tb=short -m workflow
+
+  # Full negative-control run; started manually (workflow_dispatch) only.
+  # Requires reference data on the runner.
+  negative-control-run:
+    name: Negative control full run
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch'
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: latest
+          environment-name: snakemake
+          # conda is listed explicitly: --use-conda needs a conda frontend on PATH, and this env would otherwise provide neither conda nor mamba.
+          create-args: >-
+            -c bioconda
+            -c conda-forge
+            python=3.11
+            snakemake>=8
+            conda
+          init-shell: bash
+
+      - name: Run negative control workflow
+        shell: bash -el {0}
+        working-directory: workflow
+        run: |
+          snakemake \
+            -s ../test/workflow/Snakefile_test \
+            --configfile ../test/workflow/configs/test_negative_control.yaml \
+            --use-conda --conda-frontend conda \
+            --cores 2
diff --git a/Makefile b/Makefile
@@ -0,0 +1,142 @@
+SHELL   := /bin/bash
+.DEFAULT_GOAL := all
+
+# Remove a target whose recipe failed, so that a partially written report is
+# never mistaken for an up-to-date one on the next run.
+.DELETE_ON_ERROR:
+
+# Every simulation target drives Snakemake inside the same $(WF) directory and
+# already parallelises through --cores, and the noise sweeps and reports need
+# results of earlier targets that are not declared as prerequisites. Keep make
+# itself serial so `make -j` cannot run them concurrently or out of order.
+.NOTPARALLEL:
+
+# User-tunable knobs (override on the command line, e.g. make CORES=20):
+#   CORES      cores handed to snakemake
+#   CONDA_SH   conda shell-init script sourced before each command
+#   CONDA_ENV  conda environment activated for snakemake and Rscript
+CORES     ?= 10
+CONDA_SH  ?= ~/miniconda3/etc/profile.d/conda.sh
+CONDA_ENV ?= snakemake
+WF      := workflow
+RESULTS := results
+
+# Command prefixes: each one changes into $(WF) and activates $(CONDA_ENV) first.
+CONDA_RUN := source $(CONDA_SH) && conda activate $(CONDA_ENV) &&
+SM        := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime
+RSCRIPT   := cd $(WF) && $(CONDA_RUN) Rscript
+
+# noise sweep eval dirs (absolute paths built from $(CURDIR), comma-separated for the Rmd param)
+NOISE_EDIRS_SS2 := $(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation
+NOISE_EDIRS_CHR := $(CURDIR)/$(RESULTS)/simulation_chromium_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_10pct/evaluation
+
+# output HTML files (relative to project root)
+NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html
+NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html
+
+.PHONY: all \
+        simulation_smartseq2 simulation_chromium \
+        noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct \
+        noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct \
+        report_noise_smartseq2 report_noise_chromium reports_noise \
+        help
+
+# -----------------------------------------------------------------------
+# base simulations
+# -----------------------------------------------------------------------
+
+# Each base target is named after its config file, so one recipe serves both:
+# $@ expands to the target being built (e.g. configs/simulation_chromium.yaml).
+# A multi-target header declares one independent rule per listed target.
+simulation_smartseq2 simulation_chromium:
+	$(SM) --configfile configs/$@.yaml
+
+# -----------------------------------------------------------------------
+# noise sweep — SmartSeq2
+# note: genome refs are reused from the base simulation_smartseq2 run;
+#       alignment indices are shared via results/shared/ (never rebuilt)
+# -----------------------------------------------------------------------
+
+# Individual SmartSeq2 noise levels. A single static pattern rule covers all of
+# them: the stem matched by % (0pct, 1pct, 5pct or 10pct) picks the config file
+# configs/simulation_smartseq2_noise_<stem>.yaml. Static pattern rules are
+# explicit rules, so they still apply to targets listed under .PHONY.
+NOISE_SS2_TARGETS := noise_smartseq2_0pct noise_smartseq2_1pct \
+                     noise_smartseq2_5pct noise_smartseq2_10pct
+
+$(NOISE_SS2_TARGETS): noise_smartseq2_%:
+	$(SM) --configfile configs/simulation_smartseq2_noise_$*.yaml
+
+# Aggregate target: all four SmartSeq2 noise levels, built in the order listed
+# when make runs serially.
+noise_smartseq2: $(NOISE_SS2_TARGETS)
+
+# -----------------------------------------------------------------------
+# noise sweep — Chromium
+# -----------------------------------------------------------------------
+
+# Individual Chromium noise levels. A single static pattern rule covers all of
+# them: the stem matched by % (0pct, 1pct, 5pct or 10pct) picks the config file
+# configs/simulation_chromium_noise_<stem>.yaml. Static pattern rules are
+# explicit rules, so they still apply to targets listed under .PHONY.
+NOISE_CHR_TARGETS := noise_chromium_0pct noise_chromium_1pct \
+                     noise_chromium_5pct noise_chromium_10pct
+
+$(NOISE_CHR_TARGETS): noise_chromium_%:
+	$(SM) --configfile configs/simulation_chromium_noise_$*.yaml
+
+# Aggregate target: all four Chromium noise levels, built in the order listed
+# when make runs serially.
+noise_chromium: $(NOISE_CHR_TARGETS)
+
+# -----------------------------------------------------------------------
+# noise sweep reports
+# file targets: only (re)rendered when the HTML does not yet exist;
+# use  make -B report_noise_smartseq2  to force a re-render
+# -----------------------------------------------------------------------
+
+# Both reports share one recipe; only the eval-dir list differs, supplied as a
+# target-specific variable. $@ is the report path and $(@D) its directory.
+# output_file is made absolute because $(RSCRIPT) first changes into $(WF).
+$(NOISE_REPORT_SS2): EVAL_DIRS := $(NOISE_EDIRS_SS2)
+$(NOISE_REPORT_CHR): EVAL_DIRS := $(NOISE_EDIRS_CHR)
+
+$(NOISE_REPORT_SS2) $(NOISE_REPORT_CHR):
+	mkdir -p $(@D)
+	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \
+	    output_file = '$(CURDIR)/$@', \
+	    params = list(eval_dirs = '$(EVAL_DIRS)'))"
+
+report_noise_smartseq2: $(NOISE_REPORT_SS2)
+report_noise_chromium:  $(NOISE_REPORT_CHR)
+reports_noise: report_noise_smartseq2 report_noise_chromium
+
+# -----------------------------------------------------------------------
+# all (default goal): prerequisites are listed in the order they must run
+# -----------------------------------------------------------------------
+
+all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise
+
+# -----------------------------------------------------------------------
+# help
+# -----------------------------------------------------------------------
+
+help:
+	@echo "Usage: make [target] [CORES=N]   (default CORES=$(CORES))"
+	@echo ""
+	@echo "Base simulation runs:"
+	@echo "  simulation_smartseq2             SmartSeq2 base run (mutation_rate 0.1%)"
+	@echo "  simulation_chromium              Chromium base run  (mutation_rate 0.1%)"
+	@echo ""
+	@echo "Noise sweep runs (base simulation must have been run first):"
+	@echo "  noise_smartseq2                  all 4 noise levels for SmartSeq2"
+	@echo "  noise_smartseq2_{0,1,5,10}pct   individual SmartSeq2 noise level"
+	@echo "  noise_chromium                   all 4 noise levels for Chromium"
+	@echo "  noise_chromium_{0,1,5,10}pct    individual Chromium noise level"
+	@echo ""
+	@echo "Reports (file targets; use -B to force re-render):"
+	@echo "  report_noise_smartseq2           $(NOISE_REPORT_SS2)"
+	@echo "  report_noise_chromium            $(NOISE_REPORT_CHR)"
+	@echo "  reports_noise                    both noise sweep reports"
+	@echo ""
+	@echo "  all                              run everything in sequence"
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Repeat element quantification in bulk and single cells
+# Repetitive element quantification in bulk and single-cell RNA-seq
 
 ## Status
 
@@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl
 
 ## Running
 
+### Via Makefile (recommended)
+
+A `Makefile` at the project root orchestrates all configs and renders reports.
+
+```
+# Run everything (all simulations + noise sweep + reports)
+make
+
+# Individual targets
+make simulation_smartseq2      # SmartSeq2 base simulation
+make simulation_chromium       # Chromium base simulation
+make noise_smartseq2           # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate)
+make noise_chromium            # Chromium noise sweep
+make report_noise_smartseq2    # Render SmartSeq2 noise sweep HTML report
+make report_noise_chromium     # Render Chromium noise sweep HTML report
+make help                      # List all targets
+```
+
+Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda
+environment automatically, expecting conda's init script at `~/miniconda3/etc/profile.d/conda.sh`.
+
+### Manually
+
 ```
 source ~/miniconda3/etc/profile.d/conda.sh
 conda activate snakemake
 cd workflow
-snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10
-snakemake --configfile configs/simulation_chromium.yaml  --use-conda --cores 10
+snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime
+snakemake --configfile configs/simulation_chromium.yaml  --use-conda --cores 10 --rerun-triggers mtime
 ```
 
-The report is written to `{base}/evaluation/evaluation_report.html` as defined in
-the config file.
+The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`.
 
 ## Configuration
 
-Two reference configs are provided under `workflow/configs/`:
+Simulation configs are under `workflow/configs/`:
+
+| File | Technology | Cells | Expressed loci/cell | Mutation rate |
+|---|---|---|---|---|
+| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% |
+| `simulation_chromium.yaml`  | 10x Chromium | 500 | 1000 | 0.1% |
+| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% |
+| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% |
+| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% |
+| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% |
+| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% |
+| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% |
+| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% |
+| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% |
 
-| File | Technology | Cells | Chr subset |
-|---|---|---|---|
-| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 |
-| `simulation_chromium.yaml`  | 10x Chromium | 50 | chr10 |
+Unused real-data configs have been moved to `workflow/configs/old/`.
 
 Key parameters:
 
-- `base`: output directory for all results.
-- `indices_base`: directory for shared aligner indices (can be shared across runs).
+- `base`: run-specific output directory.
+- `indices_base`: shared directory for aligner indices and decompressed references.
+  All noise-sweep configs share the same `indices_base` so indices are built once.
+- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads.
 - `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`).
 - `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`).
 - `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments).
@@ -86,6 +120,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files.
 
 See workflow/methods.md for full details.
 
+## Testing
+
+Unit tests, integration tests, and Snakemake dry-run tests are in the
+`test/` directory. See [test/README.md](test/README.md) for details on
+design, coverage, and how to run the tests.
+
 ## Contact
 
 izaskun dot mallona dot work at gmail.com

diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths = test
+addopts = -v --tb=short
+markers =
+    slow: marks tests as slow (requires bioinformatics tools or large data)
+    workflow: marks tests that require snakemake