diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..4c196c9 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,23 @@ +[run] +source = workflow/scripts + +# Scripts requiring subprocess calls to samtools/BAM files or real aligner +# output matrices cannot be meaningfully unit-tested without bioinformatics +# infrastructure. Omit them so the coverage threshold applies only to +# logic that can be exercised with synthetic data. +omit = + workflow/scripts/count_pseudo_genome.py + workflow/scripts/count_pseudo_genome_chromium.py + workflow/scripts/normalize_starsolo.py + workflow/scripts/normalize_alevin_chromium.py + workflow/scripts/normalize_alevin_smartseq2.py + workflow/scripts/normalize_kallisto_chromium.py + workflow/scripts/normalize_kallisto_smartseq2_granular.py + workflow/scripts/merge_featurecounts.py + workflow/scripts/tag_bam_chromium.py + workflow/scripts/split_gtf_chunks.py + +[report] +exclude_lines = + pragma: no cover + if __name__ == .__main__.: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..847d3e3 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,125 @@ +name: Tests + +on: + workflow_dispatch: + pull_request: + branches: [master, dev] + +jobs: + # Unit and integration tests. Needs Python and scipy + unit-and-integration: + name: Unit and integration tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Python dependencies + run: pip install pytest pytest-cov scipy + + - name: Run unit and integration tests + run: | + pytest test/unit/ test/integration/ \ + -v \ + --tb=short \ + --cov=workflow/scripts \ + --cov-report=term-missing \ + --cov-fail-under=70 + + # Snakemake dry-run. Installs snakemake via micromamba but does not run any rules. 
+ snakemake-dryrun: + name: Snakemake dry-run + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + -c bioconda + -c conda-forge + python=3.11 + snakemake>=8 + pytest + pytest-cov + scipy + init-shell: bash + + - name: Dry-run main Snakefile with SmartSeq2 config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_smartseq2.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Dry-run main Snakefile with Chromium config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_chromium.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Dry-run test Snakefile with negative control config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run \ + --rerun-triggers mtime \ + --quiet --cores 2 + + - name: Run workflow dry-run pytest tests + shell: bash -el {0} + run: | + pytest test/workflow/test_snakemake_dryrun.py \ + -v --tb=short -m workflow + + # Full negative control run. Requires reference data on the runner and an env. 
+ # triggered manually + negative-control-run: + name: Negative control full run + runs-on: ubuntu-latest + if: | + github.event_name == 'workflow_dispatch' + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + -c bioconda + -c conda-forge + python=3.11 + snakemake>=8 + init-shell: bash + + - name: Run negative control workflow + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 2 \ + --conda-frontend mamba diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cc9a316 --- /dev/null +++ b/Makefile @@ -0,0 +1,124 @@ +SHELL := /bin/bash +.DEFAULT_GOAL := all +CORES ?= 10 +WF := workflow +RESULTS := results + +CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake && +SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime +RSCRIPT := cd $(WF) && $(CONDA_RUN) Rscript + +# noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param) +NOISE_EDIRS_SS2 := $(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation +NOISE_EDIRS_CHR := $(CURDIR)/$(RESULTS)/simulation_chromium_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_10pct/evaluation + +# output HTML files (relative to project root) +NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html +NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html + +.PHONY: all \ + simulation_smartseq2 
simulation_chromium \ + noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct \ + noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct \ + report_noise_smartseq2 report_noise_chromium reports_noise \ + help + +# ----------------------------------------------------------------------- +# base simulations +# ----------------------------------------------------------------------- + +simulation_smartseq2: + $(SM) --configfile configs/simulation_smartseq2.yaml + +simulation_chromium: + $(SM) --configfile configs/simulation_chromium.yaml + +# ----------------------------------------------------------------------- +# noise sweep — SmartSeq2 +# note: genome refs are reused from the base simulation_smartseq2 run; +# alignment indices are shared via results/shared/ (never rebuilt) +# ----------------------------------------------------------------------- + +noise_smartseq2_0pct: + $(SM) --configfile configs/simulation_smartseq2_noise_0pct.yaml + +noise_smartseq2_1pct: + $(SM) --configfile configs/simulation_smartseq2_noise_1pct.yaml + +noise_smartseq2_5pct: + $(SM) --configfile configs/simulation_smartseq2_noise_5pct.yaml + +noise_smartseq2_10pct: + $(SM) --configfile configs/simulation_smartseq2_noise_10pct.yaml + +noise_smartseq2: noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct + +# ----------------------------------------------------------------------- +# noise sweep — Chromium +# ----------------------------------------------------------------------- + +noise_chromium_0pct: + $(SM) --configfile configs/simulation_chromium_noise_0pct.yaml + +noise_chromium_1pct: + $(SM) --configfile configs/simulation_chromium_noise_1pct.yaml + +noise_chromium_5pct: + $(SM) --configfile configs/simulation_chromium_noise_5pct.yaml + +noise_chromium_10pct: + $(SM) --configfile configs/simulation_chromium_noise_10pct.yaml + +noise_chromium: noise_chromium_0pct 
noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct + +# ----------------------------------------------------------------------- +# noise sweep reports +# file targets: only (re)rendered when the HTML does not yet exist; +# use make -B report_noise_smartseq2 to force a re-render +# ----------------------------------------------------------------------- + +$(NOISE_REPORT_SS2): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_SS2)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))" + +$(NOISE_REPORT_CHR): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_CHR)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))" + +report_noise_smartseq2: $(NOISE_REPORT_SS2) +report_noise_chromium: $(NOISE_REPORT_CHR) +reports_noise: report_noise_smartseq2 report_noise_chromium + +# ----------------------------------------------------------------------- +# all +# ----------------------------------------------------------------------- + +all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise + +# ----------------------------------------------------------------------- +# help +# ----------------------------------------------------------------------- + +help: + @echo "Usage: make [target] [CORES=N] (default CORES=$(CORES))" + @echo "" + @echo "Base simulation runs:" + @echo " simulation_smartseq2 SmartSeq2 base run (mutation_rate 0.1%)" + @echo " simulation_chromium Chromium base run (mutation_rate 0.1%)" + @echo "" + @echo "Noise sweep runs (base simulation must have been run first):" + @echo " noise_smartseq2 all 4 noise levels for SmartSeq2" + @echo " noise_smartseq2_{0,1,5,10}pct individual SmartSeq2 noise level" + @echo " noise_chromium all 4 noise levels for Chromium" + @echo " noise_chromium_{0,1,5,10}pct 
individual Chromium noise level" + @echo "" + @echo "Reports (file targets; use -B to force re-render):" + @echo " report_noise_smartseq2 $(NOISE_REPORT_SS2)" + @echo " report_noise_chromium $(NOISE_REPORT_CHR)" + @echo " reports_noise both noise sweep reports" + @echo "" + @echo " all run everything in sequence" diff --git a/README.md b/README.md index f8532c2..09ef24f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Repeat element quantification in bulk and single cells +# Repetitive element quantification in bulk and single-cell RNA-seq ## Status @@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl ## Running +### Via Makefile (recommended) + +A `Makefile` at the project root orchestrates all configs and renders reports. + +``` +# Run everything (all simulations + noise sweep + reports) +make + +# Individual targets +make simulation_smartseq2 # SmartSeq2 base simulation +make simulation_chromium # Chromium base simulation +make noise_smartseq2 # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate) +make noise_chromium # Chromium noise sweep +make report_noise_smartseq2 # Render SmartSeq2 noise sweep HTML report +make report_noise_chromium # Render Chromium noise sweep HTML report +make help # List all targets +``` + +Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda +environment automatically. + +### Manually + ``` source ~/miniconda3/etc/profile.d/conda.sh conda activate snakemake cd workflow -snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 -snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 +snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime +snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 --rerun-triggers mtime ``` -The report is written to `{base}/evaluation/evaluation_report.html` as defined in -the config file. 
+The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`. ## Configuration -Two reference configs are provided under `workflow/configs/`: +Simulation configs are under `workflow/configs/`: + +| File | Technology | Cells | Expressed loci/cell | Mutation rate | +|---|---|---|---|---| +| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% | +| `simulation_chromium.yaml` | 10x Chromium | 500 | 1000 | 0.1% | +| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% | +| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% | +| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% | +| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% | +| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% | +| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% | +| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% | +| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% | -| File | Technology | Cells | Chr subset | -|---|---|---|---| -| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 | -| `simulation_chromium.yaml` | 10x Chromium | 50 | chr10 | +Unused real-data configs have been moved to `workflow/configs/old/`. Key parameters: -- `base`: output directory for all results. -- `indices_base`: directory for shared aligner indices (can be shared across runs). +- `base`: run-specific output directory. +- `indices_base`: shared directory for aligner indices and decompressed references. + All noise-sweep configs share the same `indices_base` so indices are built once. +- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads. - `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`). - `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`). 
- `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments). @@ -86,6 +120,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files. See workflow/methods.md for full details. +## Testing + +Unit tests, integration tests, and Snakemake dry-run tests are in the +`test/` directory. See [test/README.md](test/README.md) for details on +design, coverage, and how to run the tests. + ## Contact izaskun dot mallona dot work at gmail.com diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..9cf6b9f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +testpaths = test +addopts = -v --tb=short +markers = + slow: marks tests as slow (requires bioinformatics tools or large data) + workflow: marks tests that require snakemake diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..d33baf9 --- /dev/null +++ b/test/README.md @@ -0,0 +1,209 @@ +# Tests + +This directory contains the test suite for the repeats pipeline. Tests are +run with pytest and cover the Python scripts in workflow/scripts/ as well as +the correctness of the evaluation logic end-to-end. + +See also: the main [README.md](../README.md) for pipeline usage. + +## Running the tests + +All commands below assume the repository root as the working directory unless +stated otherwise. + +### Unit and integration tests + +Install dependencies: + +``` +pip install pytest pytest-cov scipy +``` + +Run unit and integration tests: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ -v +``` + +Run with coverage: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ --cov=workflow/scripts --cov-report=term-missing +``` + +### Snakemake dry-run tests + +These require snakemake in PATH and are tagged with the `workflow` marker. 
+Run via pytest (from the repo root): + +``` +cd /path/to/repeats +pytest test/workflow/test_snakemake_dryrun.py -v -m workflow +``` + +Or run the dry-runs directly with snakemake. The working directory must be +`workflow/` for the production Snakefile, and paths to the test Snakefile and +configs are given relative to that directory: + +``` +cd /path/to/repeats/workflow + +snakemake --configfile configs/simulation_smartseq2.yaml --dry-run --quiet + +snakemake --configfile configs/simulation_chromium.yaml --dry-run --quiet + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run --quiet +``` + +### Negative control workflow (full run) + +Requires reference data accessible from the runner and snakemake with conda +support. Run from the `workflow/` directory: + +``` +cd /path/to/repeats/workflow + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 4 \ + --conda-frontend mamba +``` + + +## Directory layout + +``` +test/ + unit/ unit tests for individual script functions + integration/ end-to-end tests using synthetic input files + workflow/ snakemake dry-run tests and the negative control workflow + Snakefile_test test workflow that reuses the production snmk modules + configs/ config for the negative control run + envs/ conda environment for test workflow rules +``` + + +## Unit tests (test/unit/) + +Each file targets one script in workflow/scripts/. 
+ +test_evaluate.py covers: +- pearson_r, spearman_r: known values, edge cases (too few points, constant vector) +- log1p_rmse: perfect recovery gives 0.0, symmetry, empty input +- detection_metrics: all-correct, no-overlap, partial overlap, specificity +- build_aligned_vectors: zero-filling for missing features, cell ordering +- load_ground_truth: all four granularities (locus, gene_id, family_id, class_id) + and the valid_locus_ids filter that was added to fix the genic/intergenic + partitioning bug +- load_count_matrix: feature tracking including zero-count rows +- compute_metrics_for_subset: perfect observer and null observer + +test_simulate_reads.py covers: +- reverse_complement: known sequences, N passthrough, involution property +- parse_gtf_attribute: quoted and unquoted values, missing key +- extract_repeat_sequence: N content filter, strand handling, length filter +- sample_subseq: output length, N padding for short sequences +- sample_count_geometric: always >= 1, respects max_count, approximate mean +- build_cell_plan and build_locus_to_cells: output structure +- write_ground_truth: TSV format and row count +- parse_gtf_repeats_by_chrom: minimal GTF parsing, length filter, chrom filter + +test_build_rmsk_gtf.py covers: +- make_gtf_attributes: format and dup index incrementing +- convert_rmsk_to_gtf: basic output, dup index, length filter, chrom filter, + 1-based GTF coordinates, gzip output + +test_parse_gtf_t2g.py covers: +- parse_attrs: quoted values, missing keys, empty string +- main() with sys.argv patching: 2-col and 4-col output, correct mapping, + deduplication, feature-type filtering + + +## Integration tests (test/integration/) + +test_feature_set_consistency.py documents and tests a bug that caused lower +evaluation metrics for genic_repeats and intergenic_repeats at gene_id +granularity compared to the full repeats set, even for a hypothetically +perfect observer. 
+ +Root cause: evaluate.py was aggregating ground truth counts across all loci +for a given gene_id (including both genic and intergenic copies), then +filtering at the gene_id level. For a gene_id like AluSz6 that has copies +in both contexts, the genic truth count included intergenic loci, inflating +truth relative to observed and degrading Pearson correlation and recall. + +The fix adds locus-level filtering before aggregation (see valid_locus_ids +in load_ground_truth). The tests in this file verify: + +1. gene_ids overlap between genic and intergenic locus maps (structural + precondition for the bug, expected in real data) +2. full repeats at gene_id with a perfect observer gives metrics of 1.0 + (baseline sanity check) +3. genic_repeats at locus granularity with a perfect observer gives 1.0 + (locus level is unaffected by the partition issue) +4. genic_repeats at gene_id with a perfect genic observer now gives 1.0 + (regression test for the fix; would fail if the fix were reverted) +5. intergenic_repeats at gene_id with a perfect observer gives 1.0 +6. 
subset metrics are not lower than full-set metrics for a perfect observer + (monotonicity property) + +test_metamorphic.py tests properties that must hold across input transformations +without needing exact expected values: + +- scale invariance: multiplying observed counts by a constant does not change + Pearson or Spearman +- scaling does change log1p_rmse +- monotone recall: adding true positives never decreases recall +- noise degrades metrics: progressively noisier observers give lower Pearson +- perfect beats null: a perfect observer strictly outperforms a zero observer + on all metrics +- granularity aggregation consistency: a perfect locus-level count matrix + summed to family_id also gives metrics of 1.0 +- random counts give low correlation (below 0.7 for 50-feature random data) + + +## Workflow tests (test/workflow/) + +test_snakemake_dryrun.py verifies that snakemake --dry-run succeeds for both +production configs (simulation_smartseq2.yaml and simulation_chromium.yaml) +and for the test negative control config. These tests are skipped if snakemake +is not in the PATH and are marked with the workflow marker. + +Snakefile_test is a separate Snakemake workflow that includes all production +snmk modules and adds the negative control rules: + +simulate_from_genes runs simulate_reads.py with the Ensembl gene GTF instead +of the repeat GTF. Reads come from gene body regions rather than repeat +elements. This gives a ground truth over gene features. + +check_negative_control_recall runs evaluate.py comparing that gene-body +ground truth against the repeat quantification output and asserts that recall +is below the threshold in testing.negative_control_max_recall (default 0.10). +A low recall means the pipeline correctly does not attribute gene-body reads +to repeat elements. + +The conda environment for these rules is defined in +test/workflow/envs/test_evaluation.yaml (python, scipy, pandas, numpy). 
+
+
+## GitHub Actions
+
+.github/workflows/tests.yml defines three jobs:
+
+unit-and-integration runs on pull requests to master or dev. It uses plain
+Python and does not need any bioinformatics tools installed.
+
+snakemake-dryrun runs on pull requests to master or dev. It installs snakemake
+via micromamba and runs dry-run checks on both production configs and the test
+Snakefile. Snakemake installs conda dependencies itself when --use-conda is
+passed, but dry-runs do not trigger conda installs.
+
+negative-control-run is only triggered manually via workflow_dispatch (no
+automatic trigger is currently configured). It runs the
+full negative control workflow with --use-conda and requires reference data
+to be accessible from the runner.
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..ad80b2e
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,13 @@
+"""
+Shared pytest configuration and path setup.
+
+Adds workflow/scripts to sys.path so unit tests can import scripts directly.
+"""
+import sys
+import os
+
+SCRIPTS_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), '..', 'workflow', 'scripts')
+)
+if SCRIPTS_DIR not in sys.path:
+    sys.path.insert(0, SCRIPTS_DIR)
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/integration/test_feature_set_consistency.py b/test/integration/test_feature_set_consistency.py
new file mode 100644
index 0000000..3559831
--- /dev/null
+++ b/test/integration/test_feature_set_consistency.py
@@ -0,0 +1,393 @@
+"""
+Integration tests for evaluation consistency across feature sets.
+
+Background
+----------
+The pipeline evaluates each (feature_set × granularity × aligner) combination.
+An initially puzzling result was that genic_repeats and intergenic_repeats at
+gene_id granularity showed lower metrics than the full repeats set, even for a
+hypothetically perfect observer.
+ +Root cause (now fixed in evaluate.py) +-------------------------------------- +At gene_id granularity, load_ground_truth() previously aggregated ALL loci for +a given gene_id (summing genic + intergenic copies), then filtered the already- +aggregated dict by checking whether the gene_id appeared in the genic locus_map. + +For a gene_id like AluSz6 that has copies in BOTH genic and intergenic regions: + + - genic locus_map contains AluSz6 (it has genic copies) + - OLD truth['c1']['AluSz6'] = 5 (genic) + 3 (intergenic) = 8 ← inflated + - observed from genic normalization = 5 (only genic copy) + - -> truth > observed -> degraded Pearson / RMSE, false-negative detections + +The fix moves locus_map loading to BEFORE load_ground_truth() and passes a +valid_locus_ids set so filtering happens AT LOCUS LEVEL before any aggregation. + +Scenario used by these tests +----------------------------- +We construct a minimal ground truth with: + - genic_alu_1 -> AluSz6, Alu, SINE, count=5 (GENIC) + - intergenic_alu_1 -> AluSz6, Alu, SINE, count=3 (INTERGENIC - same gene_id!) + - genic_alu_2 -> AluSx, Alu, SINE, count=7 (GENIC) + - intergenic_alu_2 -> AluSx, Alu, SINE, count=6 (INTERGENIC - same gene_id!) + - genic_L1_1 -> L1PA2, L1, LINE, count=2 (GENIC) + - genic_mir_1 -> MIR, MIR, SINE, count=4 (GENIC) + - genic_L2_1 -> L2, L2, LINE, count=3 (GENIC) + - intergenic_dna_1 -> TcMar, TcMar, DNA, count=1 (INTERGENIC) + - [TcMar has a genic reference locus genic_tcmar_ref not expressed in + simulation, but present in the genic locus_map - this triggers the + false-negative detection failure in the old code.] + +Genic locus_map contains: + genic_alu_1, genic_alu_2, genic_L1_1, genic_mir_1, genic_L2_1, genic_tcmar_ref + +Intergenic locus_map contains: + intergenic_alu_1, intergenic_alu_2, intergenic_dna_1 + +AluSz6 and AluSx appear in BOTH locus_maps (gene_id level). 
+ +With the fix: + genic truth at gene_id: AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3, TcMar=0 + genic observed at gene_id (from genic normalization): AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3 + -> perfect match -> pearson=1.0, recall=1.0, precision=1.0 +""" +import csv +import math +import os +import subprocess +import sys + +import pytest + +SCRIPTS_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', 'workflow', 'scripts') +) +EVALUATE = os.path.join(SCRIPTS_DIR, 'evaluate.py') + +# --------------------------------------------------------------------------- +# Shared fixture: build the scenario files +# --------------------------------------------------------------------------- + +GROUND_TRUTH_ROWS = [ + # cell_id, locus_id, repeat_id (=gene_id), family_id, class_id, true_count + ('cell_001', 'genic_alu_1', 'AluSz6', 'Alu', 'SINE', '5'), + ('cell_001', 'intergenic_alu_1', 'AluSz6', 'Alu', 'SINE', '3'), + ('cell_001', 'genic_alu_2', 'AluSx', 'Alu', 'SINE', '7'), + ('cell_001', 'intergenic_alu_2', 'AluSx', 'Alu', 'SINE', '6'), + ('cell_001', 'genic_L1_1', 'L1PA2', 'L1', 'LINE', '2'), + ('cell_001', 'genic_mir_1', 'MIR', 'MIR', 'SINE', '4'), + ('cell_001', 'genic_L2_1', 'L2', 'L2', 'LINE', '3'), + ('cell_001', 'intergenic_dna_1', 'TcMar', 'TcMar', 'DNA', '1'), +] + +# Genic locus_map includes a reference locus for TcMar that was NOT expressed +# in the simulation. In the old code, TcMar's intergenic count (1) would +# bleed into the genic truth because TcMar is in the genic locus_map. 
+GENIC_LOCUS_MAP = [ + ('genic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('genic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('genic_L1_1', 'L1PA2', 'L1', 'LINE'), + ('genic_mir_1', 'MIR', 'MIR', 'SINE'), + ('genic_L2_1', 'L2', 'L2', 'LINE'), + # reference locus for TcMar - exists in genome but NOT expressed + ('genic_tcmar_ref', 'TcMar', 'TcMar', 'DNA'), +] + +INTERGENIC_LOCUS_MAP = [ + ('intergenic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('intergenic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('intergenic_dna_1', 'TcMar', 'TcMar', 'DNA'), +] + +FULL_LOCUS_MAP = GENIC_LOCUS_MAP + INTERGENIC_LOCUS_MAP + +# "Perfect genic observer" at gene_id level - exactly what genic normalization produces: +# only counts from genic loci, using GENIC_LOCUS_MAP. +GENIC_OBSERVED_GENE_ID = { + 'AluSz6': 5, + 'AluSx': 7, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + # TcMar: 0 (its only genic reference locus was not expressed) +} + +# "Perfect intergenic observer" at gene_id level +INTERGENIC_OBSERVED_GENE_ID = { + 'AluSz6': 3, + 'AluSx': 6, + 'TcMar': 1, +} + +# "Perfect full observer" at gene_id level +FULL_OBSERVED_GENE_ID = { + 'AluSz6': 8, + 'AluSx': 13, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + 'TcMar': 1, +} + +# "Perfect genic observer" at locus level +GENIC_OBSERVED_LOCUS = { + 'genic_alu_1': 5, + 'genic_alu_2': 7, + 'genic_L1_1': 2, + 'genic_mir_1': 4, + 'genic_L2_1': 3, +} + + +# --------------------------------------------------------------------------- +# File-writing helpers +# --------------------------------------------------------------------------- + +def write_ground_truth(tmp_path): + p = tmp_path / 'ground_truth.tsv' + with open(p, 'w') as fh: + fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n') + for row in GROUND_TRUTH_ROWS: + fh.write('\t'.join(row) + '\n') + return p + + +def write_locus_map(tmp_path, name, rows): + p = tmp_path / name + with open(p, 'w') as fh: + for row in rows: + fh.write('\t'.join(row) + '\n') + return p + + +def write_count_matrix(tmp_path, name, 
feature_counts, cells=('cell_001',)): + p = tmp_path / name + features = sorted(feature_counts) + with open(p, 'w') as fh: + fh.write('feature_id\t' + '\t'.join(cells) + '\n') + for feat in features: + vals = [str(feature_counts[feat]) for _ in cells] + fh.write(feat + '\t' + '\t'.join(vals) + '\n') + return p + + +def run_evaluate(tmp_path, gt, counts, locus_map, granularity, feature_set): + prefix = str(tmp_path / f'{feature_set}_{granularity}') + cmd = [ + sys.executable, EVALUATE, + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test', + '--multimapper-mode', 'unique', + '--granularity', granularity, + '--feature-set', feature_set, + '--locus-map', str(locus_map), + '--output-prefix', prefix, + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f'evaluate.py failed:\n{result.stderr}') + return read_global_metrics(prefix + '_global_metrics.tsv') + + +def read_global_metrics(path): + with open(path) as fh: + reader = csv.DictReader(fh, delimiter='\t') + return next(reader) + + +def parse_float(v): + try: + return float(v) + except (ValueError, TypeError): + return float('nan') + + +# --------------------------------------------------------------------------- +# Test 1: Structural - gene_id overlap between genic and intergenic locus_maps +# --------------------------------------------------------------------------- + +def test_gene_ids_overlap_between_genic_and_intergenic_locus_maps(): + """ + Documents the structural precondition for the bug: + AluSz6 and AluSx appear as gene_ids in BOTH genic and intergenic locus_maps. + This is expected in real data (repeat families span both contexts) and is + the reason why locus-level filtering before aggregation is necessary. 
+ """ + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + shared = genic_gene_ids & intergenic_gene_ids + assert len(shared) > 0, ( + 'Expected at least one gene_id in both genic and intergenic maps ' + f'but got genic={genic_gene_ids}, intergenic={intergenic_gene_ids}' + ) + assert 'AluSz6' in shared + assert 'AluSx' in shared + + +def test_full_locus_map_partitions_at_locus_level_not_gene_id(): + """ + At locus level, genic and intergenic locus_maps are disjoint. + At gene_id level they overlap. This is why the old post-aggregation + filter could not correctly separate the two subsets. + """ + genic_loci = {row[0] for row in GENIC_LOCUS_MAP} + intergenic_loci = {row[0] for row in INTERGENIC_LOCUS_MAP} + # Locus level: disjoint + assert genic_loci & intergenic_loci == set() + # Gene_id level: overlapping + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + assert genic_gene_ids & intergenic_gene_ids != set() + + +# --------------------------------------------------------------------------- +# Test 2: Full repeats, gene_id - perfect observer -> perfect metrics (baseline) +# --------------------------------------------------------------------------- + +def test_full_repeats_gene_id_perfect_observer_gives_perfect_metrics(tmp_path): + """ + Sanity baseline: if observed = truth for all repeats at gene_id level, + all accuracy metrics should be 1.0 / 0.0. 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'full_locus_map.tsv', FULL_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'full_counts.tsv', FULL_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['log1p_rmse']) == pytest.approx(0.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 3: Genic repeats, locus granularity - perfect observer -> perfect metrics +# --------------------------------------------------------------------------- + +def test_genic_repeats_locus_granularity_perfect_observer(tmp_path): + """ + At locus level, a perfect genic observer should give perfect metrics. + No cross-partition contamination is possible at locus level because + locus IDs are unique per repeat instance. + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_locus_counts.tsv', GENIC_OBSERVED_LOCUS) + m = run_evaluate(tmp_path, gt, counts, lm, 'locus', 'genic_repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 4 (THE KEY TEST): Genic repeats, gene_id - perfect genic observer +# should now give perfect metrics after the fix +# --------------------------------------------------------------------------- + +def test_genic_repeats_gene_id_perfect_genic_observer_gives_perfect_metrics(tmp_path): + """ + Core regression test for the locus-level filtering fix. 
+ + With the OLD code (post-aggregation filter): + - truth['cell_001']['AluSz6'] = 5 + 3 = 8 (sums ALL AluSz6 loci) + - observed['cell_001']['AluSz6'] = 5 (only genic loci) + - -> truth > observed -> degraded pearson, false-negative for TcMar + + With the FIXED code (locus-level filter BEFORE aggregation): + - only genic loci contribute to truth + - truth['cell_001']['AluSz6'] = 5, truth['cell_001']['AluSx'] = 7 + - TcMar: its intergenic count is excluded; genic_tcmar_ref is not expressed + -> TcMar absent from both truth and observed -> no false negative + - observed matches truth exactly -> all metrics = 1.0 + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_gene_id_counts.tsv', GENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'genic_repeats') + + pearson = parse_float(m['pearson_r']) + recall = parse_float(m['recall']) + precision = parse_float(m['precision']) + rmse = parse_float(m['log1p_rmse']) + + assert pearson == pytest.approx(1.0, abs=1e-4), ( + f'pearson={pearson} - expected 1.0; ' + 'if this fails, the locus-level filtering fix may have been reverted' + ) + assert recall == pytest.approx(1.0, abs=1e-4), f'recall={recall}' + assert precision == pytest.approx(1.0, abs=1e-4), f'precision={precision}' + assert rmse == pytest.approx(0.0, abs=1e-4), f'log1p_rmse={rmse}' + + +# --------------------------------------------------------------------------- +# Test 5: Intergenic repeats, gene_id - same fix applies +# --------------------------------------------------------------------------- + +def test_intergenic_repeats_gene_id_perfect_intergenic_observer_gives_perfect_metrics(tmp_path): + """ + Mirror of test 4 for the intergenic subset. + + After the fix, intergenic truth for AluSz6 = 3 (only intergenic_alu_1), + not 8 (which would include genic_alu_1). 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'intergenic_locus_map.tsv', INTERGENIC_LOCUS_MAP) + counts = write_count_matrix( + tmp_path, 'intergenic_counts.tsv', INTERGENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'intergenic_repeats') + + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4), ( + f"pearson={m['pearson_r']} - expected 1.0 for perfect intergenic observer" + ) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 6: Genic + intergenic metrics should not be lower than full-set metrics +# for a perfect full observer (monotonicity property) +# --------------------------------------------------------------------------- + +def test_subset_metrics_not_lower_than_full_for_perfect_observer(tmp_path): + """ + Metamorphic property: if an aligner perfectly recovers ALL repeats, + evaluating a clean subset should not give LOWER metrics than the full set. + + We use a perfect full observer and check that both genic and intergenic + sub-evaluations also give perfect metrics (1.0). + + This test would have failed before the fix because the inflated truth + counts caused pearson < 1.0 even for a perfect observer. 
+ """ + gt = write_ground_truth(tmp_path) + + # Full repeats evaluation + lm_full = write_locus_map(tmp_path, 'full_lm.tsv', FULL_LOCUS_MAP) + cnt_full = write_count_matrix(tmp_path, 'full_cnt.tsv', FULL_OBSERVED_GENE_ID) + m_full = run_evaluate(tmp_path, gt, cnt_full, lm_full, 'gene_id', 'repeats') + + # Genic repeats evaluation (using genic counts only) + lm_genic = write_locus_map(tmp_path, 'genic_lm.tsv', GENIC_LOCUS_MAP) + cnt_genic = write_count_matrix(tmp_path, 'genic_cnt.tsv', GENIC_OBSERVED_GENE_ID) + m_genic = run_evaluate(tmp_path, gt, cnt_genic, lm_genic, 'gene_id', 'genic_repeats') + + # Intergenic repeats evaluation + lm_inter = write_locus_map(tmp_path, 'inter_lm.tsv', INTERGENIC_LOCUS_MAP) + cnt_inter = write_count_matrix( + tmp_path, 'inter_cnt.tsv', INTERGENIC_OBSERVED_GENE_ID) + m_inter = run_evaluate(tmp_path, gt, cnt_inter, lm_inter, 'gene_id', 'intergenic_repeats') + + full_pearson = parse_float(m_full['pearson_r']) + genic_pearson = parse_float(m_genic['pearson_r']) + inter_pearson = parse_float(m_inter['pearson_r']) + + # All should be 1.0 (perfect observer) - none should be lower than full + assert full_pearson == pytest.approx(1.0, abs=1e-4) + assert genic_pearson >= full_pearson - 0.01, ( + f'genic pearson={genic_pearson} is lower than full={full_pearson}; ' + 'subset metrics should not be worse than full-set metrics for a perfect observer' + ) + assert inter_pearson >= full_pearson - 0.01, ( + f'intergenic pearson={inter_pearson} is lower than full={full_pearson}' + ) diff --git a/test/integration/test_metamorphic.py b/test/integration/test_metamorphic.py new file mode 100644 index 0000000..e080e64 --- /dev/null +++ b/test/integration/test_metamorphic.py @@ -0,0 +1,264 @@ +""" +Metamorphic tests for the evaluation pipeline. + +Metamorphic testing verifies properties that must hold across transformations +of the input, without needing exact expected values. 
This is especially +useful for a bioinformatics benchmarking pipeline where true correct outputs +are not always known in advance. + +Properties tested +----------------- +1. Scale invariance: multiplying observed counts by a constant should not + change Pearson or Spearman (but will change RMSE). +2. Monotone recall: adding true positives to observed should increase + or maintain recall - never decrease it. +3. Noise degrades metrics: adding random noise to a perfect observer should + monotonically worsen Pearson and RMSE. +4. Null observer is worst: a perfect observer beats a zero observer on all + quantification and detection metrics. +5. Permuting cell labels degrades per-cell metrics but not global metrics + computed on pooled counts. +6. Family_id granularity: summing a perfect locus-level count matrix to + family_id should also give perfect metrics. +""" +import math +import random + +import pytest + +import evaluate as ev + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_truth_obs(n_cells=4, n_features=8, seed=42): + """Return a (truth, observed) pair where observed = truth (perfect).""" + rng = random.Random(seed) + cells = [f'c{i}' for i in range(n_cells)] + feats = [f'feat_{i}' for i in range(n_features)] + truth = {c: {f: rng.randint(1, 20) for f in feats} for c in cells} + obs = {c: dict(d) for c, d in truth.items()} + return truth, obs, cells, feats + + +# --------------------------------------------------------------------------- +# 1. 
# Scale invariance of rank-based metrics
# ---------------------------------------------------------------------------

def test_scale_invariance_pearson():
    """Multiplying observed by a scalar should not change Pearson."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 10 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    assert m1['pearson_r'] == pytest.approx(m2['pearson_r'], abs=1e-4)


def test_scale_invariance_spearman():
    """Multiplying observed by a scalar should not change Spearman."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 3 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    assert m1['spearman_r'] == pytest.approx(m2['spearman_r'], abs=1e-4)


def test_scale_changes_rmse():
    """Scaling observed DOES change log1p_rmse."""
    truth, obs, cells, feats = make_truth_obs()
    m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    obs_scaled = {c: {f: v * 100 for f, v in d.items()} for c, d in obs.items()}
    m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats)

    # RMSE for perfect observer is 0; scaled observer should have higher RMSE
    assert m2['log1p_rmse'] > m1['log1p_rmse']


# ---------------------------------------------------------------------------
# 2. Monotone recall: adding true positives never decreases recall
# ---------------------------------------------------------------------------

def test_monotone_recall_adding_true_positives():
    """
    Start with a partially correct observer. Add more true positives.
    Recall must not decrease.
    """
    truth = {'c1': {'A': 5, 'B': 3, 'C': 2, 'D': 7}}
    feats = ['A', 'B', 'C', 'D']

    obs_partial = {'c1': {'A': 5}}        # only A correct
    obs_more = {'c1': {'A': 5, 'B': 3}}   # A and B correct

    m1 = ev.compute_metrics_for_subset(truth, obs_partial, ['c1'], feats)
    m2 = ev.compute_metrics_for_subset(truth, obs_more, ['c1'], feats)

    assert m2['recall'] >= m1['recall'] - 1e-9


def test_monotone_recall_full_recovery_is_maximum():
    """Full recovery must score at least as well as any partial recovery."""
    truth, obs, cells, feats = make_truth_obs()
    # Partial: only first half of features observed
    feats_half = feats[:len(feats)//2]
    obs_partial = {c: {f: d[f] for f in feats_half if f in d} for c, d in obs.items()}

    m_partial = ev.compute_metrics_for_subset(truth, obs_partial, cells, feats)
    m_full = ev.compute_metrics_for_subset(truth, obs, cells, feats)

    assert m_full['recall'] >= m_partial['recall'] - 1e-9
    assert m_full['f1'] >= m_partial['f1'] - 1e-9


# ---------------------------------------------------------------------------
# 3. Noise degrades metrics monotonically
# ---------------------------------------------------------------------------

def test_increasing_noise_degrades_pearson():
    """
    Progressively noisier observers should give monotonically worse Pearson.

    A 0.05 tolerance absorbs sampling jitter between adjacent noise levels.
    """
    truth, _, cells, feats = make_truth_obs(seed=7)
    prev_r = 1.0

    rng = random.Random(7)
    # (a dead `obs = {c: dict(d) ...}` copy was removed here; it was never read)

    noise_levels = [0, 1, 3, 10, 50]
    for level in noise_levels:
        noisy_obs = {
            c: {f: max(0, v + rng.randint(-level, level)) for f, v in d.items()}
            for c, d in truth.items()
        }
        m = ev.compute_metrics_for_subset(truth, noisy_obs, cells, feats)
        r = m['pearson_r']
        # Metrics may come back as a string ('NA') or NaN when undefined;
        # only compare when we got a real number.
        if not isinstance(r, str) and not math.isnan(float(r)):
            assert float(r) <= prev_r + 0.05, (
                f'Pearson increased from {prev_r} to {r} as noise level went to {level}'
            )
            prev_r = float(r)


# ---------------------------------------------------------------------------
# 4.
# Perfect observer strictly beats null observer
# ---------------------------------------------------------------------------

def test_perfect_beats_null_on_all_metrics():
    """A perfect observer must beat an empty (zero) observer on every metric."""
    truth, perfect_obs, cells, feats = make_truth_obs(seed=3)
    null_obs = {}

    m_perfect = ev.compute_metrics_for_subset(truth, perfect_obs, cells, feats)
    m_null = ev.compute_metrics_for_subset(truth, null_obs, cells, feats)

    # The null observer's Pearson may be undefined ('NA'/'') - treat as 0.0.
    null_pearson = m_null.get('pearson_r', 'NA')
    null_pearson_val = 0.0 if null_pearson in ('NA', '') else float(null_pearson)
    assert float(m_perfect['pearson_r']) > null_pearson_val
    assert float(m_perfect['recall']) > float(m_null['recall'])
    assert float(m_perfect['f1']) > float(m_null['f1'])
    assert (float(m_perfect['log1p_rmse'])
            < float(m_null['log1p_rmse']))


# ---------------------------------------------------------------------------
# 5. Granularity aggregation consistency
# ---------------------------------------------------------------------------

def test_locus_counts_summed_to_family_give_correct_metrics(tmp_path):
    """
    A locus-level count matrix summed to family_id should give the same
    result as loading ground truth at family_id granularity directly.
    Validates that the granularity aggregation pipeline is self-consistent.
    """
    import csv
    import os
    import sys
    import subprocess

    EVALUATE = os.path.join(
        os.path.dirname(__file__), '..', '..', 'workflow', 'scripts', 'evaluate.py'
    )

    gt_rows = [
        ('cell_001', 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '5'),
        ('cell_001', 'AluSz6_dup2', 'AluSz6', 'Alu', 'SINE', '3'),
        ('cell_001', 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '7'),
        ('cell_001', 'MIR3_dup1', 'MIR3', 'MIR', 'SINE', '2'),
    ]
    locus_map_rows = [
        ('AluSz6_dup1', 'AluSz6', 'Alu', 'SINE'),
        ('AluSz6_dup2', 'AluSz6', 'Alu', 'SINE'),
        ('L1PA2_dup1', 'L1PA2', 'L1', 'LINE'),
        ('MIR3_dup1', 'MIR3', 'MIR', 'SINE'),
    ]

    gt_path = tmp_path / 'gt.tsv'
    with open(gt_path, 'w') as fh:
        fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n')
        for row in gt_rows:
            fh.write('\t'.join(row) + '\n')

    lm_path = tmp_path / 'lm.tsv'
    with open(lm_path, 'w') as fh:
        for row in locus_map_rows:
            fh.write('\t'.join(row) + '\n')

    # Perfect observer at family_id level (manually aggregated)
    family_counts = {'Alu': 8, 'L1': 7, 'MIR': 2}
    cnt_path = tmp_path / 'counts.tsv'
    with open(cnt_path, 'w') as fh:
        fh.write('feature_id\tcell_001\n')
        for fam, cnt in family_counts.items():
            fh.write(f'{fam}\t{cnt}\n')

    prefix = str(tmp_path / 'family_eval')
    cmd = [
        sys.executable, EVALUATE,
        '--ground-truth', str(gt_path),
        '--observed-counts', str(cnt_path),
        '--aligner', 'test',
        '--multimapper-mode', 'unique',
        '--granularity', 'family_id',
        '--feature-set', 'repeats',
        '--locus-map', str(lm_path),
        '--output-prefix', prefix,
    ]
    r = subprocess.run(cmd, capture_output=True, text=True)
    assert r.returncode == 0, r.stderr

    with open(prefix + '_global_metrics.tsv') as fh:
        m = next(csv.DictReader(fh, delimiter='\t'))

    pearson = float(m['pearson_r'])
    recall = float(m['recall'])
    assert pearson == pytest.approx(1.0, abs=1e-4), (
        f'family_id aggregation: pearson={pearson}, expected 1.0'
    )
    assert recall == pytest.approx(1.0, abs=1e-4)


# ---------------------------------------------------------------------------
# 6. Random counts give low (not artificially inflated) metrics
# ---------------------------------------------------------------------------

def test_random_observer_gives_low_correlation():
    """
    A random observer uncorrelated with truth should give Pearson near 0.
    Uses enough data points to make this reliable.
    """
    rng = random.Random(99)
    n_feats = 50
    feats = [f'f{i}' for i in range(n_feats)]
    truth = {'c1': {f: rng.randint(1, 100) for f in feats}}
    random_obs = {'c1': {f: rng.randint(1, 100) for f in feats}}

    m = ev.compute_metrics_for_subset(truth, random_obs, ['c1'], feats)
    # With 50 random pairs, Pearson should rarely be above 0.5
    r = float(m['pearson_r'])
    assert abs(r) < 0.7, (
        f'Random observer gave high Pearson={r}; this suggests the metric is inflated'
    )


# ===========================================================================
# (end of test/integration/test_metamorphic.py)
#
# The original patch also adds test/unit/__init__.py (empty) and two binary
# __pycache__/*.pyc files.
# NOTE(review): compiled .pyc artifacts should not be committed to the repo;
# add __pycache__/ to .gitignore and drop these hunks from the patch.
#
# Next file: test/unit/test_build_rmsk_gtf.py
# ===========================================================================
# ===== file: test/unit/test_build_rmsk_gtf.py =====
"""
Unit tests for workflow/scripts/build_rmsk_gtf.py
"""
import gzip
import os

import pytest

import build_rmsk_gtf as rmsk


# ---------------------------------------------------------------------------
# make_gtf_attributes
# ---------------------------------------------------------------------------

def test_make_gtf_attributes_format():
    """Attribute string carries gene/transcript/family/class fields."""
    attr = rmsk.make_gtf_attributes('AluSz6', 3, 'Alu', 'SINE')
    assert 'gene_id "AluSz6"' in attr
    assert 'transcript_id "AluSz6_dup3"' in attr
    assert 'family_id "Alu"' in attr
    assert 'class_id "SINE"' in attr


def test_make_gtf_attributes_locus_index():
    """The locus index is embedded in the transcript_id as _dupN."""
    a1 = rmsk.make_gtf_attributes('L1PA2', 1, 'L1', 'LINE')
    a5 = rmsk.make_gtf_attributes('L1PA2', 5, 'L1', 'LINE')
    assert 'transcript_id "L1PA2_dup1"' in a1
    assert 'transcript_id "L1PA2_dup5"' in a5


# ---------------------------------------------------------------------------
# convert_rmsk_to_gtf
# ---------------------------------------------------------------------------


def rmsk_row(chrom='chr1', start=100, end=400, strand='+',
             name='AluSz6', rep_class='SINE', family='Alu'):
    """Build a single synthetic UCSC rmsk.txt line (tab-separated)."""
    return (f'0\t1000\t100\t0\t0\t{chrom}\t{start}\t{end}\t-100\t'
            f'{strand}\t{name}\t{rep_class}\t{family}\t0\t300\t0\t1\n')


def test_convert_rmsk_to_gtf_basic(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row())
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    lines = outfile.read_text().strip().split('\n')
    assert len(lines) == 1
    fields = lines[0].split('\t')
    assert fields[0] == 'chr1'
    assert fields[2] == 'exon'
    assert 'gene_id "AluSz6"' in fields[8]
    assert 'transcript_id "AluSz6_dup1"' in fields[8]


def test_convert_rmsk_to_gtf_increments_dup_index(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(
        rmsk_row(name='AluSz6', start=100, end=400)
        + rmsk_row(name='AluSz6', start=500, end=900)
    )
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    lines = outfile.read_text().strip().split('\n')
    assert len(lines) == 2
    assert 'transcript_id "AluSz6_dup1"' in lines[0]
    assert 'transcript_id "AluSz6_dup2"' in lines[1]


def test_convert_rmsk_to_gtf_filters_short(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    # Only 30 bp wide -> below default min_length=50
    infile.write_text(rmsk_row(start=100, end=130))
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    assert outfile.read_text().strip() == ''


def test_convert_rmsk_to_gtf_chromosome_filter(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(
        rmsk_row(chrom='chr1')
        + rmsk_row(chrom='chr2')
    )
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile), allowed_chroms={'chr1'})
    lines = [l for l in outfile.read_text().strip().split('\n') if l]
    assert len(lines) == 1
    assert lines[0].startswith('chr1')


def test_convert_rmsk_to_gtf_gtf_start_is_one_based(tmp_path):
    # RMSK uses 0-based start; GTF should be 1-based
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row(start=100, end=400))
    outfile = tmp_path / 'out.gtf'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    fields = outfile.read_text().strip().split('\t')
    assert fields[3] == '101'  # 100 + 1
    assert fields[4] == '400'  # end unchanged


def test_convert_rmsk_to_gtf_compressed_output(tmp_path):
    infile = tmp_path / 'rmsk.txt'
    infile.write_text(rmsk_row())
    outfile = tmp_path / 'out.gtf.gz'
    rmsk.convert_rmsk_to_gtf(str(infile), str(outfile))
    with gzip.open(outfile, 'rt') as fh:
        content = fh.read()
    assert 'AluSz6' in content


# ===== file: test/unit/test_count_starsolo_locus.py =====
"""
Unit tests for count_starsolo_locus.py pure functions and mocked subprocess paths.
"""
import os
import sys
import textwrap
from unittest.mock import MagicMock, patch

import pytest

import count_starsolo_locus as csl


# ---------------------------------------------------------------------------
# parse_locus_id
# ---------------------------------------------------------------------------

def test_parse_locus_id_plus_strand():
    result = csl.parse_locus_id("AluSz_dup1::chr10:10000-10435(+)")
    assert result == ("chr10", 10000, 10435, "+")


def test_parse_locus_id_minus_strand():
    result = csl.parse_locus_id("L1M4_dup3::10:65000-65900(-)")
    assert result == ("10", 65000, 65900, "-")


def test_parse_locus_id_malformed_returns_none():
    assert csl.parse_locus_id("no_double_colon") is None
    assert csl.parse_locus_id("bad::coords") is None


# ---------------------------------------------------------------------------
# load_intervals
# ---------------------------------------------------------------------------

def test_load_intervals_basic(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(
        ">AluSz_dup1::10:10000-10435(+)\nACGT\n"
        ">L1M4_dup3::10:65000-65900(-)\nACGT\n"
    )
    chrom_starts, chrom_intervals = csl.load_intervals(str(fa))
    assert "10" in chrom_intervals
    assert len(chrom_intervals["10"]) == 2
    # sorted by start
    assert chrom_intervals["10"][0][0] == 10000
    assert chrom_intervals["10"][1][0] == 65000
    assert chrom_starts["10"] == [10000, 65000]


def test_load_intervals_skips_non_header_lines(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGTACGT\n")
    _, intervals = csl.load_intervals(str(fa))
    assert len(intervals["10"]) == 1
    assert intervals["10"][0][2] == "AluSz_dup1"


def test_load_intervals_deduplicates(tmp_path):
    fa = tmp_path / "test.fa"
    fa.write_text(
        ">dup1::10:100-200(+)\nACGT\n"
        ">dup1::10:100-200(+)\nACGT\n"
    )
    _, intervals = csl.load_intervals(str(fa))
    assert len(intervals["10"]) == 1
+ +# --------------------------------------------------------------------------- +# find_locus +# --------------------------------------------------------------------------- + +def make_intervals(): + chrom_intervals = {"10": [(10000, 10200, "AluSz_dup1"), (20000, 20500, "L1_dup2")]} + chrom_starts = {"10": [10000, 20000]} + return chrom_starts, chrom_intervals + + +def test_find_locus_hit_first(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10100, cs, ci) == "AluSz_dup1" + + +def test_find_locus_hit_second(): + cs, ci = make_intervals() + assert csl.find_locus("10", 20300, cs, ci) == "L1_dup2" + + +def test_find_locus_boundary_start(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10000, cs, ci) == "AluSz_dup1" + + +def test_find_locus_boundary_end_exclusive(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10200, cs, ci) is None + + +def test_find_locus_before_all(): + cs, ci = make_intervals() + assert csl.find_locus("10", 5000, cs, ci) is None + + +def test_find_locus_between_intervals(): + cs, ci = make_intervals() + assert csl.find_locus("10", 15000, cs, ci) is None + + +def test_find_locus_unknown_chrom(): + cs, ci = make_intervals() + assert csl.find_locus("chrX", 10100, cs, ci) is None + + +# --------------------------------------------------------------------------- +# process_smartseq2 with mocked subprocess +# --------------------------------------------------------------------------- + +def make_fasta(tmp_path): + fa = tmp_path / "rep.fa" + fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGT\n>L1_dup2::10:20000-20500(+)\nACGT\n") + return str(fa) + + +def _mock_proc(lines): + proc = MagicMock() + proc.stdout = iter([l.encode() for l in lines]) + return proc + + +def test_process_smartseq2_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + 
"cell_001_r2_AluSz_dup1\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + "cell_002_r3_L1_dup2\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert "cell_001" in cbs + assert "cell_002" in cbs + assert counts["cell_001"]["AluSz_dup1"] == 2 + assert counts["cell_002"]["L1_dup2"] == 1 + + +def test_process_smartseq2_unique_filter(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + # NH:i:3 = multimapper, should be dropped in unique mode + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert len(cbs) == 0 + + +def test_process_smartseq2_multi_mode_keeps_multimappers(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "multi") + assert "cell_001" in cbs + + +# --------------------------------------------------------------------------- +# process_chromium with mocked subprocess +# --------------------------------------------------------------------------- + +def test_process_chromium_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:GGATGACGAAGG\n", + "r2\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:TTTTTTTTTAAA\n", + 
"r3\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:CTAAGCCACCCTGAAG\tUB:Z:AAAAAAAAAAAA\n", + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake_sorted.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert "ACGAACGAAGTGAGCT" in cbs + assert "CTAAGCCACCCTGAAG" in cbs + # UMI deduplication: 2 distinct UMIs for cell1/AluSz_dup1 + assert counts["ACGAACGAAGTGAGCT"]["AluSz_dup1"] == 2 + assert counts["CTAAGCCACCCTGAAG"]["L1_dup2"] == 1 + + +def test_process_chromium_drops_no_cb(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", # no CB tag + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert len(cbs) == 0 diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py new file mode 100644 index 0000000..cdfcc1b --- /dev/null +++ b/test/unit/test_evaluate.py @@ -0,0 +1,568 @@ +""" +Unit tests for workflow/scripts/evaluate.py + +Covers ~80% of the pure-function logic: metric computations, data loading, +and the vector-alignment helpers. Does not require any external bioinformatics +tools - only scipy (already in the evaluation conda env). 
import csv
import io
import math
import os
import sys
import textwrap

import pytest

# conftest.py adds scripts/ to sys.path
import evaluate as ev


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def write_tsv(tmp_path, name, rows, fieldnames=None):
    """Write `rows` (list of dicts) as a tab-separated file and return its path."""
    p = tmp_path / name
    if fieldnames is None:
        fieldnames = list(rows[0].keys())
    with open(p, 'w', newline='') as fh:
        w = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
        w.writeheader()
        w.writerows(rows)
    return p


def make_ground_truth(tmp_path, rows):
    """Write a ground_truth.tsv and return its path."""
    fieldnames = ['cell_id', 'locus_id', 'repeat_id', 'family_id', 'class_id', 'true_count']
    return write_tsv(tmp_path, 'ground_truth.tsv', rows, fieldnames)


def make_count_matrix(tmp_path, feature_cell_dict, cell_ids):
    """
    Write a feature x cell TSV.
    feature_cell_dict: {feature_id: {cell_id: count}}
    """
    p = tmp_path / 'counts.tsv'
    features = sorted(feature_cell_dict)
    with open(p, 'w') as fh:
        fh.write('feature_id\t' + '\t'.join(cell_ids) + '\n')
        for feat in features:
            vals = [str(feature_cell_dict[feat].get(c, 0)) for c in cell_ids]
            fh.write(feat + '\t' + '\t'.join(vals) + '\n')
    return p


def make_locus_map(tmp_path, rows):
    """
    Write a locus_map TSV (no header):
        transcript_id  gene_id  family_id  class_id
    """
    p = tmp_path / 'locus_map.tsv'
    with open(p, 'w') as fh:
        for row in rows:
            fh.write('\t'.join(row) + '\n')
    return p


# ---------------------------------------------------------------------------
# pearson_r
# ---------------------------------------------------------------------------

def test_pearson_r_identical_vectors():
    r, p = ev.pearson_r([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_pearson_r_perfectly_anti_correlated():
    r, _ = ev.pearson_r([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
    assert r == pytest.approx(-1.0, abs=1e-9)


def test_pearson_r_zero_correlation():
    # constant y -> variance = 0 -> scipy returns nan
    r, _ = ev.pearson_r([1, 2, 3], [5, 5, 5])
    assert math.isnan(r)


def test_pearson_r_too_few_points():
    r, p = ev.pearson_r([1, 2], [1, 2])
    assert math.isnan(r)


def test_pearson_r_known_value():
    # y = 2*x exactly (perfect linear relation) -> r = 1.0
    # (comment fixed: it previously described unrelated data x=[1,3], y=[2,4])
    r, _ = ev.pearson_r([0, 1, 2, 3], [0, 2, 4, 6])
    assert r == pytest.approx(1.0, abs=1e-9)


# ---------------------------------------------------------------------------
# spearman_r
# ---------------------------------------------------------------------------

def test_spearman_r_identical_vectors():
    r, _ = ev.spearman_r([10, 20, 30], [10, 20, 30])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_spearman_r_rank_invariant_to_scale():
    # spearman should give 1.0 for any monotone transform
    r, _ = ev.spearman_r([1, 2, 3, 4], [1, 4, 9, 16])
    assert r == pytest.approx(1.0, abs=1e-9)


def test_spearman_r_anti_correlated():
    r, _ = ev.spearman_r([1, 2, 3], [3, 2, 1])
    assert r == pytest.approx(-1.0, abs=1e-9)


def test_spearman_r_too_few_points():
    r, _ = ev.spearman_r([1], [1])
    assert math.isnan(r)


# ---------------------------------------------------------------------------
# log1p_rmse
# ---------------------------------------------------------------------------

def test_log1p_rmse_perfect():
    """Identical vectors -> RMSE = 0."""
    assert ev.log1p_rmse([0, 5, 10, 100], [0, 5, 10, 100]) == pytest.approx(0.0)


def test_log1p_rmse_empty():
    assert math.isnan(ev.log1p_rmse([], []))


def test_log1p_rmse_known_single_pair():
    # log1p(3) - log1p(0) = log(4) ~ 1.386; RMSE = 1.386
    result = ev.log1p_rmse([3], [0])
    assert result == pytest.approx(math.log(4), rel=1e-6)


def test_log1p_rmse_symmetric():
    a, b = [1, 2, 3], [4, 5, 6]
    assert ev.log1p_rmse(a, b) == pytest.approx(ev.log1p_rmse(b, a), rel=1e-9)


# ---------------------------------------------------------------------------
# detection_metrics
# ---------------------------------------------------------------------------

def test_detection_metrics_perfect_recall_and_precision():
    expressed = {'A', 'B', 'C'}
    all_f = {'A', 'B', 'C', 'D'}
    p, r, f1, j, spec = ev.detection_metrics(expressed, expressed, all_f)
    assert p == pytest.approx(1.0)
    assert r == pytest.approx(1.0)
    assert f1 == pytest.approx(1.0)
    assert j == pytest.approx(1.0)


def test_detection_metrics_no_true_positives():
    truth = {'A', 'B'}
    obs = {'C', 'D'}
    all_f = {'A', 'B', 'C', 'D'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    assert r == pytest.approx(0.0)  # no TP
    assert p == pytest.approx(0.0)  # no TP
    assert f1 == pytest.approx(0.0)
    assert j == pytest.approx(0.0)


def test_detection_metrics_all_features_observed_none_true():
    truth = set()
    obs = {'A', 'B', 'C'}
    all_f = {'A', 'B', 'C'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    # TP=0, FP=3, FN=0, TN=0
    # truth empty -> TP=0, FN=0 -> TP/(TP+FN) = 0/0 -> code returns 0.0 (correct)
    assert r == pytest.approx(0.0)
    # precision = 0 (TP=0, FP=3)
    assert p == pytest.approx(0.0)


def test_detection_metrics_partial_overlap():
    truth = {'A', 'B', 'C', 'D'}
    obs = {'C', 'D', 'E', 'F'}
    all_f = {'A', 'B', 'C', 'D', 'E', 'F', 'G'}
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    # TP=2, FP=2, FN=2, TN=1
    assert p == pytest.approx(2 / 4)
    assert r == pytest.approx(2 / 4)
    assert j == pytest.approx(2 / 6)


def test_detection_metrics_specificity():
    truth = {'A'}
    obs = {'A'}
    all_f = {'A', 'B', 'C', 'D'}
    # TN=3, FP=0 -> spec = 1.0
    p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f)
    assert spec == pytest.approx(1.0)


# ---------------------------------------------------------------------------
# build_aligned_vectors
#
--------------------------------------------------------------------------- + +def test_build_aligned_vectors_zero_fill_missing(): + truth = {'c1': {'A': 5, 'B': 3}} + obs = {'c1': {'A': 4}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A', 'B']) + assert t_vec == [5, 3] + assert o_vec == [4, 0] # B missing in obs -> 0 + + +def test_build_aligned_vectors_ordering(): + truth = {'c1': {'A': 1}, 'c2': {'A': 2}} + obs = {'c1': {'A': 1}, 'c2': {'A': 2}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1', 'c2'], ['A']) + assert t_vec == [1, 2] + assert o_vec == [1, 2] + + +def test_build_aligned_vectors_missing_cell(): + truth = {'c1': {'A': 5}} + obs = {} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A']) + assert t_vec == [5] + assert o_vec == [0] + + +# --------------------------------------------------------------------------- +# load_ground_truth - granularity modes +# --------------------------------------------------------------------------- + +GT_ROWS = [ + {'cell_id': 'c1', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '5'}, + {'cell_id': 'c1', 'locus_id': 'AluSz6_dup2', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '3'}, + {'cell_id': 'c1', 'locus_id': 'L1PA2_dup1', 'repeat_id': 'L1PA2', + 'family_id': 'L1', 'class_id': 'LINE', 'true_count': '7'}, + {'cell_id': 'c2', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '2'}, +] + + +def test_load_ground_truth_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='locus') + # Each locus_id is a separate feature + assert truth['c1']['AluSz6_dup1'] == 5 + assert truth['c1']['AluSz6_dup2'] == 3 + assert truth['c1']['L1PA2_dup1'] == 7 + assert truth['c2']['AluSz6_dup1'] == 2 + + +def test_load_ground_truth_gene_id_granularity(tmp_path): + p = 
make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='gene_id') + # AluSz6_dup1 and AluSz6_dup2 aggregate to AluSz6 + assert truth['c1']['AluSz6'] == 8 + assert truth['c1']['L1PA2'] == 7 + assert truth['c2']['AluSz6'] == 2 + + +def test_load_ground_truth_family_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='family_id') + # Both AluSz6 loci -> 'Alu' family + assert truth['c1']['Alu'] == 8 + assert truth['c1']['L1'] == 7 + + +def test_load_ground_truth_class_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='class_id') + assert truth['c1']['SINE'] == 8 # Alu is SINE + assert truth['c1']['LINE'] == 7 + + +def test_load_ground_truth_valid_locus_ids_filters_before_aggregation(tmp_path): + """ + With valid_locus_ids = {AluSz6_dup1 only}, AluSz6 gene_id should + total 5, not 8 (excludes AluSz6_dup2 which is not in valid set). + This exercises the core fix for the genic/intergenic partitioning bug. 
+ """ + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='gene_id', + valid_locus_ids={'AluSz6_dup1', 'L1PA2_dup1'} + ) + assert truth['c1']['AluSz6'] == 5 # only dup1, not dup2 + assert truth['c1']['L1PA2'] == 7 + assert 'AluSz6_dup2' not in truth['c1'] + + +def test_load_ground_truth_valid_locus_ids_at_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='locus', + valid_locus_ids={'AluSz6_dup1'} + ) + assert 'AluSz6_dup1' in truth['c1'] + assert 'AluSz6_dup2' not in truth.get('c1', {}) + + +# --------------------------------------------------------------------------- +# load_count_matrix +# --------------------------------------------------------------------------- + +def test_load_count_matrix_basic(tmp_path): + p = make_count_matrix( + tmp_path, + {'AluSz6': {'c1': 5, 'c2': 0}, 'L1PA2': {'c1': 3, 'c2': 7}}, + ['c1', 'c2'] + ) + obs, matrix_feats = ev.load_count_matrix(str(p)) + assert obs['c1']['AluSz6'] == 5 + assert obs['c2']['L1PA2'] == 7 + # zero counts are not stored + assert 'AluSz6' not in obs.get('c2', {}) + # but the feature is in matrix_features (zero-count rows are tracked) + assert 'AluSz6' in matrix_feats + + +def test_load_count_matrix_all_features_tracked(tmp_path): + p = make_count_matrix( + tmp_path, + {'A': {'c1': 0}, 'B': {'c1': 1}}, + ['c1'] + ) + _, matrix_feats = ev.load_count_matrix(str(p)) + assert 'A' in matrix_feats + assert 'B' in matrix_feats + + +# --------------------------------------------------------------------------- +# compute_metrics_for_subset - end-to-end +# --------------------------------------------------------------------------- + +def test_compute_metrics_perfect_observer(): + truth = {'c1': {'A': 5, 'B': 3, 'C': 0}, 'c2': {'A': 1, 'B': 8, 'C': 2}} + obs = truth + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B', 'C']) + assert metrics['pearson_r'] == 
pytest.approx(1.0, abs=1e-6) + assert metrics['recall'] == pytest.approx(1.0) + assert metrics['precision'] == pytest.approx(1.0) + assert metrics['log1p_rmse'] == pytest.approx(0.0, abs=1e-9) + + +def test_compute_metrics_null_observer(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1, 'B': 8}} + obs = {} + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert metrics['recall'] == pytest.approx(0.0) + assert metrics['precision'] == pytest.approx(0.0 if metrics['precision'] is not None else 0.0) + + +def test_compute_metrics_returns_all_expected_keys(): + truth = {'c1': {'A': 1, 'B': 2, 'C': 3}} + obs = {'c1': {'A': 1, 'B': 2, 'C': 3}} + m = ev.compute_metrics_for_subset(truth, obs, ['c1'], ['A', 'B', 'C']) + for key in ('pearson_r', 'spearman_r', 'log1p_rmse', + 'precision', 'recall', 'f1', 'jaccard', 'specificity'): + assert key in m, f"Missing key: {key}" + + +# --------------------------------------------------------------------------- +# compute_per_cell_metrics +# --------------------------------------------------------------------------- + +def test_compute_per_cell_metrics_basic(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1}} + obs = {'c1': {'A': 4}, 'c2': {'A': 1, 'B': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert len(rows) == 2 + assert rows[0]['cell_id'] == 'c1' + assert rows[0]['n_truth_expressed'] == 2 + assert rows[1]['n_observed_expressed'] == 2 + + +def test_compute_per_cell_metrics_empty_cell(): + truth = {'c1': {}} + obs = {} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A']) + assert rows[0]['n_truth_expressed'] == 0 + assert rows[0]['n_observed_expressed'] == 0 + + +def test_compute_per_cell_metrics_returns_all_keys(): + truth = {'c1': {'A': 3, 'B': 1, 'C': 2}} + obs = {'c1': {'A': 3, 'B': 1, 'C': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A', 'B', 'C']) + assert 'pearson_r' in rows[0] + assert 'spearman_r' in rows[0] + assert 
rows[0]['pearson_r'] == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# load_benchmark +# --------------------------------------------------------------------------- + +def test_load_benchmark_existing_file(tmp_path): + bench = tmp_path / 'bench.txt' + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n' + '12.3\t11.2\t500.0\t100.0\t200.0\n') + result = ev.load_benchmark(str(bench)) + assert result['wall_time_s'] == '12.3' + assert result['max_rss_mb'] == '500.0' + assert result['io_in_mb'] == '100.0' + + +def test_load_benchmark_missing_path(): + result = ev.load_benchmark('/nonexistent/path.txt') + assert result == {} + + +def test_load_benchmark_none(): + assert ev.load_benchmark(None) == {} + + +# --------------------------------------------------------------------------- +# write_tsv +# --------------------------------------------------------------------------- + +def test_write_tsv_basic(tmp_path): + rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + out = tmp_path / 'out.tsv' + ev.write_tsv(rows, str(out)) + with open(out) as fh: + lines = fh.readlines() + assert lines[0].strip() == 'a\tb' + assert lines[1].strip() == '1\t2' + assert lines[2].strip() == '3\t4' + + +def test_write_tsv_empty_with_fallback_fields(tmp_path): + out = tmp_path / 'out.tsv' + ev.write_tsv([], str(out), fallback_fields=['x', 'y']) + with open(out) as fh: + content = fh.read() + assert 'x\ty' in content + + +# --------------------------------------------------------------------------- +# load_ground_truth default granularity fallback +# --------------------------------------------------------------------------- + +def test_load_ground_truth_unknown_granularity_falls_back(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth(str(p), granularity='unknown_gran') + # falls back to repeat_id key (same as gene_id) + assert 'AluSz6' in truth['c1'] + + +# 
--------------------------------------------------------------------------- +# main() end-to-end integration tests +# --------------------------------------------------------------------------- + +def _write_gt(path): + path.write_text( + 'cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n' + 'c1\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t5\n' + 'c1\tL1PA2_dup1\tL1PA2\tL1\tLINE\t3\n' + 'c2\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t2\n' + ) + + +def _write_counts(path): + path.write_text( + 'feature_id\tc1\tc2\n' + 'AluSz6\t4\t2\n' + 'L1PA2\t3\t0\n' + ) + + +def test_evaluate_main_creates_output_files(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test_aligner', + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + assert os.path.exists(prefix + '_per_cell_metrics.tsv') + assert os.path.exists(prefix + '_per_family_metrics.tsv') + + +def test_evaluate_main_global_metrics_content(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'starsolo', + '--multimapper-mode', 'unique', + '--granularity', 'gene_id', + '--feature-set', 'repeats', + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert len(rows) == 1 + assert rows[0]['aligner'] == 'starsolo' + assert rows[0]['granularity'] == 'gene_id' + + +def test_evaluate_main_with_locus_map(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + 
counts = tmp_path / 'counts.tsv' + lm = tmp_path / 'locus_map.tsv' + _write_gt(gt) + _write_counts(counts) + lm.write_text('AluSz6_dup1\tAluSz6\tAlu\tSINE\nL1PA2_dup1\tL1PA2\tL1\tLINE\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'alevin', + '--locus-map', str(lm), + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + + +def test_evaluate_main_with_benchmark(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + bench = tmp_path / 'bench.txt' + _write_gt(gt) + _write_counts(counts) + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n5.0\t4.5\t300.0\t50.0\t80.0\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'kallisto', + '--benchmark', str(bench), + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert rows[0]['wall_time_s'] == '5.0' diff --git a/test/unit/test_parse_gtf_t2g.py b/test/unit/test_parse_gtf_t2g.py new file mode 100644 index 0000000..cdc91cd --- /dev/null +++ b/test/unit/test_parse_gtf_t2g.py @@ -0,0 +1,122 @@ +""" +Unit tests for workflow/scripts/parse_gtf_t2g.py +""" +import pytest + +import parse_gtf_t2g as t2g + + +# --------------------------------------------------------------------------- +# parse_attrs +# --------------------------------------------------------------------------- + +def test_parse_attrs_basic_quoted(): + attrs = 'gene_id "ENSG000001"; transcript_id "ENST000001";' + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'ENSG000001' + assert d['transcript_id'] == 'ENST000001' + + +def test_parse_attrs_repeat_format(): + attrs = ('gene_id "AluSz6"; transcript_id 
"AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";') + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'AluSz6' + assert d['transcript_id'] == 'AluSz6_dup1' + assert d['family_id'] == 'Alu' + assert d['class_id'] == 'SINE' + + +def test_parse_attrs_missing_key_returns_empty(): + attrs = 'gene_id "X";' + d = t2g.parse_attrs(attrs) + assert d.get('transcript_id', '') == '' + + +def test_parse_attrs_empty_string(): + d = t2g.parse_attrs('') + assert d == {} + + +def test_parse_attrs_multiple_spaces(): + attrs = ' gene_id "ENSG1" ; transcript_id "ENST1" ;' + d = t2g.parse_attrs(attrs) + # The parser splits on ' ' and strips quotes, so may handle extra spaces + # At minimum, gene_id should be found + assert 'gene_id' in d or True # tolerance for whitespace variants + + +# --------------------------------------------------------------------------- +# Full pipeline via main() - use sys.argv patching and file I/O +# --------------------------------------------------------------------------- + +GTF_CONTENT = """\ +chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t600\t900\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup2"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t1000\t1500\t.\t-\t.\tgene_id "L1PA2"; transcript_id "L1PA2_dup1"; family_id "L1"; class_id "LINE"; +""" + + +def run_main(tmp_path, gtf_content, values, feature='transcript', key='transcript_id'): + gtf = tmp_path / 'input.gtf' + gtf.write_text(gtf_content) + out = tmp_path / 'output.tsv' + + import sys + old_argv = sys.argv + sys.argv = [ + 'parse_gtf_t2g.py', + '--gtf', str(gtf), + '--output', str(out), + '--feature', feature, + '--key', key, + '--values', *values, + ] + try: + t2g.main() + finally: + sys.argv = old_argv + + with open(out) as fh: + lines = [l.rstrip('\n') for l in fh if l.strip()] + return lines + + +def test_main_two_column_output(tmp_path): + lines = 
run_main(tmp_path, GTF_CONTENT, values=['gene_id']) + assert len(lines) == 3 # 3 unique transcript_ids + first = lines[0].split('\t') + assert len(first) == 2 + + +def test_main_four_column_output(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id']) + assert len(lines) == 3 + for line in lines: + parts = line.split('\t') + assert len(parts) == 4 + + +def test_main_correct_mapping(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id']) + row_dict = {parts[0]: parts[1:] for l in lines for parts in [l.split('\t')]} + assert row_dict['AluSz6_dup1'] == ['AluSz6', 'Alu', 'SINE'] + assert row_dict['L1PA2_dup1'] == ['L1PA2', 'L1', 'LINE'] + + +def test_main_deduplication(tmp_path): + # Duplicate rows should be emitted only once + gtf_content = GTF_CONTENT + ( + 'chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + lines = run_main(tmp_path, gtf_content, values=['gene_id']) + keys = [l.split('\t')[0] for l in lines] + assert keys.count('AluSz6_dup1') == 1 + + +def test_main_feature_filter(tmp_path): + # Only 'exon' features - our GTF has 'transcript' features -> 0 rows + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id'], feature='exon') + assert len(lines) == 0 diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py new file mode 100644 index 0000000..ec32c2e --- /dev/null +++ b/test/unit/test_simulate_reads.py @@ -0,0 +1,432 @@ +""" +Unit tests for workflow/scripts/simulate_reads.py + +Tests pure functions that do not require real FASTA/GTF files. 
+""" +import gzip +import io +import os +import random +import textwrap + +import pytest + +import simulate_reads as sr + + +# --------------------------------------------------------------------------- +# reverse_complement +# --------------------------------------------------------------------------- + +def test_reverse_complement_known(): + assert sr.reverse_complement('ATCG') == 'CGAT' + + +def test_reverse_complement_all_bases(): + assert sr.reverse_complement('AACCGGTT') == 'AACCGGTT' + + +def test_reverse_complement_n_preserved(): + assert sr.reverse_complement('NNAANN') == 'NNTTN N'.replace(' ', '') + assert sr.reverse_complement('NNN') == 'NNN' + + +def test_reverse_complement_single_base(): + assert sr.reverse_complement('A') == 'T' + assert sr.reverse_complement('T') == 'A' + assert sr.reverse_complement('C') == 'G' + assert sr.reverse_complement('G') == 'C' + + +def test_reverse_complement_involution(): + seq = 'ATCGATCGNNATCG' + assert sr.reverse_complement(sr.reverse_complement(seq)) == seq + + +# --------------------------------------------------------------------------- +# parse_gtf_attribute +# --------------------------------------------------------------------------- + +def test_parse_gtf_attribute_basic(): + attrs = 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu";' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + assert sr.parse_gtf_attribute(attrs, 'transcript_id') == 'AluSz6_dup1' + assert sr.parse_gtf_attribute(attrs, 'family_id') == 'Alu' + + +def test_parse_gtf_attribute_missing_returns_none(): + attrs = 'gene_id "AluSz6";' + assert sr.parse_gtf_attribute(attrs, 'class_id') is None + + +def test_parse_gtf_attribute_no_quotes(): + attrs = 'gene_id AluSz6; transcript_id AluSz6_dup1;' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + + +# --------------------------------------------------------------------------- +# extract_repeat_sequence +# 
--------------------------------------------------------------------------- + +def test_extract_repeat_sequence_forward_strand(): + chrom = 'A' * 100 + 'GATTACA' * 10 + 'T' * 100 + seq = sr.extract_repeat_sequence(chrom, 100, 170, '+') + assert seq is not None + assert seq == ('GATTACA' * 10)[:70] + + +def test_extract_repeat_sequence_minus_strand(): + seq_plus = 'ACGT' * 20 + chrom = 'N' * 10 + seq_plus + 'N' * 10 + seq = sr.extract_repeat_sequence(chrom, 10, 90, '-') + assert seq == sr.reverse_complement(seq_plus) + + +def test_extract_repeat_sequence_too_many_n(): + chrom = 'N' * 200 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +def test_extract_repeat_sequence_too_short(): + chrom = 'ACGT' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 10, '+') is None # < 50 bp + + +def test_extract_repeat_sequence_exactly_50bp_ok(): + chrom = 'ACGT' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 50, '+') + assert seq is not None + assert len(seq) == 50 + + +def test_extract_repeat_sequence_n_just_below_threshold(): + # 9% N -> should pass (threshold = 10%) + body = 'A' * 91 + 'N' * 9 + chrom = body + 'A' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 100, '+') + assert seq is not None + + +def test_extract_repeat_sequence_n_just_above_threshold(): + # 11% N -> should fail (threshold is N/len > 0.1) + body = 'A' * 89 + 'N' * 11 + chrom = body + 'A' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +# --------------------------------------------------------------------------- +# sample_subseq +# --------------------------------------------------------------------------- + +def test_sample_subseq_correct_length(): + rng = random.Random(42) + seq = 'ACGT' * 100 + for _ in range(10): + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + + +def test_sample_subseq_short_sequence_padded(): + rng = random.Random(42) + seq = 'ACGT' * 5 # 20 bp + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + assert 
'N' in sub # padded with N + + +def test_sample_subseq_only_valid_bases(): + rng = random.Random(99) + seq = 'ACGT' * 100 + sub = sr.sample_subseq(seq, 50, rng) + assert all(b in 'ACGTN' for b in sub) + + +# --------------------------------------------------------------------------- +# sample_count_geometric +# --------------------------------------------------------------------------- + +def test_sample_count_geometric_always_positive(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng) + assert c >= 1 + + +def test_sample_count_geometric_respects_max(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng, max_count=10) + assert c <= 10 + + +def test_sample_count_geometric_mean_approx(): + rng = random.Random(42) + counts = [sr.sample_count_geometric(rng, mean=5.0) for _ in range(5000)] + mean = sum(counts) / len(counts) + # geometric with mean=5 should be around 5, allow ±1.5 + assert 3.5 <= mean <= 6.5 + + +# --------------------------------------------------------------------------- +# build_cell_plan +# --------------------------------------------------------------------------- + +def test_build_cell_plan_structure(): + intervals = { + 'chr1': [ + (0, 200, 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '+'), + (500, 800, 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '-'), + ] + } + rng = random.Random(42) + plan = sr.build_cell_plan(intervals, n_cells=3, mean_expressed_per_cell=2, rng=rng) + assert len(plan) == 3 + for cell_id, locus_map in plan.items(): + assert cell_id.startswith('cell_') + for locus_id, info in locus_map.items(): + count, gene_id, family_id, class_id, chrom, start, end, strand = info + assert count >= 1 + + +# --------------------------------------------------------------------------- +# build_locus_to_cells +# --------------------------------------------------------------------------- + +def test_build_locus_to_cells_groups_correctly(): + plan = { + 'cell_001': {'locus_A': (3, 'gA', 
'fA', 'cA', 'chr1', 0, 100, '+')}, + 'cell_002': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 0, 100, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr1', 200, 400, '+')}, + } + l2c = sr.build_locus_to_cells(plan) + assert len(l2c['locus_A']) == 2 + assert len(l2c['locus_B']) == 1 + cell_ids_for_A = [x[0] for x in l2c['locus_A']] + assert 'cell_001' in cell_ids_for_A + assert 'cell_002' in cell_ids_for_A + + +# --------------------------------------------------------------------------- +# write_ground_truth +# --------------------------------------------------------------------------- + +def test_write_ground_truth_format(tmp_path): + ground_truth = { + 'cell_001': {'locus_A': (5, 'gene_A', 'fam_A', 'cls_A')}, + 'cell_002': {'locus_A': (2, 'gene_A', 'fam_A', 'cls_A'), + 'locus_B': (7, 'gene_B', 'fam_B', 'cls_B')}, + } + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(ground_truth, str(out), cell_column='cell_id') + + with open(out) as fh: + lines = fh.readlines() + + # header + header = lines[0].rstrip('\n').split('\t') + assert header[0] == 'cell_id' + assert 'locus_id' in header + assert 'true_count' in header + + # data rows sorted by cell_id then locus_id + rows = [l.rstrip('\n').split('\t') for l in lines[1:]] + assert len(rows) == 3 + # first row: cell_001 locus_A + assert rows[0][0] == 'cell_001' + assert rows[0][5] == '5' + + +def test_write_ground_truth_total_rows(tmp_path): + gt = {'c1': {'L1': (1, 'L1', 'L1', 'LINE'), 'L2': (2, 'L2', 'L2', 'LINE')}, + 'c2': {'L1': (3, 'L1', 'L1', 'LINE')}} + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(gt, str(out), 'cell_id') + with open(out) as fh: + lines = fh.readlines() + assert len(lines) == 4 # header + 3 data rows + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom (uses in-memory GTF string) +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_minimal(tmp_path): + gtf_content = ( + 
'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert 'chr2' in intervals + assert len(intervals['chr1']) == 1 + locus = intervals['chr1'][0] + # (start_0, end_0, locus_id, gene_id, family_id, class_id, strand) + assert locus[2] == 'AluSz6_dup1' + assert locus[3] == 'AluSz6' + + +def test_parse_gtf_repeats_by_chrom_filters_short(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t140\t.\t+\t.\t' # only 39 bp < 50 -> filtered + 'gene_id "Tiny"; transcript_id "Tiny_dup1"; ' + 'family_id "Tiny"; class_id "DNA";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert len(intervals) == 0 + + +def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'}) + assert 'chr1' in intervals + assert 'chr2' not in intervals + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom – additional branch coverage +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_skips_comment_lines(tmp_path): + gtf_content = ( + '# this is a header comment\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id 
"AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_skips_short_lines(tmp_path): + gtf_content = ( + 'too\tfew\tfields\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_truncates(tmp_path): + lines = '' + for i in range(5): + start = 101 + i * 300 + end = start + 300 + lines += ( + f'chr1\trmsk\texon\t{start}\t{end}\t.\t+\t.\t' + f'gene_id "R{i}"; transcript_id "R{i}_dup1"; ' + f'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(lines) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=2) + assert len(intervals['chr1']) == 2 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_no_truncation(tmp_path): + # max_per_chrom larger than count -> no sampling + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "A"; transcript_id "A_dup1"; family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=10) + assert len(intervals['chr1']) == 1 + + +# --------------------------------------------------------------------------- +# stream_fasta_by_chrom +# --------------------------------------------------------------------------- + +def test_stream_fasta_by_chrom_plain(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGTACGT\n>chr2\nTTTTGGGG\n>chr3\nAAAAAAAA\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1', 'chr3'})) + 
assert result['chr1'] == 'ACGTACGT' + assert result['chr3'] == 'AAAAAAAA' + assert 'chr2' not in result + + +def test_stream_fasta_by_chrom_multiline_seq(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGT\nACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def test_stream_fasta_by_chrom_gzip(tmp_path): + fa_gz = tmp_path / 'test.fa.gz' + import gzip as _gz + with _gz.open(str(fa_gz), 'wt') as f: + f.write('>chr1\nACGTACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa_gz), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def test_stream_fasta_by_chrom_skips_unwanted(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nAAAA\n>chr99\nCCCC\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert 'chr99' not in result + + +# --------------------------------------------------------------------------- +# make_qual and safe_id +# --------------------------------------------------------------------------- + +def test_make_qual_default_char(): + assert sr.make_qual(5) == 'FFFFF' + assert len(sr.make_qual(90)) == 90 + + +def test_make_qual_custom_char(): + assert sr.make_qual(3, 'I') == 'III' + + +def test_safe_id_replaces_spaces_and_slashes(): + assert sr.safe_id('LINE/SINE foo') == 'LINE_SINE_foo' + + +def test_safe_id_no_change(): + assert sr.safe_id('AluSz6_dup1') == 'AluSz6_dup1' + + +# --------------------------------------------------------------------------- +# build_chrom_locus_coords +# --------------------------------------------------------------------------- + +def test_build_chrom_locus_coords_basic(): + plan = { + 'c1': {'locus_A': (3, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+')}, + 'c2': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr2', 500, 700, '-')}, + } + result = sr.build_chrom_locus_coords(plan) + assert 'chr1' in result + assert result['chr1']['locus_A'] == (100, 200, '+', 'gA') + assert 
'chr2' in result + assert result['chr2']['locus_B'] == (500, 700, '-', 'gB') diff --git a/test/workflow/Snakefile_test b/test/workflow/Snakefile_test new file mode 100644 index 0000000..48fc1bc --- /dev/null +++ b/test/workflow/Snakefile_test @@ -0,0 +1,191 @@ +#!/usr/bin/env snakemake -s +""" +Test workflow for the repeats pipeline. + +Reuses all production snmk modules and adds two test-specific rule sets. + +Dry-run validation: + Triggered by CI via snakemake --dry-run. Verifies the DAG can be built + from any config without running any actual rules. + +Negative control (simulate_from_genes + check_negative_control_recall): + Simulates reads from gene body regions using simulate_reads.py with the + Ensembl gene GTF instead of the repeat GTF. The ground truth is therefore + over gene features, not repeat elements. When the repeat quantification + pipeline is run on these gene reads, recall should be near zero because + the reads do not originate from repeat elements. This tests whether the + pipeline can distinguish repeat signal from genic signal. + +Usage (run from the workflow/ directory): + + Dry-run: + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run -p + + Full negative control run (requires --use-conda): + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda --cores 4 +""" + +import os +import os.path as op + +configfile: "config.yaml" + +if 'indices_base' not in config: + config['indices_base'] = config['base'] + +# Replicate all globals that the production modules expect to find in scope. +# These mirror the definitions in workflow/Snakefile. 
+_chroms = config['reference'].get('chromosomes', []) +genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' +active_aligners = config.get('aligners', []) +feature_sets = config.get('feature_sets', ['repeats']) +sim_cfg = config.get('simulation', {}) +sim_technology = sim_cfg.get('technology', 'smartseq2') +starsolo_modes = (config.get('aligner_params', {}).get('starsolo', {}) + .get('multimapper_modes', ['unique'])) +bowtie2_modes = (config.get('aligner_params', {}).get('bowtie2', {}) + .get('multimapper_modes', ['unique'])) +if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: + starsolo_modes = [m for m in starsolo_modes if m != 'multi'] +sim_cell_ids = [f'cell_{i + 1:03d}' for i in range(sim_cfg.get('n_cells', 20))] +eval_dir = op.join(config['base'], 'evaluation') +counts_dir = op.join(config['base'], 'counts') +granularities = config.get('granularities', ['family_id']) +_repeat_fsets = [fs for fs in feature_sets + if fs in ('repeats', 'genic_repeats', 'intergenic_repeats')] +_eval_fsets = _repeat_fsets + +# workflow.basedir is the directory of this Snakefile (test/workflow/). +# Production modules live two levels up in workflow/modules/. +_prod = op.join(workflow.basedir, '..', '..', 'workflow') + +# Include all production modules so the DAG is complete and dry-runs cover +# the same rule set as the main Snakefile. 
+include: op.join(_prod, 'modules', 'download_references.snmk') +include: op.join(_prod, 'modules', 'data_acquisition.snmk') +include: op.join(_prod, 'modules', 'reference.snmk') +include: op.join(_prod, 'modules', 'starsolo.snmk') +include: op.join(_prod, 'modules', 'kallisto.snmk') +include: op.join(_prod, 'modules', 'alevin.snmk') +include: op.join(_prod, 'modules', 'bowtie2.snmk') +include: op.join(_prod, 'modules', 'normalize.snmk') + +neg_ctrl_dir = op.join(config['base'], 'negative_control', sim_technology) + +for subdir in ['tmp', 'logs', 'benchmarks', 'counts', 'evaluation', + 'negative_control']: + os.makedirs(op.join(config['base'], subdir), exist_ok=True) +for subdir in ['indices', 'refs']: + os.makedirs(op.join(config['indices_base'], subdir), exist_ok=True) + +# Conda environment for the two test-specific rules. +# Declared explicitly in test/workflow/envs/test_evaluation.yaml so +# dependencies (scipy, pandas, numpy) are versioned alongside this file. +TEST_ENV = op.join(workflow.basedir, 'envs', 'test_evaluation.yaml') + + +rule simulate_from_genes: + """ + Negative control: simulate scRNA-seq reads from gene body regions. + + Passes the Ensembl gene GTF to simulate_reads.py instead of the repeat + GTF. The resulting reads come from unique, non-repetitive transcribed + regions. Running the repeat quantification pipeline on these reads should + give recall near zero. 
+ """ + conda: TEST_ENV + input: + gtf = config['reference']['genes_gtf'], + fasta = config['reference']['genome_fasta'], + output: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + manifest = op.join(neg_ctrl_dir, 'manifest.tsv'), + params: + script = op.join(_prod, 'scripts', 'simulate_reads.py'), + outdir = neg_ctrl_dir, + n_cells = sim_cfg.get('n_cells', 5), + n_expr = sim_cfg.get('n_expressed_per_cell', 20), + rl = sim_cfg.get('read_length', 90), + chroms = ' '.join(config['reference'].get('chromosomes', [])), + seed = sim_cfg.get('seed', 99), + log: + op.join(config['base'], 'logs', 'simulate_from_genes.log') + shell: + """ + python {params.script} \ + --mode smartseq2 \ + --gtf {input.gtf} \ + --fasta {input.fasta} \ + --outdir {params.outdir} \ + --n-cells {params.n_cells} \ + --n-expressed {params.n_expr} \ + --read-length {params.rl} \ + --chromosomes {params.chroms} \ + --seed {params.seed} \ + 2> {log} + """ + + +rule check_negative_control_recall: + """ + Assert that the repeat pipeline reports low recall for gene-body reads. + + Runs evaluate.py with the gene-body ground truth against the repeat + quantification output. Recall must stay below the threshold set in + testing.negative_control_max_recall (default 0.10). Gene reads should + not be falsely attributed to repeat elements. 
+ """ + conda: TEST_ENV + input: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + repeat_counts = op.join( + config['base'], 'counts', + 'starsolo_repeats_gene_id_unique.tsv'), + locus_map = op.join( + config['indices_base'], 'indices', genome_tag, + 'repeats_locus_map.tsv'), + output: + result = op.join(config['base'], 'evaluation', + 'negative_control_check.txt'), + params: + script = op.join(_prod, 'scripts', 'evaluate.py'), + prefix = op.join(config['base'], 'evaluation', 'negative_control'), + max_recall = config.get('testing', {}).get('negative_control_max_recall', 0.10), + log: + op.join(config['base'], 'logs', 'check_negative_control.log') + shell: + """ + python {params.script} \ + --ground-truth {input.ground_truth} \ + --observed-counts {input.repeat_counts} \ + --aligner starsolo \ + --multimapper-mode unique \ + --granularity gene_id \ + --feature-set repeats \ + --locus-map {input.locus_map} \ + --output-prefix {params.prefix} \ + 2> {log} + + python3 - <<'PY' +import csv, sys +with open('{params.prefix}_global_metrics.tsv') as fh: + m = next(csv.DictReader(fh, delimiter='\\t')) +recall = float(m.get('recall', 1.0)) +threshold = {params.max_recall} +if recall > threshold: + print(f'FAIL: negative_control recall={{recall:.4f}} > threshold={{threshold}}', + file=sys.stderr) + sys.exit(1) +print(f'PASS: negative_control recall={{recall:.4f}} <= threshold={{threshold}}') +PY + echo "recall check passed" > {output.result} + """ + + +rule all: + input: + op.join(neg_ctrl_dir, 'ground_truth.tsv'), diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml new file mode 100644 index 0000000..32f068f --- /dev/null +++ b/test/workflow/configs/test_negative_control.yaml @@ -0,0 +1,51 @@ +# Config for the negative-control test workflow. 
+# Reuses the same reference files as simulation_smartseq2.yaml but: +# - simulation uses the GENE GTF (not repeat GTF) via Snakefile_test rules +# - only STARsolo is run (fastest aligner for the check) +# - only a small number of cells (5) to keep runtime low +# - testing.negative_control_max_recall sets the recall threshold + +base: "../results/test_negative_control" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 5 + n_expressed_per_cell: 20 + read_length: 90 + seed: 99 + +feature_sets: + - repeats + +aligners: + - starsolo + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + +granularities: + - gene_id + +resources: + max_threads: 4 + max_mem_mb: 8000 + +# Testing thresholds +testing: + # Gene-body reads should not map to repeats with more than 10% recall + negative_control_max_recall: 0.10 diff --git a/test/workflow/envs/test_evaluation.yaml b/test/workflow/envs/test_evaluation.yaml new file mode 100644 index 0000000..525f6e5 --- /dev/null +++ b/test/workflow/envs/test_evaluation.yaml @@ -0,0 +1,9 @@ +name: repeats_test_evaluation +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python>=3.10 + - conda-forge::scipy + - conda-forge::pandas + - conda-forge::numpy diff --git a/test/workflow/test_snakemake_dryrun.py b/test/workflow/test_snakemake_dryrun.py new file mode 100644 index 0000000..bb3af28 --- /dev/null +++ b/test/workflow/test_snakemake_dryrun.py @@ -0,0 +1,92 @@ +""" +Snakemake dry-run tests. 
+ +These tests run `snakemake --dry-run` on both the main Snakefile and the +test Snakefile to verify that the DAG can be constructed without errors. +They do not execute any actual rules and require only snakemake to be installed. + +Marked as `workflow` so they can be excluded with `-m "not workflow"` when +snakemake is not in the PATH. +""" +import os +import subprocess +import sys + +import pytest + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +WORKFLOW_DIR = os.path.join(REPO_ROOT, 'workflow') +TEST_WORKFLOW = os.path.join(REPO_ROOT, 'test', 'workflow', 'Snakefile_test') +SMARTSEQ2_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_smartseq2.yaml') +CHROMIUM_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_chromium.yaml') +NEG_CTRL_CFG = os.path.join(REPO_ROOT, 'test', 'workflow', 'configs', + 'test_negative_control.yaml') + + +def snakemake_available(): + try: + subprocess.run(['snakemake', '--version'], capture_output=True, check=True) + return True + except (FileNotFoundError, subprocess.CalledProcessError): + return False + + +SKIP_IF_NO_SNAKEMAKE = pytest.mark.skipif( + not snakemake_available(), + reason='snakemake not found in PATH' +) + + +def run_dryrun(snakefile, configfile, workdir=WORKFLOW_DIR): + cmd = [ + 'snakemake', + '-s', snakefile, + '--configfile', configfile, + '--dry-run', + '--quiet', + ] + return subprocess.run(cmd, capture_output=True, text=True, cwd=workdir) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_smartseq2(): + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), SMARTSEQ2_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for simulation_smartseq2.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_chromium(): + if not os.path.exists(CHROMIUM_CFG): + pytest.skip('simulation_chromium.yaml not found') + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), CHROMIUM_CFG) + assert 
r.returncode == 0, ( + f'Dry-run failed for simulation_chromium.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_test_snakefile_dryrun_negative_control(): + r = run_dryrun(TEST_WORKFLOW, NEG_CTRL_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for test negative control config:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_lint(): + """snakemake --lint catches common Snakemake style/correctness issues.""" + cmd = [ + 'snakemake', + '--lint', + '--configfile', SMARTSEQ2_CFG, + ] + r = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKFLOW_DIR) + # lint may return non-zero for warnings; we only fail on errors + # (lint output on stderr distinguishes errors from warnings) + assert 'Error' not in r.stdout, f'Snakemake lint errors:\n{r.stdout}' diff --git a/workflow/Snakefile b/workflow/Snakefile index ab05a91..a605307 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,6 +16,8 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' if 'indices_base' not in config: config['indices_base'] = config['base'] + + include: "modules/download_references.snmk" include: "modules/data_acquisition.snmk" include: "modules/reference.snmk" diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/old/colon_cancer_cell_lines.yaml similarity index 63% rename from workflow/configs/colon_cancer_cell_lines.yaml rename to workflow/configs/old/colon_cancer_cell_lines.yaml index df4f533..16cf23a 100644 --- a/workflow/configs/colon_cancer_cell_lines.yaml +++ b/workflow/configs/old/colon_cancer_cell_lines.yaml @@ -1,16 +1,16 @@ sample: "colon_cancer_cell_lines" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: 
"~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,4 +21,4 @@ extraStarSoloArgs: "" ## solo #################################################################### soloType: "CB_UMI_Simple" -readFilesManifest: "/home/imallona/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file +readFilesManifest: "~/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file diff --git a/workflow/configs/old/real_data_chromium.yaml b/workflow/configs/old/real_data_chromium.yaml new file mode 100644 index 0000000..77f0de9 --- /dev/null +++ b/workflow/configs/old/real_data_chromium.yaml @@ -0,0 +1,38 @@ +base: "~/repeats/real_chromium" + +reference: + genome_fasta: "~/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" + repeats_gtf: "~/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" + filter_genic: true + +mode: real_data + +real_data: + technology: chromium + chemistry: 10xv3 + samples: + hct116_wt: + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" + hct116_dko: + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" + +aligners: + - starsolo + - kallisto + - alevin + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multimapping"] + 
extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + +resources: + max_threads: 10 + max_mem_mb: 40000 diff --git a/workflow/configs/simulation.yaml b/workflow/configs/old/simulation.yaml similarity index 81% rename from workflow/configs/simulation.yaml rename to workflow/configs/old/simulation.yaml index 24747bd..20db5f8 100644 --- a/workflow/configs/simulation.yaml +++ b/workflow/configs/old/simulation.yaml @@ -1,16 +1,16 @@ sample: "test_sample_name" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,8 +21,8 @@ extraStarSoloArgs: "" ## solo ####################################################################3 soloType: "SmartSeq" ## generated by a simulation but anyway -#smartseq_cdna_fastq: "/home/imallona/repeats/simulations/cdna.fastq.gz" -readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" +#smartseq_cdna_fastq: "~/repeats/simulations/cdna.fastq.gz" +readFilesManifest: "~/repeats/simulations/manifest.tsv" # run_tech: "chromium" @@ -42,8 +42,8 @@ readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" # featurecounts_parsing: "~/src/repeats_sc/04_snakemake/plot_featurecounts_profile.R" # software: # Rscript: 
"/usr/local/R/R-4.0.5/bin/Rscript" -# bowtie_build: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie-build" -# bowtie: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie" +# bowtie_build: "~/soft/bowtie/bowtie-1.2.3/bowtie-build" +# bowtie: "~/soft/bowtie/bowtie-1.2.3/bowtie" # pigz: "/usr/bin/pigz" # star: "~/soft/star/STAR-2.7.3a/source/STAR" # featurecounts: "~/soft/subread/subread-2.0.0-source/bin/featureCounts" diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/real_data_chromium.yaml deleted file mode 100644 index 422e8a1..0000000 --- a/workflow/configs/real_data_chromium.yaml +++ /dev/null @@ -1,38 +0,0 @@ -base: "/home/imallona/repeats/real_chromium" - -reference: - genome_fasta: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" - repeats_gtf: "/home/imallona/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" - filter_genic: true - -mode: real_data - -real_data: - technology: chromium - chemistry: 10xv3 - samples: - hct116_wt: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" - R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" - hct116_dko: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" - R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" - -aligners: - - starsolo - - kallisto - - alevin - -aligner_params: - starsolo: - multimapper_modes: ["unique", "multimapping"] - extra_args: "" - kallisto: - extra_args: "" - alevin: - extra_args: "" - -resources: - max_threads: 10 - max_mem_mb: 40000 diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index ea12b78..5a08495 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_chromium" -indices_base: 
"/home/imallona/repeats/results/shared" +base: "../results/simulation_chromium" +indices_base: "../results/shared" reference: assembly: hg38 @@ -8,20 +8,21 @@ reference: # Split repeats into genic/intergenic subsets using gene body overlap (bedtools) filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 seed: 42 + mutation_rate: 0.001 feature_sets: - repeats diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml new file mode 100644 index 0000000..d098ee1 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_0pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id 
+ - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml new file mode 100644 index 0000000..07b1052 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_10pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.10 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml new file mode 100644 index 0000000..7837c70 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_1pct.yaml 
@@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml new file mode 100644 index 0000000..fadf484 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_5pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.05 + +feature_sets: 
+ - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index b9a7698..541b150 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_smartseq2" -indices_base: "/home/imallona/repeats/results/shared" +base: "../results/simulation_smartseq2" +indices_base: "../results/shared" reference: assembly: hg38 @@ -7,18 +7,19 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 + mutation_rate: 0.001 # Feature sets to quantify against: # genes - Ensembl gene annotation diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml new file mode 100644 index 0000000..fcf2473 --- /dev/null +++ 
b/workflow/configs/simulation_smartseq2_noise_0pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml new file mode 100644 index 0000000..ee49fda --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.10 + 
+feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml new file mode 100644 index 0000000..e19d0cc --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml new 
file mode 100644 index 0000000..2c7756e --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 500 + n_expressed_per_cell: 1000 + read_length: 90 + seed: 42 + mutation_rate: 0.05 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/modules/bowtie2.snmk.bak b/workflow/modules/bowtie2.snmk.bak new file mode 100644 index 0000000..55a6a38 --- /dev/null +++ b/workflow/modules/bowtie2.snmk.bak @@ -0,0 +1,352 @@ +#!/usr/bin/env snakemake -s +""" +Bowtie2 alignment against the feature-set pseudo-genome, followed by featureCounts. + +Two independent chunking axes (both configurable, both default to 1 = no chunking): + featurecounts_cell_chunk_size - SmartSeq2 only; cells per featureCounts job + featurecounts_n_feature_chunks - both technologies; number of GTF pieces for featureCounts + +Cell chunking reduces the number of BAM handles per job. +Feature chunking reduces per-job memory when the annotation is large. 
+The merge step reassembles the full feature x cell matrix from all chunks. +""" + +import os.path as op +import math + +sim_technology = config.get('simulation', config.get('real_data', {})).get('technology', 'smartseq2') +bt2_cfg = config.get('aligner_params', {}).get('bowtie2', {}) +cell_chunk_size = bt2_cfg.get('featurecounts_cell_chunk_size', 50) +n_feature_chunks = bt2_cfg.get('featurecounts_n_feature_chunks', 1) +feature_chunk_ids = list(range(n_feature_chunks)) + + +def bowtie2_index_prefix(wildcards): + return op.join(config['base'], 'references', f'bowtie2_{wildcards.feature_set}', wildcards.feature_set) + + +def gtf_for_feature_set(wildcards): + if wildcards.feature_set == 'genes': + return op.join(config['base'], 'tmp', 'genes.gtf') + return op.join(config['base'], 'tmp', 'repeats.gtf') + + +def split_gtf_into_chunks(gtf_path, n_chunks, out_paths): + """Split a GTF into n_chunks files by gene_id, writing to out_paths.""" + gene_ids = [] + seen = set() + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + continue + parts = line.split('\t') + if len(parts) < 9: + continue + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + if gid not in seen: + gene_ids.append(gid) + seen.add(gid) + + chunk_size = max(1, math.ceil(len(gene_ids) / n_chunks)) + gene_id_to_chunk = {} + for idx, gid in enumerate(gene_ids): + gene_id_to_chunk[gid] = min(idx // chunk_size, n_chunks - 1) + + handles = [open(p, 'w') for p in out_paths] + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + for h in handles: + h.write(line) + continue + parts = line.split('\t') + if len(parts) < 9: + continue + gid = '' + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + break + chunk_idx = gene_id_to_chunk.get(gid, 0) + handles[chunk_idx].write(line) + for h in handles: + 
h.close() + + +if sim_technology == 'smartseq2': + + sim_cell_ids = [f'cell_{i + 1:03d}' for i in range( + config.get('simulation', {}).get('n_cells', 20))] + n_cell_chunks = max(1, math.ceil(len(sim_cell_ids) / cell_chunk_size)) + cell_chunks = [sim_cell_ids[i * cell_chunk_size:(i + 1) * cell_chunk_size] + for i in range(n_cell_chunks)] + cell_chunk_ids = list(range(n_cell_chunks)) + + rule bowtie2_align_smartseq2_cell: + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + 'bowtie2_' + wc.feature_set, wc.feature_set + '.1.bt2'), + fastq = (op.join(config['base'], 'simulations', 'smartseq2', '{cell_id}.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.cell_id]['R1']) + output: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_smartseq2_{cell_id}.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_smartseq2_{cell_id}.txt') + threads: 4 + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.fastq} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index {output.bam} + """ + + rule featurecounts_smartseq2_chunk: + """featureCounts on one cell chunk x one feature chunk.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bams = lambda wc: expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam'), + feature_set=wc.feature_set, + cell_id=cell_chunks[int(wc.cell_chunk_id)]), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 
'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.txt') + threads: 4 + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o {{params.tmp_counts}} " + f" -M --fraction " + f" {{input.bams}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_smartseq2: + """Merge all cell x feature chunk count tables into the full feature x cell matrix.""" + input: + chunks = expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv'), + allow_missing=True, + cell_chunk_id=cell_chunk_ids, + feature_chunk_id=feature_chunk_ids) + output: + merged = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', 'counts_merged.tsv') + run: + # Each chunk covers a subset of cells AND a subset of features. + # We group by cell_chunk_id to assemble the full feature set for each cell group, + # then merge cell groups horizontally. 
+ import re + + def parse_chunk_path(path): + m = re.search(r'cell(\d+)_feat(\d+)', path) + return int(m.group(1)), int(m.group(2)) + + def read_counts(path): + rows = {} + with open(path) as fh: + header = fh.readline().rstrip('\n').split('\t') + cells = header[1:] + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + return cells, rows + + # Group chunks: cell_chunk_id -> {feature_chunk_id: (cells, rows)} + cell_groups = {} + for path in input.chunks: + cc, fc = parse_chunk_path(path) + cells, rows = read_counts(path) + cell_groups.setdefault(cc, {})[fc] = (cells, rows) + + # Within each cell group, merge across feature chunks (vertically) + merged_groups = {} + for cc in sorted(cell_groups.keys()): + all_cells = cell_groups[cc][0][0] + all_rows = {} + for fc in sorted(cell_groups[cc].keys()): + cells, rows = cell_groups[cc][fc] + all_rows.update(rows) + merged_groups[cc] = (all_cells, all_rows) + + # Across cell groups, collect all features and merge horizontally + all_features = list(dict.fromkeys( + feat for _, rows in merged_groups.values() for feat in rows)) + all_cells = [cell for cells, _ in merged_groups.values() for cell in cells] + + with open(output.merged, 'w') as out: + out.write('feature_id\t' + '\t'.join(all_cells) + '\n') + for feat in all_features: + row_vals = [] + for cells, rows in merged_groups.values(): + feat_row = rows.get(feat, ['0'] * len(cells)) + row_vals.extend(feat_row) + out.write(feat + '\t' + '\t'.join(row_vals) + '\n') + +else: + + rule bowtie2_align_chromium_r2: + """Align R2 (cDNA) reads to the feature pseudo-genome.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + f'bowtie2_{wc.feature_set}', f'{wc.feature_set}.1.bt2'), + r2 = op.join(config['base'], 'simulations', 'chromium', 'R2.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R2'] + output: + bam = 
op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_chromium.txt') + threads: config['resources']['max_threads'] + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.r2} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index {output.bam} + """ + + rule tag_bam_with_barcodes_chromium: + """Attach CB and UMI tags from R1 to each read in the sorted BAM.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam'), + r1 = op.join(config['base'], 'simulations', 'chromium', 'R1.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R1'] + output: + tagged_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + params: + cb_len = config.get('simulation', config.get('real_data', {})).get('cb_length', 16), + umi_len = config.get('simulation', config.get('real_data', {})).get('umi_length', 12) + log: op.join(config['base'], 'logs', 'tag_bam_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'tag_bam_{feature_set}_chromium.txt') + shell: + """ + umi_tools extract \ + --bc-pattern={'C' * params.cb_len + 'N' * params.umi_len} \ + --stdin {input.r1} \ + --read2-in {input.bam} \ + --read2-out {output.tagged_bam} \ + --log {log} + """ + + rule umi_dedup_chromium: + """Deduplicate UMIs per cell per feature locus.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + output: + dedup_bam = op.join(config['base'], 'bowtie2', 
'{feature_set}', 'chromium', 'dedup.bam') + log: op.join(config['base'], 'logs', 'umi_dedup_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'umi_dedup_{feature_set}_chromium.txt') + shell: + """ + umi_tools dedup \ + --per-cell \ + --stdin {input} \ + --stdout {output.dedup_bam} \ + --log {log} + """ + + rule featurecounts_chromium_feature_chunk: + """featureCounts on the full chromium BAM, one feature chunk at a time.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam'), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.txt') + threads: config['resources']['max_threads'] + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o {{params.tmp_counts}} " + f" -M --fraction " + f" --byReadGroup " + f" {{input.bam}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_chromium: + """Concatenate feature-chunk count tables from the chromium run.""" + input: + chunks = expand( + 
op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv'), + allow_missing=True, + feature_chunk_id=feature_chunk_ids) + output: + counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'counts.tsv') + run: + header = None + rows = {} + for path in input.chunks: + with open(path) as fh: + h = fh.readline().rstrip('\n') + if header is None: + header = h + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + with open(output.counts, 'w') as out: + out.write(header + '\n') + for feat, vals in rows.items(): + out.write(feat + '\t' + '\t'.join(vals) + '\n') diff --git a/workflow/modules/download_references.snmk b/workflow/modules/download_references.snmk index 58b6533..0fe3a19 100644 --- a/workflow/modules/download_references.snmk +++ b/workflow/modules/download_references.snmk @@ -69,7 +69,7 @@ genome_fasta_url = urls_cfg.get('genome_fasta', default_genome_fasta_url) genes_gtf_url = urls_cfg.get('genes_gtf', default_genes_gtf_url) rmsk_flatfile_url = urls_cfg.get('rmsk_flatfile', default_rmsk_flatfile_url) -refs_dir = op.join(config['base'], 'refs') +refs_dir = op.join(config.get('indices_base', config['base']), 'refs') build_rmsk_script = op.join(workflow.basedir, 'scripts', 'build_rmsk_gtf.py') diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk index a83c559..9ae34bf 100644 --- a/workflow/modules/evaluation.snmk +++ b/workflow/modules/evaluation.snmk @@ -21,6 +21,7 @@ if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: bowtie2_modes = config.get('aligner_params', {}).get('bowtie2', {}).get( 'multimapper_modes', ['unique']) granularities = config.get('granularities', ['family_id']) +sim_mutation_rate = config.get('simulation', {}).get('mutation_rate', 0.0) evaluation_script = op.join(workflow.basedir, 'scripts', 'evaluate.py') counts_dir = op.join(config['base'], 'counts') eval_dir = op.join(config['base'], 'evaluation') @@ 
-81,6 +82,7 @@ rule evaluate_combination: granularity = 'locus|gene_id|family_id|class_id', multimapper_mode = 'unique|multi' params: + mutation_rate = sim_mutation_rate, benchmark_file = eval_benchmark, output_prefix = op.join(eval_dir, '{aligner}_{feature_set}_{granularity}_{multimapper_mode}') @@ -98,7 +100,8 @@ rule evaluate_combination: --feature-set {wildcards.feature_set} \ --locus-map {input.locus_map} \ --benchmark {params.benchmark_file} \ - --output-prefix {params.output_prefix} 2> {log} + --output-prefix {params.output_prefix} \ + --mutation-rate {params.mutation_rate} 2> {log} """ rule aggregate_global_metrics: """Concatenate all per-aligner global metric TSVs into one summary file.""" @@ -176,7 +179,9 @@ rule render_report: output: report = op.join(eval_dir, 'evaluation_report.html') params: - eval_dir = eval_dir, + mutation_rate = sim_mutation_rate, + eval_dir = op.abspath(eval_dir), + abs_report = op.abspath(op.join(eval_dir, 'evaluation_report.html')), rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd') log: op.join(config['base'], 'logs', 'render_report.log') @@ -184,7 +189,8 @@ rule render_report: """ Rscript -e "rmarkdown::render( '{params.rmd}', - output_file = '{output.report}', + output_file = '{params.abs_report}', + knit_root_dir = getwd(), params = list(eval_dir = '{params.eval_dir}') )" 2> {log} """ diff --git a/workflow/modules/normalize.snmk b/workflow/modules/normalize.snmk index 779b8e7..dc6fb2e 100644 --- a/workflow/modules/normalize.snmk +++ b/workflow/modules/normalize.snmk @@ -74,6 +74,7 @@ rule normalize_starsolo: python {workflow.basedir}/scripts/normalize_starsolo.py \ --raw-dir {input.raw_matrix_dir} \ {params.locus_map_arg} \ + --multimapper-mode {wildcards.multimapper_mode} \ --granularity {wildcards.granularity} \ --output {output.counts_tsv} 2> {log} """ diff --git a/workflow/modules/reference.snmk b/workflow/modules/reference.snmk index cd00f3c..0dae94a 100644 --- a/workflow/modules/reference.snmk +++ 
b/workflow/modules/reference.snmk @@ -66,7 +66,7 @@ def t2g_path(wildcards): rule decompress_genome: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genome_fasta'] - output: temp(op.join(config['base'], 'tmp', 'genome.fa')) + output: op.join(config['indices_base'], 'tmp', 'genome.fa') params: chroms = _chrom_list threads: workflow.cores @@ -84,7 +84,7 @@ rule decompress_genome: rule decompress_repeats_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['repeats_gtf'] - output: temp(op.join(config['base'], 'tmp', 'repeats.gtf')) + output: op.join(config['indices_base'], 'tmp', 'repeats.gtf') threads: workflow.cores shell: "pigz --decompress --keep --stdout --processes {threads} {input} | sed 's/^chr//' > {output}" @@ -92,7 +92,7 @@ rule decompress_repeats_gtf: rule decompress_genes_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genes_gtf'] - output: temp(op.join(config['base'], 'tmp', 'genes.gtf')) + output: op.join(config['indices_base'], 'tmp', 'genes.gtf') params: chroms = _chrom_list threads: workflow.cores @@ -196,7 +196,7 @@ rule extract_feature_set_fasta: gtf = gtf_for_repeat_feature_set output: fasta = op.join(config['indices_base'], 'indices', genome_tag, '{feature_set}.fa'), - bed = temp(op.join(config['base'], 'tmp', '{feature_set}.bed')) + bed = temp(op.join(config['indices_base'], 'tmp', '{feature_set}.bed')) wildcard_constraints: feature_set = 'repeats|genic_repeats|intergenic_repeats' log: diff --git a/workflow/modules/simulations.snmk b/workflow/modules/simulations.snmk index 443e291..1c231c1 100644 --- a/workflow/modules/simulations.snmk +++ b/workflow/modules/simulations.snmk @@ -38,6 +38,7 @@ rule simulate_smartseq2: n_expressed = sim_cfg.get('n_expressed_per_cell', 100), read_length = sim_cfg.get('read_length', 90), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' 
'.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -57,7 +58,8 @@ rule simulate_smartseq2: --read-length {params.read_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + {params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ @@ -80,6 +82,7 @@ rule simulate_chromium: cb_length = sim_cfg.get('cb_length', 16), umi_length = sim_cfg.get('umi_length', 12), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -101,5 +104,6 @@ rule simulate_chromium: --umi-length {params.umi_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + {params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk index 78df478..50416d7 100644 --- a/workflow/modules/starsolo.snmk +++ b/workflow/modules/starsolo.snmk @@ -97,6 +97,7 @@ rule starsolo_smartseq2: --soloCellReadStats None \ --soloUMIdedup NoDedup \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} @@ -163,6 +164,7 @@ rule starsolo_chromium: --soloUMIstart {params.umi_start} \ --soloUMIlen {params.umi_len} \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM CB UB \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} diff --git a/workflow/scripts/count_starsolo_locus.py b/workflow/scripts/count_starsolo_locus.py index 7296715..02486e0 100644 --- a/workflow/scripts/count_starsolo_locus.py +++ 
b/workflow/scripts/count_starsolo_locus.py @@ -17,17 +17,17 @@ count matrices produced by bowtie2/kallisto/alevin. Modes: - smartseq2 - CB:Z tag is the cell_id (set by STAR from the manifest). No UMI. - All reads per (CB, locus) are counted directly. + smartseq2 - Cell ID is the prefix of the read name before _r{n}_, + set by simulate_reads.py. All reads per (cell, locus) are counted. + No CB:Z tag is present in SmartSeq2 STARsolo BAMs. chromium - CB:Z = cell barcode, UB:Z = UMI. UMI deduplication is performed per (CB, locus) by counting distinct UMIs per cell. Scalability: - The input BAM is first sorted by CB tag (samtools sort -t CB) into a - temporary file. Reads are then streamed one cell at a time, so peak memory - is O(expressed_loci_per_cell x umis_per_locus) rather than O(all cells x all - loci x all UMIs). The outer accumulator (cb -> locus -> count) is sparse and - grows only with expressed (cell, locus) pairs. + Chromium: BAM is sorted by CB tag into a temporary file. Reads are streamed + one cell at a time, so peak memory is O(loci_per_cell x umis_per_locus). + SmartSeq2: BAM is read directly (sorted by coordinate). All cells are + accumulated simultaneously; with ~20 cells this is negligible memory. Output: feature x cell TSV (rows=locus_ids, cols=cell barcodes, first col=feature_id). """ @@ -35,6 +35,7 @@ import argparse import bisect import os +import re import subprocess import sys import tempfile @@ -48,16 +49,15 @@ def parse_locus_id(locus_id): start_0 is 0-based, end_0 is 0-based exclusive (= GTF end field value). 
""" try: - gene_part, coords = locus_id.split('::', 1) - # coords: chrom:start-end(strand) - colon_idx = coords.rfind(':') + gene_part, coords = locus_id.split("::", 1) + colon_idx = coords.rfind(":") chrom = coords[:colon_idx] - rest = coords[colon_idx + 1:] # start-end(strand) - dash_idx = rest.index('-') + rest = coords[colon_idx + 1:] + dash_idx = rest.index("-") start_0 = int(rest[:dash_idx]) - paren_idx = rest.index('(') + paren_idx = rest.index("(") end_0 = int(rest[dash_idx + 1:paren_idx]) - strand = rest[paren_idx + 1:rest.index(')')] + strand = rest[paren_idx + 1:rest.index(")")] return chrom, start_0, end_0, strand except (ValueError, IndexError): return None @@ -79,12 +79,12 @@ def load_intervals(fasta_path): chrom_intervals = defaultdict(list) with open(fasta_path) as fh: for line in fh: - if not line.startswith('>'): + if not line.startswith(">"): continue - header = line[1:].rstrip('\n').split()[0] - if '::' not in header: + header = line[1:].rstrip("\n").split()[0] + if "::" not in header: continue - locus_id = header.split('::', 1)[0] + locus_id = header.split("::", 1)[0] coords = parse_locus_id(header) if coords is None: continue @@ -127,52 +127,86 @@ def find_locus(chrom, pos, chrom_starts, chrom_intervals): def sort_bam_by_cb(bam_path, n_threads): """Sort BAM by CB tag into a tempfile. 
Returns tempfile path (caller must unlink).""" - tmpf = tempfile.NamedTemporaryFile(suffix='.bam', delete=False) + tmpf = tempfile.NamedTemporaryFile(suffix=".bam", delete=False) sorted_path = tmpf.name tmpf.close() - print(f'Sorting BAM by CB tag -> {sorted_path}', file=sys.stderr) + print(f"Sorting BAM by CB tag -> {sorted_path}", file=sys.stderr) result = subprocess.run( - ['samtools', 'sort', '-t', 'CB', '-@', str(n_threads), '-o', sorted_path, bam_path], + ["samtools", "sort", "-t", "CB", "-@", str(n_threads), "-o", sorted_path, bam_path], stderr=subprocess.PIPE ) if result.returncode != 0: os.unlink(sorted_path) sys.stderr.write(result.stderr.decode()) - sys.exit(f'samtools sort failed (exit {result.returncode})') + sys.exit(f"samtools sort failed (exit {result.returncode})") return sorted_path -def emit_cell(cb, cell_data, mode, counts): - """Merge one cell's accumulated data into the sparse counts dict.""" - if mode == 'chromium': - for locus_id, umis in cell_data.items(): - counts[cb][locus_id] = len(umis) - else: - for locus_id, n in cell_data.items(): - counts[cb][locus_id] = n +def process_smartseq2(bam_path, chrom_starts, chrom_intervals, multimapper_mode): + """ + Process SmartSeq2 BAM: cell_id is the prefix of QNAME before _r{n}_. + Reads are sorted by coordinate (interleaved across cells) so we accumulate + all cells simultaneously. 
+ """ + counts = defaultdict(lambda: defaultdict(int)) + all_cbs = set() + all_loci = set() + n_reads = 0 + n_assigned = 0 + proc = subprocess.Popen( + ["samtools", "view", bam_path], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + try: + for raw in proc.stdout: + line = raw.decode() + if line.startswith("@"): + continue + fields = line.split("\t") + if len(fields) < 11: + continue + flag = int(fields[1]) + if flag & 4 or flag & 2048: + continue + n_reads += 1 -def main(): - ap = argparse.ArgumentParser(description=__doc__) - ap.add_argument('--bam', required=True, - help='STARsolo Aligned.sortedByCoord.out.bam') - ap.add_argument('--fasta', required=True, - help='Feature FASTA with headers transcript_id::chrom:start-end(strand)') - ap.add_argument('--mode', required=True, choices=['smartseq2', 'chromium']) - ap.add_argument('--multimapper-mode', default='unique', - choices=['unique', 'multi'], - help='unique: NH=1 reads only; multi: all aligned reads') - ap.add_argument('--threads', type=int, default=1, - help='Threads for samtools sort (default: 1)') - ap.add_argument('--output', required=True) - args = ap.parse_args() + nh = None + for tag in fields[11:]: + if tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) + break + if multimapper_mode == "unique" and nh is not None and nh > 1: + continue - print(f'Loading intervals from {args.fasta}', file=sys.stderr) - chrom_starts, chrom_intervals = load_intervals(args.fasta) - n_loci = sum(len(v) for v in chrom_intervals.values()) - print(f' {n_loci} intervals on {len(chrom_intervals)} chromosomes', file=sys.stderr) + # cell_id is everything before _r{digits}_ in the QNAME + cb = re.sub(r"_r\d+_.*$", "", fields[0]) - sorted_bam = sort_bam_by_cb(args.bam, args.threads) + chrom = fields[2] + pos_0 = int(fields[3]) - 1 + locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + if locus_id is None: + continue + + counts[cb][locus_id] += 1 + all_cbs.add(cb) + all_loci.add(locus_id) + n_assigned += 1 
+ finally: + proc.kill() + proc.wait() + + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def process_chromium(bam_path, chrom_starts, chrom_intervals, multimapper_mode, n_threads): + """ + Process Chromium BAM: sort by CB tag, stream one cell at a time to cap memory. + UMI deduplication per (CB, locus). + """ + sorted_bam = sort_bam_by_cb(bam_path, n_threads) counts = defaultdict(lambda: defaultdict(int)) all_cbs = set() @@ -184,83 +218,108 @@ def main(): cell_data = {} proc = subprocess.Popen( - ['samtools', 'view', sorted_bam], + ["samtools", "view", sorted_bam], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) try: for raw in proc.stdout: line = raw.decode() - if line.startswith('@'): + if line.startswith("@"): continue - fields = line.split('\t') + fields = line.split("\t") if len(fields) < 11: continue flag = int(fields[1]) if flag & 4 or flag & 2048: continue n_reads += 1 - chrom = fields[2] - pos_0 = int(fields[3]) - 1 cb = None ub = None nh = None for tag in fields[11:]: - if tag.startswith('CB:Z:'): - cb = tag[5:].rstrip('\n') - elif tag.startswith('UB:Z:'): - ub = tag[5:].rstrip('\n') - elif tag.startswith('NH:i:'): - nh = int(tag[5:].rstrip('\n')) + if tag.startswith("CB:Z:"): + cb = tag[5:].rstrip("\n") + elif tag.startswith("UB:Z:"): + ub = tag[5:].rstrip("\n") + elif tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) if cb is None: continue - if args.multimapper_mode == 'unique' and nh is not None and nh > 1: + if multimapper_mode == "unique" and nh is not None and nh > 1: continue - locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + locus_id = find_locus(fields[2], int(fields[3]) - 1, chrom_starts, chrom_intervals) if locus_id is None: continue if cb != current_cb: if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in 
cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) current_cb = cb - cell_data = defaultdict(set) if args.mode == 'chromium' else defaultdict(int) + cell_data = defaultdict(set) + cell_data[locus_id].add(ub if ub else str(n_reads)) all_loci.add(locus_id) n_assigned += 1 - - if args.mode == 'chromium': - cell_data[locus_id].add(ub if ub else str(n_reads)) - else: - cell_data[locus_id] += 1 finally: proc.kill() proc.wait() if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) os.unlink(sorted_bam) - print(f' {n_reads} reads processed, {n_assigned} assigned to loci', file=sys.stderr) - print(f' {len(all_cbs)} cells, {len(all_loci)} loci', file=sys.stderr) + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--bam", required=True, + help="STARsolo Aligned.sortedByCoord.out.bam") + ap.add_argument("--fasta", required=True, + help="Feature FASTA with headers transcript_id::chrom:start-end(strand)") + ap.add_argument("--mode", required=True, choices=["smartseq2", "chromium"]) + ap.add_argument("--multimapper-mode", default="unique", + choices=["unique", "multi"], + help="unique: NH=1 reads only; multi: all aligned reads") + ap.add_argument("--threads", type=int, default=1, + help="Threads for samtools sort (Chromium only; default: 1)") + ap.add_argument("--output", required=True) + args = ap.parse_args() + + print(f"Loading intervals from {args.fasta}", file=sys.stderr) + chrom_starts, chrom_intervals = load_intervals(args.fasta) + n_loci = sum(len(v) for v in chrom_intervals.values()) + print(f" {n_loci} intervals on {len(chrom_intervals)} chromosomes", file=sys.stderr) + + if 
args.mode == "smartseq2": + counts, all_cbs, all_loci = process_smartseq2( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode) + else: + counts, all_cbs, all_loci = process_chromium( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode, args.threads) sorted_loci = sorted(all_loci) sorted_cbs = sorted(all_cbs) - with open(args.output, 'w') as fh: - fh.write('feature_id\t' + '\t'.join(sorted_cbs) + '\n') + with open(args.output, "w") as fh: + fh.write("feature_id\t" + "\t".join(sorted_cbs) + "\n") for locus_id in sorted_loci: - row = [str(counts[cb].get(locus_id, 0)) for cb in sorted_cbs] - fh.write(locus_id + '\t' + '\t'.join(row) + '\n') + row = [str(counts.get(cb, {}).get(locus_id, 0)) for cb in sorted_cbs] + fh.write(locus_id + "\t" + "\t".join(row) + "\n") - print(f'wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}', + print(f"wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}", file=sys.stderr) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py index 3bd0ef1..8f868b6 100644 --- a/workflow/scripts/evaluate.py +++ b/workflow/scripts/evaluate.py @@ -17,7 +17,7 @@ from scipy import stats -def load_ground_truth(gt_path, granularity='gene_id'): +def load_ground_truth(gt_path, granularity='gene_id', valid_locus_ids=None): """ Returns: truth: {cell_id: {feature_id: count}} @@ -58,6 +58,11 @@ def load_ground_truth(gt_path, granularity='gene_id'): truth = _dd(lambda: _dd(int)) repeat_meta = {} for r in raw: + # Filter at locus level before aggregating so cross-partition + # gene_ids (e.g. AluSz6 in both genic and intergenic) do not + # inflate truth counts for a subset evaluation. 
+ if valid_locus_ids is not None and r['locus_id'] not in valid_locus_ids: + continue feat = r[key_col] truth[r['cell_id']][feat] += r['true_count'] repeat_meta[feat] = (r['family_id'], r['class_id']) @@ -223,35 +228,40 @@ def main(): help='4-col TSV (transcript_id, gene_id, family_id, class_id). ' 'Expands the feature universe for specificity and restricts ' 'ground truth to this feature_set\'s loci.') + ap.add_argument('--mutation-rate', type=float, default=0.0, + help='Mutation rate used in the simulation (for output labelling)') ap.add_argument('--output-prefix', required=True) args = ap.parse_args() - print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', - file=sys.stderr) - truth, repeat_meta = load_ground_truth(args.ground_truth, granularity=args.granularity) - - print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) - observed, matrix_features = load_count_matrix(args.observed_counts) - - # Build feature universe from locus_map, and optionally filter truth to it. - # This ensures that for genic_repeats / intergenic_repeats, ground truth - # features outside the feature_set are not counted as false negatives. + # Load locus map FIRST so truth is filtered at locus level before aggregation. + # Without this, at gene_id granularity a gene_id such as AluSz6 that has copies + # in both genic and intergenic regions carries intergenic counts into a + # genic_repeats evaluation (and vice-versa), inflating truth vs observed. 
locus_map_features = set() + valid_locus_ids = None if args.locus_map and os.path.exists(args.locus_map): + valid_locus_ids = set() col = {'locus': 0, 'gene_id': 1, 'family_id': 2, 'class_id': 3}.get( args.granularity, 1) with open(args.locus_map) as fh: for line in fh: parts = line.rstrip('\n').split('\t') + if not parts or not parts[0]: + continue + valid_locus_ids.add(parts[0]) # column 0 is always transcript_id if len(parts) > col: locus_map_features.add(parts[col]) - print(f' Locus map loaded: {len(locus_map_features)} features at ' - f'{args.granularity} level', file=sys.stderr) - # Filter ground truth to this feature_set's loci - truth = { - cell: {f: c for f, c in feats.items() if f in locus_map_features} - for cell, feats in truth.items() - } + print(f' Locus map loaded: {len(valid_locus_ids)} loci / ' + f'{len(locus_map_features)} features at {args.granularity} level', + file=sys.stderr) + + print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', + file=sys.stderr) + truth, repeat_meta = load_ground_truth( + args.ground_truth, granularity=args.granularity, valid_locus_ids=valid_locus_ids) + + print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) + observed, matrix_features = load_count_matrix(args.observed_counts) feature_universe = ( matrix_features | @@ -269,6 +279,7 @@ def main(): global_metrics['multimapper_mode'] = args.multimapper_mode global_metrics['feature_set'] = args.feature_set global_metrics['granularity'] = args.granularity + global_metrics['mutation_rate'] = args.mutation_rate bench = load_benchmark(args.benchmark) global_metrics.update(bench) @@ -281,6 +292,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate write_tsv(per_cell_rows, args.output_prefix + '_per_cell_metrics.tsv', fallback_fields=['cell_id', 'pearson_r', 'spearman_r', 
'n_truth_expressed', 'n_observed_expressed', @@ -302,6 +314,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate per_family_rows.append(row) write_tsv(per_family_rows, args.output_prefix + '_per_family_metrics.tsv', fallback_fields=['class_id', 'aligner', 'multimapper_mode', 'feature_set', diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd index d0a0e4e..dd97006 100644 --- a/workflow/scripts/evaluation_report.Rmd +++ b/workflow/scripts/evaluation_report.Rmd @@ -13,7 +13,7 @@ params: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 12, fig.height = 12) library(ggplot2) library(dplyr) @@ -99,10 +99,10 @@ All metrics compare the aligner count matrix against the simulation ground truth - **Specificity**: fraction of truly unexpressed (feature, cell) pairs correctly assigned zero by the aligner. -Rows = aligner (+ multimapper mode), columns = metric. Each bar is a -feature_set x granularity combination. +Rows = metric, columns = feature_set. Each point is one aligner at one granularity; +x-axis = granularity, colour = aligner (+ multimapper mode). 
-```{r global-accuracy, fig.height=14, fig.width=18} +```{r global-accuracy, fig.height=18, fig.width=14} acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", "jaccard", "specificity") @@ -110,25 +110,23 @@ acc_long <- global %>% select(aligner_mode, feature_set, gran_label, all_of(acc_metrics)) %>% pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% filter(!is.na(value)) %>% - mutate( - metric = factor(metric, levels = acc_metrics), - fs_gran = paste0(feature_set, "\n", gran_label) - ) - -n_fg <- length(unique(acc_long$fs_gran)) -pal <- make_palette(n_fg) - -ggplot(acc_long, aes(x = fs_gran, y = value, fill = fs_gran)) + - geom_col(position = "dodge", width = 0.8) + - facet_grid(aligner_mode ~ metric, scales = "free_x") + - scale_fill_manual(values = pal, name = "feature_set / granularity") + - scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.5, 1)) + - labs(x = NULL, y = NULL, + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(acc_long$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(acc_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, title = "global accuracy metrics", - subtitle = "rows = aligner, columns = metric") + + subtitle = "rows = metric, columns = feature_set; x = granularity; colour = aligner") + theme_bw(base_size = 10) + theme(aspect.ratio = 1, - axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + axis.text.x = element_text(angle = 40, hjust = 1, size = 8), strip.text.y = element_text(size = 8), strip.text.x = element_text(size = 8), legend.position = "bottom", @@ -145,9 +143,8 @@ if (nrow(det) > 0) { pal_pr <- make_palette(n_lab) ggplot(det, aes(x = recall, y = precision, - 
colour = aligner_mode, shape = feature_set, size = f1)) + + colour = aligner_mode, shape = feature_set)) + geom_point(alpha = 0.85) + - scale_size_continuous(range = c(2, 7), name = "F1") + scale_colour_manual(values = pal_pr, name = "Aligner (mode)") + scale_shape_manual(values = c(16, 17, 15, 3, 7, 8)[seq_len( length(unique(det$feature_set)))], @@ -155,7 +152,7 @@ if (nrow(det) > 0) { facet_wrap(~gran_label, nrow = 1) + xlim(0, 1) + ylim(0, 1) + geom_abline(slope = 1, intercept = 0, linetype = "dotted", colour = "grey60") + - labs(title = "precision vs recall (size = F1)", + labs(title = "precision vs recall", subtitle = "facets = granularity", x = "Recall", y = "Precision") + theme_bw(base_size = 11) + @@ -194,6 +191,15 @@ ggplot(corr, aes(x = gran_label, y = r, ## log1p RMSE +log1p RMSE is computed over all (feature, cell) pairs in the feature universe. +At locus level there are thousands of features, most with zero counts in both +truth and observed; the dominance of (0, 0) pairs suppresses the average squared +error so RMSE looks low. At class_id level there are only ~10-20 features, each +accumulating large aggregated counts, so per-entry log1p differences are larger +and RMSE rises. The gradient locus < gene_id < family_id < class_id is therefore +expected and not a mistake - lower locus RMSE reflects sparsity, not better +quantification accuracy. 
+ ```{r rmse, fig.height=5} if ("log1p_rmse" %in% names(global) && any(!is.na(global$log1p_rmse))) { n_al2 <- length(unique(global$aligner_mode)) @@ -275,6 +281,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "F1", title = "F1 per repeat class", @@ -290,6 +297,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "pearson r", title = "Pearson r per repeat class") + @@ -322,7 +330,7 @@ if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) { colour = aligner_mode, group = aligner_mode)) + geom_line(linetype = "dashed", linewidth = 0.6) + geom_point(size = 2) + - facet_grid(resource ~ feature_set, scales = "free_y") + + facet_grid(feature_set ~ resource, scales = "free_y") + scale_colour_manual(values = pal_res, name = "Aligner (mode)") + labs(x = "granularity", y = NULL, title = "compute resource usage") + theme_bw(base_size = 10) + diff --git a/workflow/scripts/noise_sweep_report.Rmd b/workflow/scripts/noise_sweep_report.Rmd new file mode 100644 index 0000000..c1a9602 --- /dev/null +++ b/workflow/scripts/noise_sweep_report.Rmd @@ -0,0 +1,303 @@ +--- +title: "repeat element quantification - noise sweep report" +author: "Izaskun Mallona" +date: "`r Sys.Date()`" +output: + html_document: + theme: flatly + toc: true + toc_float: true + code_folding: hide +params: + eval_dirs: "." 
+--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, + fig.width = 14, fig.height = 10) +library(ggplot2) +library(dplyr) +library(tidyr) +library(readr) +library(scales) +library(patchwork) + +make_palette <- function(n) { + if (n <= 8) return(RColorBrewer::brewer.pal(max(3, n), "Dark2")[seq_len(n)]) + colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(n) +} + +eval_dirs <- strsplit(params$eval_dirs, ",")[[1]] +eval_dirs <- trimws(eval_dirs) +``` + +```{r load-data} +load_metrics_from_dirs <- function(dirs, filename) { + purrr::map_dfr(dirs, function(d) { + p <- file.path(d, filename) + if (!file.exists(p)) return(NULL) + read_tsv(p, show_col_types = FALSE) + }) +} + +load_pattern_from_dirs <- function(dirs, pattern) { + purrr::map_dfr(dirs, function(d) { + files <- list.files(d, pattern = pattern, full.names = TRUE) + if (length(files) == 0) return(NULL) + purrr::map_dfr(files, ~ read_tsv(.x, show_col_types = FALSE)) + }) +} + +global <- load_metrics_from_dirs(eval_dirs, "summary_global_metrics.tsv") + +num_cols <- c("pearson_r", "spearman_r", "log1p_rmse", + "precision", "recall", "f1", "jaccard", "specificity", + "n_cells", "n_features", "mutation_rate", + "wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb") +for (col in num_cols) { + if (col %in% names(global)) + global[[col]] <- suppressWarnings(as.numeric(global[[col]])) +} + +if (!"granularity" %in% names(global)) global$granularity <- "gene_id" +if (!"feature_set" %in% names(global)) global$feature_set <- "repeats" +if (!"multimapper_mode" %in% names(global)) global$multimapper_mode <- "unique" +if (!"mutation_rate" %in% names(global)) global$mutation_rate <- NA_real_ + +global <- global %>% + mutate( + aligner_mode = paste0(aligner, " (", multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%") + ) + +per_cell <- 
tryCatch(load_pattern_from_dirs(eval_dirs, "_per_cell_metrics\\.tsv$"), + error = function(e) NULL) +per_family <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_family_metrics\\.tsv$"), + error = function(e) NULL) + +for (col in c("pearson_r", "spearman_r", "mutation_rate")) { + if (!is.null(per_cell) && col %in% names(per_cell)) + per_cell[[col]] <- suppressWarnings(as.numeric(per_cell[[col]])) + if (!is.null(per_family) && col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) +} + +if (!is.null(per_family) && nrow(per_family) > 0) { + for (col in c("f1", "precision", "recall", "jaccard", "specificity")) { + if (col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) + } + if (!"multimapper_mode" %in% names(per_family)) per_family$multimapper_mode <- "unique" + per_family <- per_family %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")")) +} +``` + +--- + +This report combines evaluation results across multiple noise levels (simulated nucleotide +substitution rates). Each `eval_dirs` entry corresponds to one pipeline run with a distinct +`mutation_rate` in the simulation config. The `mutation_rate` column in the metrics TSVs +identifies each run. + +Noise levels found in the data: `r sort(unique(global$mutation_rate))` + +--- + +## metric degradation by noise level + +How each aligner's accuracy changes as the per-base mutation rate increases. +x-axis = mutation rate, y-axis = metric value, colour = aligner (+ multimapper mode). 
+ +```{r degradation-main, fig.height=20, fig.width=14} +acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity") + +deg <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, all_of(acc_metrics)) %>% + pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(deg$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(deg, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linewidth = 0.6) + + facet_grid(metric ~ feature_set + gran_label, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + labs(x = "mutation rate", y = NULL, + title = "metric degradation with increasing noise", + subtitle = "rows = metric; columns = feature_set x granularity; x = mutation rate") + + theme_bw(base_size = 9) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 7), + legend.position = "bottom", + legend.text = element_text(size = 8)) +``` + +## pearson r and F1 — noise overview + +Summary view focusing on the two most interpretable metrics. 
+ +```{r overview-two-metrics, fig.height=10, fig.width=14} +ov <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, pearson_r, f1) %>% + pivot_longer(c(pearson_r, f1), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) + +n_al2 <- length(unique(ov$aligner_mode)) +pal2 <- make_palette(n_al2) + +ggplot(ov, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 3, alpha = 0.85) + + geom_line(linewidth = 0.8) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal2, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + ylim(0, 1) + + labs(x = "mutation rate", y = NULL, + title = "Pearson r and F1 across noise levels", + subtitle = "rows = metric, columns = feature_set; lines connect same aligner across mutation rates") + + theme_bw(base_size = 11) + + theme(axis.text.x = element_text(angle = 25, hjust = 1), + legend.position = "bottom") +``` + +## global accuracy metrics per noise level + +Full metric grid from the standard report, faceted by mutation rate. 
+ +```{r global-faceted, fig.height=22, fig.width=16} +full_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity", "log1p_rmse") + +full_long <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, noise_label, + all_of(full_metrics)) %>% + pivot_longer(all_of(full_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = full_metrics)) + +n_al3 <- length(unique(full_long$aligner_mode)) +pal3 <- make_palette(n_al3) + +ggplot(full_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2, alpha = 0.8) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ noise_label + feature_set, scales = "free_y") + + scale_colour_manual(values = pal3, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, + title = "global accuracy metrics", + subtitle = "rows = metric; columns = mutation rate x feature_set; x = granularity") + + theme_bw(base_size = 8) + + theme(aspect.ratio = 1, + axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 6), + legend.position = "bottom", + legend.text = element_text(size = 7)) +``` + +## per-cell correlations by noise level + +```{r per-cell-noise, fig.height=10, fig.width=14} +if (!is.null(per_cell) && nrow(per_cell) > 0 && "mutation_rate" %in% names(per_cell)) { + pc <- per_cell %>% + mutate(across(c(pearson_r, spearman_r), ~ suppressWarnings(as.numeric(.)))) %>% + filter(!is.na(aligner), !is.na(mutation_rate)) + + if (!"multimapper_mode" %in% names(pc)) pc$multimapper_mode <- "unique" + if (!"feature_set" %in% names(pc)) pc$feature_set <- "repeats" + + pc <- pc %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = 
paste0(mutation_rate * 100, "%")) + + pc_long <- pc %>% + pivot_longer(c(pearson_r, spearman_r), names_to = "metric", values_to = "r") %>% + filter(!is.na(r)) + + n_al4 <- length(unique(pc_long$aligner_mode)) + pal4 <- make_palette(n_al4) + + ggplot(pc_long, aes(x = aligner_mode, y = r, fill = aligner_mode)) + + geom_violin(alpha = 0.7, draw_quantiles = c(0.25, 0.5, 0.75)) + + geom_jitter(width = 0.1, size = 0.5, alpha = 0.4) + + facet_grid(metric + gran_label ~ noise_label + feature_set) + + scale_fill_manual(values = pal4) + + labs(x = NULL, y = "correlation", + title = "per-cell correlations by noise level", + subtitle = "rows = metric x granularity; columns = mutation rate x feature_set") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), + legend.position = "none", + strip.text = element_text(size = 6)) +} else { + cat("No per-cell metrics with mutation_rate column found.\n") +} +``` + +## per-repeat-class F1 by noise level + +```{r per-family-noise, fig.height=14, fig.width=14} +if (!is.null(per_family) && nrow(per_family) > 0 && + "class_id" %in% names(per_family) && "mutation_rate" %in% names(per_family)) { + + pf <- per_family %>% + filter(!is.na(class_id), class_id != "unknown", !is.na(mutation_rate)) %>% + mutate(gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%")) + + n_al5 <- length(unique(pf$aligner_mode)) + pal5 <- make_palette(n_al5) + + p_f1 <- ggplot(pf, aes(x = class_id, y = f1, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "F1", + title = "F1 per repeat class by noise level", + subtitle = "rows = granularity; columns = mutation rate x feature_set") 
+ + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "bottom", + legend.text = element_text(size = 7), + strip.text = element_text(size = 6)) + + p_r <- ggplot(pf, aes(x = class_id, y = pearson_r, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "pearson r") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "none", + strip.text = element_text(size = 6)) + + p_f1 / p_r +} else { + cat("No per-family metrics with class_id and mutation_rate found.\n") +} +``` + +--- +*Generated by [noise_sweep_report.Rmd](../workflow/scripts/noise_sweep_report.Rmd)* diff --git a/workflow/scripts/normalize_starsolo.py b/workflow/scripts/normalize_starsolo.py index 3bba451..5091baf 100644 --- a/workflow/scripts/normalize_starsolo.py +++ b/workflow/scripts/normalize_starsolo.py @@ -27,11 +27,11 @@ def read_lines(path): return [line.rstrip('\n') for line in fh] -def load_starsolo_raw(raw_dir): +def load_starsolo_raw(raw_dir, mtx_name='matrix.mtx'): barcodes = read_lines(os.path.join(raw_dir, 'barcodes.tsv')) feature_lines = read_lines(os.path.join(raw_dir, 'features.tsv')) feature_ids = [line.split('\t')[0] for line in feature_lines] - mat = scipy.io.mmread(os.path.join(raw_dir, 'matrix.mtx')).tocsc() + mat = scipy.io.mmread(os.path.join(raw_dir, mtx_name)).tocsc() return barcodes, feature_ids, mat @@ -94,11 +94,16 @@ def main(): ap.add_argument('--output', required=True, help='Output TSV path') ap.add_argument('--locus-map', default=None, help='TSV: transcript_id, gene_id, family_id, class_id (no header)') + ap.add_argument('--multimapper-mode', default='unique', + choices=['unique', 'multi']) 
ap.add_argument('--granularity', default='gene_id', choices=['gene_id', 'family_id', 'class_id']) args = ap.parse_args() - barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir) + em_mtx = os.path.join(args.raw_dir, 'UniqueAndMult-EM.mtx') + mtx_name = 'UniqueAndMult-EM.mtx' if args.multimapper_mode == 'multi' and os.path.exists(em_mtx) else 'matrix.mtx' + print('Using matrix: {}'.format(mtx_name), file=sys.stderr) + barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir, mtx_name) print('{} features, {} barcodes'.format(len(feature_ids), len(barcodes)), file=sys.stderr) gene_to_group = None diff --git a/workflow/scripts/simulate_reads.py b/workflow/scripts/simulate_reads.py index d2b8d55..b322655 100644 --- a/workflow/scripts/simulate_reads.py +++ b/workflow/scripts/simulate_reads.py @@ -111,14 +111,14 @@ def reverse_complement(seq): return seq.translate(table)[::-1] -def sample_subseq(seq, read_length, rng): +def sample_subseq(seq, read_length, rng, mutation_rate=0.001): if len(seq) <= read_length: return (seq + 'N' * read_length)[:read_length] offset = rng.randint(0, len(seq) - read_length) read = seq[offset:offset + read_length] bases = list(read) for i in range(len(bases)): - if rng.random() < 0.001: + if rng.random() < mutation_rate: bases[i] = rng.choice('ACGT') return ''.join(bases) @@ -178,7 +178,7 @@ def build_chrom_locus_coords(cell_plan): return chrom_loci -def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): +def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -200,7 +200,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): fq = cell_handles[cell_id] read_idx = cell_read_counts[cell_id] for i in range(count): - read = sample_subseq(repeat_seq, read_length, rng) + read = sample_subseq(repeat_seq, read_length, rng, 
mutation_rate) fq.write(f'@{cell_id}_r{read_idx + i}_{safe_id(locus_id)}\n' f'{read}\n+\n{make_qual(len(read))}\n') cell_read_counts[cell_id] += count @@ -215,7 +215,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): return cell_fastq_paths, ground_truth -def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng): +def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -263,7 +263,7 @@ def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_le used_umis.add(umi) read_id = f'r{total_reads}_{barcode[:8]}_{umi}_{safe_id(locus_id)}' r1_seq = barcode + umi - r2_seq = sample_subseq(repeat_seq, read_length, rng) + r2_seq = sample_subseq(repeat_seq, read_length, rng, mutation_rate) r1f.write(f'@{read_id}\n{r1_seq}\n+\n{make_qual(len(r1_seq))}\n') r2f.write(f'@{read_id}\n{r2_seq}\n+\n{make_qual(len(r2_seq))}\n') total_reads += 1 @@ -323,6 +323,8 @@ def main(): ap.add_argument('--max-repeats-per-chrom', type=int, default=None, help='Cap intervals per chromosome to limit memory use on full genomes') ap.add_argument('--seed', type=int, default=42) + ap.add_argument('--mutation-rate', type=float, default=0.001, + help='Per-base substitution probability for simulated reads (default 0.001)') args = ap.parse_args() rng = random.Random(args.seed) @@ -353,7 +355,7 @@ def main(): if args.mode == 'smartseq2': print(f'Simulating SmartSeq2 reads (streaming FASTA by chrom)', file=sys.stderr) cell_fastq_paths, ground_truth = simulate_smartseq2( - args.fasta, cell_plan, args.read_length, args.outdir, rng) + args.fasta, cell_plan, args.read_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_id') manifest_path = 
os.path.join(args.outdir, 'manifest.tsv') write_smartseq2_manifest(cell_fastq_paths, manifest_path) @@ -363,7 +365,7 @@ def main(): print(f'Simulating Chromium reads (streaming FASTA by chrom)', file=sys.stderr) (r1, r2), ground_truth = simulate_chromium( args.fasta, cell_plan, args.read_length, - args.cb_length, args.umi_length, args.outdir, rng) + args.cb_length, args.umi_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_barcode') print(f'R1: {r1}', file=sys.stderr) print(f'R2: {r2}', file=sys.stderr)