Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# coverage.py configuration.
# [run] limits measurement to the workflow's Python scripts.
[run]
source = workflow/scripts

# Scripts requiring subprocess calls to samtools/BAM files or real aligner
# output matrices cannot be meaningfully unit-tested without bioinformatics
# infrastructure. Omit them so the coverage threshold applies only to
# logic that can be exercised with synthetic data.
omit =
workflow/scripts/count_pseudo_genome.py
workflow/scripts/count_pseudo_genome_chromium.py
workflow/scripts/normalize_starsolo.py
workflow/scripts/normalize_alevin_chromium.py
workflow/scripts/normalize_alevin_smartseq2.py
workflow/scripts/normalize_kallisto_chromium.py
workflow/scripts/normalize_kallisto_smartseq2_granular.py
workflow/scripts/merge_featurecounts.py
workflow/scripts/tag_bam_chromium.py
workflow/scripts/split_gtf_chunks.py

# [report] lists source-line patterns that are left out of the coverage
# totals: explicit "pragma: no cover" markers and the script entry-point
# guard. The patterns are regular expressions, so each "." around __main__
# matches either quote character.
[report]
exclude_lines =
pragma: no cover
if __name__ == .__main__.:
125 changes: 125 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# CI entry point: runs on pull requests targeting master or dev, and on
# manual dispatch from the Actions tab.
name: Tests

on:
  workflow_dispatch:
  pull_request:
    branches: [master, dev]

# Least privilege: the jobs in this workflow only check out the repository,
# so the workflow token is restricted to read-only access to its contents.
permissions:
  contents: read

jobs:
# Unit and integration tests. Needs only Python plus the pip-installed
# packages below (pytest, pytest-cov, scipy); no conda environment is created.
unit-and-integration:
name: Unit and integration tests
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: Install Python dependencies
run: pip install pytest pytest-cov scipy

# Runs test/unit/ and test/integration/ with coverage measured over
# workflow/scripts. Uncovered lines are listed in the terminal report and
# the step fails when total coverage is below 70%.
- name: Run unit and integration tests
run: |
pytest test/unit/ test/integration/ \
-v \
--tb=short \
--cov=workflow/scripts \
--cov-report=term-missing \
--cov-fail-under=70

# Snakemake dry-run. Installs snakemake via micromamba but does not run any rules.
# Every snakemake call below passes --dry-run, so only DAG construction for
# each config is checked.
snakemake-dryrun:
name: Snakemake dry-run
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

# The environment also carries pytest, pytest-cov and scipy because the last
# step of this job runs pytest inside it.
- name: Set up micromamba
uses: mamba-org/setup-micromamba@v1
with:
micromamba-version: latest
environment-name: snakemake
create-args: >-
-c bioconda
-c conda-forge
python=3.11
snakemake>=8
pytest
pytest-cov
scipy
init-shell: bash

# NOTE(review): the steps below use a login shell (bash -el), presumably so
# that the micromamba environment initialised above is active; confirm
# against the setup-micromamba documentation.
- name: Dry-run main Snakefile with SmartSeq2 config
shell: bash -el {0}
working-directory: workflow
run: |
snakemake \
--configfile configs/simulation_smartseq2.yaml \
--dry-run \
--rerun-triggers mtime \
--quiet --cores 2

- name: Dry-run main Snakefile with Chromium config
shell: bash -el {0}
working-directory: workflow
run: |
snakemake \
--configfile configs/simulation_chromium.yaml \
--dry-run \
--rerun-triggers mtime \
--quiet --cores 2

# Same dry-run, but against the test Snakefile and its negative control
# config, both addressed relative to the workflow/ working directory.
- name: Dry-run test Snakefile with negative control config
shell: bash -el {0}
working-directory: workflow
run: |
snakemake \
-s ../test/workflow/Snakefile_test \
--configfile ../test/workflow/configs/test_negative_control.yaml \
--dry-run \
--rerun-triggers mtime \
--quiet --cores 2

# Runs from the repository root (no working-directory) and selects only the
# tests carrying the "workflow" marker.
- name: Run workflow dry-run pytest tests
shell: bash -el {0}
run: |
pytest test/workflow/test_snakemake_dryrun.py \
-v --tb=short -m workflow

# Full negative control run. Requires reference data on the runner and a
# conda environment. Runs only when the workflow is triggered manually
# (workflow_dispatch); it is skipped on pull requests.
negative-control-run:
name: Negative control full run
runs-on: ubuntu-latest
if: |
github.event_name == 'workflow_dispatch'
steps:
- uses: actions/checkout@v4

- name: Set up micromamba
uses: mamba-org/setup-micromamba@v1
with:
micromamba-version: latest
environment-name: snakemake
create-args: >-
-c bioconda
-c conda-forge
python=3.11
snakemake>=8
init-shell: bash

# Unlike the dry-run job, this executes the rules (--use-conda, no --dry-run).
# NOTE(review): the environment above installs only python and snakemake,
# while --conda-frontend mamba asks snakemake to use a mamba executable.
# Confirm that one is available on the runner, or switch the frontend.
- name: Run negative control workflow
shell: bash -el {0}
working-directory: workflow
run: |
snakemake \
-s ../test/workflow/Snakefile_test \
--configfile ../test/workflow/configs/test_negative_control.yaml \
--use-conda \
--cores 2 \
--conda-frontend mamba
124 changes: 124 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# bash is required: the command prefixes below use `source`
SHELL := /bin/bash
.DEFAULT_GOAL := all

# Remove a file target whose recipe failed, so a partially written output is
# never mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

# cores handed to every snakemake invocation; override with `make CORES=N`
CORES ?= 10
# directory the snakemake / Rscript commands are run from
WF := workflow
# directory (relative to the project root) that receives the results
RESULTS := results

# conda bootstrap script and environment name; the defaults match a stock
# miniconda install with an env called "snakemake". Override on the command
# line when conda lives elsewhere, e.g.
#   make CONDA_PROFILE=~/miniforge3/etc/profile.d/conda.sh SNAKEMAKE_ENV=smk
CONDA_PROFILE ?= ~/miniconda3/etc/profile.d/conda.sh
SNAKEMAKE_ENV ?= snakemake

# command prefix that activates the conda environment; it ends in `&&` so the
# actual command can be appended directly after it
CONDA_RUN := source $(CONDA_PROFILE) && conda activate $(SNAKEMAKE_ENV) &&
# snakemake / Rscript, each run from inside $(WF) with the environment active
SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime
RSCRIPT := cd $(WF) && $(CONDA_RUN) Rscript

# absolute path of the evaluation dir of one noise sweep run
#   usage: $(call noise_edir,<technology>,<noise percentage>)
noise_edir = $(CURDIR)/$(RESULTS)/simulation_$(1)_noise_$(2)pct/evaluation

# noise sweep eval dirs: absolute paths, comma-separated for the Rmd param.
# Kept on one line each, since a continuation line would add a space to the list.
NOISE_EDIRS_SS2 := $(call noise_edir,smartseq2,0),$(call noise_edir,smartseq2,1),$(call noise_edir,smartseq2,5),$(call noise_edir,smartseq2,10)
NOISE_EDIRS_CHR := $(call noise_edir,chromium,0),$(call noise_edir,chromium,1),$(call noise_edir,chromium,5),$(call noise_edir,chromium,10)

# rendered HTML reports (paths relative to the project root)
NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html
NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html

# Command-style targets (no file of that name is produced), one group per
# line. The report HTML files are real file targets and are not listed.
.PHONY: all help
.PHONY: simulation_smartseq2 simulation_chromium
.PHONY: noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct
.PHONY: noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct
.PHONY: report_noise_smartseq2 report_noise_chromium reports_noise

# -----------------------------------------------------------------------
# base simulations
#   each target runs snakemake with the config file named after it:
#   <target>  ->  configs/<target>.yaml
# -----------------------------------------------------------------------

simulation_smartseq2 simulation_chromium:
	$(SM) --configfile configs/$@.yaml

# -----------------------------------------------------------------------
# noise sweep — SmartSeq2
#   note: genome refs are reused from the base simulation_smartseq2 run;
#   alignment indices are shared via results/shared/ (never rebuilt)
#
#   target -> config mapping:
#   noise_smartseq2_<N>pct -> configs/simulation_smartseq2_noise_<N>pct.yaml
# -----------------------------------------------------------------------

noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct:
	$(SM) --configfile configs/$(subst noise_smartseq2_,simulation_smartseq2_noise_,$@).yaml

# umbrella target: all four SmartSeq2 noise levels
noise_smartseq2: noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct

# -----------------------------------------------------------------------
# noise sweep — Chromium
#
#   target -> config mapping:
#   noise_chromium_<N>pct -> configs/simulation_chromium_noise_<N>pct.yaml
# -----------------------------------------------------------------------

noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct:
	$(SM) --configfile configs/$(subst noise_chromium_,simulation_chromium_noise_,$@).yaml

# umbrella target: all four Chromium noise levels
noise_chromium: noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct

# -----------------------------------------------------------------------
# noise sweep reports
#   file targets: (re)rendered when the HTML does not yet exist or is older
#   than the Rmd template; use  make -B report_noise_smartseq2  to force a
#   re-render (e.g. after the noise sweep results themselves have changed)
# -----------------------------------------------------------------------

# Rmd template as seen from the project root. The render command itself runs
# inside $(WF), which is why the recipe refers to it as scripts/...
NOISE_REPORT_RMD := $(WF)/scripts/noise_sweep_report.Rmd

# per-report parameter: the comma-separated evaluation dirs handed to the Rmd.
# A target-specific variable is used rather than $(call ...), because the
# commas in the list would be split into separate call arguments.
$(NOISE_REPORT_SS2): EVAL_DIRS := $(NOISE_EDIRS_SS2)
$(NOISE_REPORT_CHR): EVAL_DIRS := $(NOISE_EDIRS_CHR)

# One shared recipe for both reports: $@ is the HTML being rendered and $(@D)
# its directory. The output path is made absolute with $(CURDIR) because
# $(RSCRIPT) changes into $(WF) before rendering.
$(NOISE_REPORT_SS2) $(NOISE_REPORT_CHR): $(NOISE_REPORT_RMD)
	mkdir -p $(@D)
	$(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \
		output_file = '$(CURDIR)/$@', \
		params = list(eval_dirs = '$(EVAL_DIRS)'))"

# convenience names for the file targets above
report_noise_smartseq2: $(NOISE_REPORT_SS2)
report_noise_chromium: $(NOISE_REPORT_CHR)
reports_noise: report_noise_smartseq2 report_noise_chromium

# -----------------------------------------------------------------------
# all
#   The prerequisites are listed in the order they have to run: the base
#   simulations first (the noise sweeps need them), then the noise sweeps,
#   then the reports that read the noise sweep evaluation dirs.
# -----------------------------------------------------------------------

# That order is only honoured when make runs one target at a time, so
# parallel execution is disabled for this Makefile even under `make -j`.
# Parallelism is delegated to snakemake through --cores $(CORES) instead.
.NOTPARALLEL:

all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise

# -----------------------------------------------------------------------
# help
#   Prints a usage summary of the available targets. $(CORES) and the two
#   report paths are expanded by make, so the text shows the values in
#   effect for this invocation. The leading @ keeps make from echoing the
#   echo commands themselves.
# -----------------------------------------------------------------------

help:
@echo "Usage: make [target] [CORES=N] (default CORES=$(CORES))"
@echo ""
@echo "Base simulation runs:"
@echo " simulation_smartseq2 SmartSeq2 base run (mutation_rate 0.1%)"
@echo " simulation_chromium Chromium base run (mutation_rate 0.1%)"
@echo ""
@echo "Noise sweep runs (base simulation must have been run first):"
@echo " noise_smartseq2 all 4 noise levels for SmartSeq2"
@echo " noise_smartseq2_{0,1,5,10}pct individual SmartSeq2 noise level"
@echo " noise_chromium all 4 noise levels for Chromium"
@echo " noise_chromium_{0,1,5,10}pct individual Chromium noise level"
@echo ""
@echo "Reports (file targets; use -B to force re-render):"
@echo " report_noise_smartseq2 $(NOISE_REPORT_SS2)"
@echo " report_noise_chromium $(NOISE_REPORT_CHR)"
@echo " reports_noise both noise sweep reports"
@echo ""
@echo " all run everything in sequence"
64 changes: 52 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Repeat element quantification in bulk and single cells
# Repetitive element quantification in bulk and single-cell RNA-seq

## Status

Expand Down Expand Up @@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl

## Running

### Via Makefile (recommended)

A `Makefile` at the project root orchestrates all configs and renders reports.

```
# Run everything (all simulations + noise sweep + reports)
make

# Individual targets
make simulation_smartseq2 # SmartSeq2 base simulation
make simulation_chromium # Chromium base simulation
make noise_smartseq2 # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate)
make noise_chromium # Chromium noise sweep
make report_noise_smartseq2 # Render SmartSeq2 noise sweep HTML report
make report_noise_chromium # Render Chromium noise sweep HTML report
make help # List all targets
```

Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda
environment automatically.

### Manually

```
source ~/miniconda3/etc/profile.d/conda.sh
conda activate snakemake
cd workflow
snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10
snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10
snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime
snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 --rerun-triggers mtime
```

The report is written to `{base}/evaluation/evaluation_report.html` as defined in
the config file.
The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`.

## Configuration

Two reference configs are provided under `workflow/configs/`:
Simulation configs are under `workflow/configs/`:

| File | Technology | Cells | Expressed loci/cell | Mutation rate |
|---|---|---|---|---|
| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% |
| `simulation_chromium.yaml` | 10x Chromium | 500 | 1000 | 0.1% |
| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% |
| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% |
| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% |
| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% |
| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% |
| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% |
| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% |
| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% |

| File | Technology | Cells | Chr subset |
|---|---|---|---|
| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 |
| `simulation_chromium.yaml` | 10x Chromium | 50 | chr10 |
Unused real-data configs have been moved to `workflow/configs/old/`.

Key parameters:

- `base`: output directory for all results.
- `indices_base`: directory for shared aligner indices (can be shared across runs).
- `base`: run-specific output directory.
- `indices_base`: shared directory for aligner indices and decompressed references.
All noise-sweep configs share the same `indices_base` so indices are built once.
- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads.
- `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`).
- `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`).
- `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments).
Expand All @@ -86,6 +120,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files.

See workflow/methods.md for full details.

## Testing

Unit tests, integration tests, and Snakemake dry-run tests are in the
`test/` directory. See [test/README.md](test/README.md) for details on
design, coverage, and how to run the tests.

## Contact

izaskun dot mallona dot work at gmail.com
Expand Down
6 changes: 6 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# pytest configuration for the repository.
[pytest]
# tests are collected from test/ when no path is given on the command line
testpaths = test
# options applied to every run: verbose output, short tracebacks
addopts = -v --tb=short
# custom markers used to select test groups with -m
markers =
slow: marks tests as slow (requires bioinformatics tools or large data)
workflow: marks tests that require snakemake
Loading
Loading