From d1c3ed8c8b76124148b6ab0d6556a6cc65ce58dd Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Sun, 22 Mar 2026 16:15:47 +0100 Subject: [PATCH 01/12] Fix evaluation metric, draft a testing framework --- .github/workflows/tests.yml | 117 ++++++ README.md | 8 +- test/README.md | 209 ++++++++++ test/conftest.py | 13 + test/integration/__init__.py | 0 .../test_feature_set_consistency.py | 393 ++++++++++++++++++ test/integration/test_metamorphic.py | 264 ++++++++++++ test/unit/__init__.py | 0 .../unit/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 145 bytes ...uild_rmsk_gtf.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 14887 bytes ...test_evaluate.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 55247 bytes ...parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 14535 bytes ...imulate_reads.cpython-312-pytest-7.4.4.pyc | Bin 0 -> 45973 bytes test/unit/test_build_rmsk_gtf.py | 110 +++++ test/unit/test_evaluate.py | 368 ++++++++++++++++ test/unit/test_parse_gtf_t2g.py | 122 ++++++ test/unit/test_simulate_reads.py | 299 +++++++++++++ test/workflow/Snakefile_test | 191 +++++++++ .../configs/test_negative_control.yaml | 51 +++ test/workflow/envs/test_evaluation.yaml | 9 + test/workflow/test_snakemake_dryrun.py | 92 ++++ workflow/scripts/evaluate.py | 44 +- 22 files changed, 2271 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/tests.yml create mode 100644 test/README.md create mode 100644 test/conftest.py create mode 100644 test/integration/__init__.py create mode 100644 test/integration/test_feature_set_consistency.py create mode 100644 test/integration/test_metamorphic.py create mode 100644 test/unit/__init__.py create mode 100644 test/unit/__pycache__/__init__.cpython-312.pyc create mode 100644 test/unit/__pycache__/test_build_rmsk_gtf.cpython-312-pytest-7.4.4.pyc create mode 100644 test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc create mode 100644 test/unit/__pycache__/test_parse_gtf_t2g.cpython-312-pytest-7.4.4.pyc create mode 100644 
test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc create mode 100644 test/unit/test_build_rmsk_gtf.py create mode 100644 test/unit/test_evaluate.py create mode 100644 test/unit/test_parse_gtf_t2g.py create mode 100644 test/unit/test_simulate_reads.py create mode 100644 test/workflow/Snakefile_test create mode 100644 test/workflow/configs/test_negative_control.yaml create mode 100644 test/workflow/envs/test_evaluation.yaml create mode 100644 test/workflow/test_snakemake_dryrun.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..ea12b79 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,117 @@ +name: Tests + +on: + workflow_dispatch: + push: + branches: [master, dev] + pull_request: + branches: [master, dev] + +jobs: + # Unit and integration tests. Needs Python and scipy + unit-and-integration: + name: Unit and integration tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Python dependencies + run: pip install pytest pytest-cov scipy + + - name: Run unit and integration tests + run: | + pytest test/unit/ test/integration/ \ + -v \ + --tb=short \ + --cov=workflow/scripts \ + --cov-report=term-missing \ + --cov-fail-under=70 + + # Snakemake dry-run. Installs snakemake via micromamba but does not run any rules. 
+ snakemake-dryrun: + name: Snakemake dry-run + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + python=3.11 + snakemake>=8 + init-shell: bash + + - name: Dry-run main Snakefile with SmartSeq2 config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_smartseq2.yaml \ + --dry-run \ + --quiet + + - name: Dry-run main Snakefile with Chromium config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + --configfile configs/simulation_chromium.yaml \ + --dry-run \ + --quiet + + - name: Dry-run test Snakefile with negative control config + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run \ + --quiet + + - name: Run workflow dry-run pytest tests + shell: bash -el {0} + run: | + pytest test/workflow/test_snakemake_dryrun.py \ + -v --tb=short -m workflow + + # Full negative control run. Requires reference data on the runner and an env. 
+ # triggered manually + negative-control-run: + name: Negative control full run + runs-on: ubuntu-latest + if: | + github.event_name == 'workflow_dispatch' + steps: + - uses: actions/checkout@v4 + + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: latest + environment-name: snakemake + create-args: >- + python=3.11 + snakemake>=8 + init-shell: bash + + - name: Run negative control workflow + shell: bash -el {0} + working-directory: workflow + run: | + snakemake \ + -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 4 \ + --conda-frontend mamba diff --git a/README.md b/README.md index f8532c2..6bc5ef0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Repeat element quantification in bulk and single cells +# Repetitive element quantification in bulk and single-cell RNA-seq ## Status @@ -86,6 +86,12 @@ per-(barcode, locus) counts directly, without splitting into per-cell BAM files. See workflow/methods.md for full details. +## Testing + +Unit tests, integration tests, and Snakemake dry-run tests are in the +`test/` directory. See [test/README.md](test/README.md) for details on +design, coverage, and how to run the tests. + ## Contact izaskun dot mallona dot work at gmail.com diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..d33baf9 --- /dev/null +++ b/test/README.md @@ -0,0 +1,209 @@ +# Tests + +This directory contains the test suite for the repeats pipeline. Tests are +run with pytest and cover the Python scripts in workflow/scripts/ as well as +the correctness of the evaluation logic end-to-end. + +See also: the main [README.md](../README.md) for pipeline usage. + +## Running the tests + +All commands below assume the repository root as the working directory unless +stated otherwise. 
+ +### Unit and integration tests + +Install dependencies: + +``` +pip install pytest pytest-cov scipy +``` + +Run unit and integration tests: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ -v +``` + +Run with coverage: + +``` +cd /path/to/repeats +pytest test/unit/ test/integration/ --cov=workflow/scripts --cov-report=term-missing +``` + +### Snakemake dry-run tests + +These require snakemake in PATH and are tagged with the `workflow` marker. +Run via pytest (from the repo root): + +``` +cd /path/to/repeats +pytest test/workflow/test_snakemake_dryrun.py -v -m workflow +``` + +Or run the dry-runs directly with snakemake. The working directory must be +`workflow/` for the production Snakefile, and paths to the test Snakefile and +configs are given relative to that directory: + +``` +cd /path/to/repeats/workflow + +snakemake --configfile configs/simulation_smartseq2.yaml --dry-run --quiet + +snakemake --configfile configs/simulation_chromium.yaml --dry-run --quiet + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run --quiet +``` + +### Negative control workflow (full run) + +Requires reference data accessible from the runner and snakemake with conda +support. 
Run from the `workflow/` directory: + +``` +cd /path/to/repeats/workflow + +snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda \ + --cores 4 \ + --conda-frontend mamba +``` + + +## Directory layout + +``` +test/ + unit/ unit tests for individual script functions + integration/ end-to-end tests using synthetic input files + workflow/ snakemake dry-run tests and the negative control workflow + Snakefile_test test workflow that reuses the production snmk modules + configs/ config for the negative control run + envs/ conda environment for test workflow rules +``` + + +## Unit tests (test/unit/) + +Each file targets one script in workflow/scripts/. + +test_evaluate.py covers: +- pearson_r, spearman_r: known values, edge cases (too few points, constant vector) +- log1p_rmse: perfect recovery gives 0.0, symmetry, empty input +- detection_metrics: all-correct, no-overlap, partial overlap, specificity +- build_aligned_vectors: zero-filling for missing features, cell ordering +- load_ground_truth: all four granularities (locus, gene_id, family_id, class_id) + and the valid_locus_ids filter that was added to fix the genic/intergenic + partitioning bug +- load_count_matrix: feature tracking including zero-count rows +- compute_metrics_for_subset: perfect observer and null observer + +test_simulate_reads.py covers: +- reverse_complement: known sequences, N passthrough, involution property +- parse_gtf_attribute: quoted and unquoted values, missing key +- extract_repeat_sequence: N content filter, strand handling, length filter +- sample_subseq: output length, N padding for short sequences +- sample_count_geometric: always >= 1, respects max_count, approximate mean +- build_cell_plan and build_locus_to_cells: output structure +- write_ground_truth: TSV format and row count +- parse_gtf_repeats_by_chrom: minimal GTF parsing, length filter, chrom filter + +test_build_rmsk_gtf.py covers: +- 
make_gtf_attributes: format and dup index incrementing +- convert_rmsk_to_gtf: basic output, dup index, length filter, chrom filter, + 1-based GTF coordinates, gzip output + +test_parse_gtf_t2g.py covers: +- parse_attrs: quoted values, missing keys, empty string +- main() with sys.argv patching: 2-col and 4-col output, correct mapping, + deduplication, feature-type filtering + + +## Integration tests (test/integration/) + +test_feature_set_consistency.py documents and tests a bug that caused lower +evaluation metrics for genic_repeats and intergenic_repeats at gene_id +granularity compared to the full repeats set, even for a hypothetically +perfect observer. + +Root cause: evaluate.py was aggregating ground truth counts across all loci +for a given gene_id (including both genic and intergenic copies), then +filtering at the gene_id level. For a gene_id like AluSz6 that has copies +in both contexts, the genic truth count included intergenic loci, inflating +truth relative to observed and degrading Pearson correlation and recall. + +The fix adds locus-level filtering before aggregation (see valid_locus_ids +in load_ground_truth). The tests in this file verify: + +1. gene_ids overlap between genic and intergenic locus maps (structural + precondition for the bug, expected in real data) +2. full repeats at gene_id with a perfect observer gives metrics of 1.0 + (baseline sanity check) +3. genic_repeats at locus granularity with a perfect observer gives 1.0 + (locus level is unaffected by the partition issue) +4. genic_repeats at gene_id with a perfect genic observer now gives 1.0 + (regression test for the fix; would fail if the fix were reverted) +5. intergenic_repeats at gene_id with a perfect observer gives 1.0 +6. 
subset metrics are not lower than full-set metrics for a perfect observer + (monotonicity property) + +test_metamorphic.py tests properties that must hold across input transformations +without needing exact expected values: + +- scale invariance: multiplying observed counts by a constant does not change + Pearson or Spearman +- scaling does change log1p_rmse +- monotone recall: adding true positives never decreases recall +- noise degrades metrics: progressively noisier observers give lower Pearson +- perfect beats null: a perfect observer strictly outperforms a zero observer + on all metrics +- granularity aggregation consistency: a perfect locus-level count matrix + summed to family_id also gives metrics of 1.0 +- random counts give low correlation (below 0.7 for 50-feature random data) + + +## Workflow tests (test/workflow/) + +test_snakemake_dryrun.py verifies that snakemake --dry-run succeeds for both +production configs (simulation_smartseq2.yaml and simulation_chromium.yaml) +and for the test negative control config. These tests are skipped if snakemake +is not in the PATH and are marked with the workflow marker. + +Snakefile_test is a separate Snakemake workflow that includes all production +snmk modules and adds the negative control rules: + +simulate_from_genes runs simulate_reads.py with the Ensembl gene GTF instead +of the repeat GTF. Reads come from gene body regions rather than repeat +elements. This gives a ground truth over gene features. + +check_negative_control_recall runs evaluate.py comparing that gene-body +ground truth against the repeat quantification output and asserts that recall +is below the threshold in testing.negative_control_max_recall (default 0.10). +A low recall means the pipeline correctly does not attribute gene-body reads +to repeat elements. + +The conda environment for these rules is defined in +test/workflow/envs/test_evaluation.yaml (python, scipy, pandas, numpy). 
+ + +## GitHub Actions + +.github/workflows/tests.yml defines three jobs: + +unit-and-integration runs on every push and pull request. It uses plain +Python and does not need any bioinformatics tools installed. + +snakemake-dryrun runs on every push and pull request. It installs snakemake +via micromamba and runs dry-run checks on both production configs and the test +Snakefile. Snakemake installs conda dependencies itself when --use-conda is +passed, but dry-runs do not trigger conda installs. + +negative-control-run only runs when the workflow is triggered manually +(workflow_dispatch); its `if` condition checks nothing else. It runs the +full negative control workflow with --use-conda and requires reference data +to be accessible from the runner. diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..ad80b2e --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,13 @@ +""" +Shared pytest configuration and path setup. + +Adds workflow/scripts to sys.path so unit tests can import scripts directly. +""" +import sys +import os + +SCRIPTS_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', 'workflow', 'scripts') +) +if SCRIPTS_DIR not in sys.path: + sys.path.insert(0, SCRIPTS_DIR) diff --git a/test/integration/__init__.py b/test/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/integration/test_feature_set_consistency.py b/test/integration/test_feature_set_consistency.py new file mode 100644 index 0000000..3559831 --- /dev/null +++ b/test/integration/test_feature_set_consistency.py @@ -0,0 +1,393 @@ +""" +Integration tests for evaluation consistency across feature sets. + +Background +---------- +The pipeline evaluates each (feature_set × granularity × aligner) combination. +An initially puzzling result was that genic_repeats and intergenic_repeats at +gene_id granularity showed lower metrics than the full repeats set, even for a +hypothetically perfect observer. 
+ +Root cause (now fixed in evaluate.py) +-------------------------------------- +At gene_id granularity, load_ground_truth() previously aggregated ALL loci for +a given gene_id (summing genic + intergenic copies), then filtered the already- +aggregated dict by checking whether the gene_id appeared in the genic locus_map. + +For a gene_id like AluSz6 that has copies in BOTH genic and intergenic regions: + + - genic locus_map contains AluSz6 (it has genic copies) + - OLD truth['c1']['AluSz6'] = 5 (genic) + 3 (intergenic) = 8 ← inflated + - observed from genic normalization = 5 (only genic copy) + - -> truth > observed -> degraded Pearson / RMSE, false-negative detections + +The fix moves locus_map loading to BEFORE load_ground_truth() and passes a +valid_locus_ids set so filtering happens AT LOCUS LEVEL before any aggregation. + +Scenario used by these tests +----------------------------- +We construct a minimal ground truth with: + - genic_alu_1 -> AluSz6, Alu, SINE, count=5 (GENIC) + - intergenic_alu_1 -> AluSz6, Alu, SINE, count=3 (INTERGENIC - same gene_id!) + - genic_alu_2 -> AluSx, Alu, SINE, count=7 (GENIC) + - intergenic_alu_2 -> AluSx, Alu, SINE, count=6 (INTERGENIC - same gene_id!) + - genic_L1_1 -> L1PA2, L1, LINE, count=2 (GENIC) + - genic_mir_1 -> MIR, MIR, SINE, count=4 (GENIC) + - genic_L2_1 -> L2, L2, LINE, count=3 (GENIC) + - intergenic_dna_1 -> TcMar, TcMar, DNA, count=1 (INTERGENIC) + - [TcMar has a genic reference locus genic_tcmar_ref not expressed in + simulation, but present in the genic locus_map - this triggers the + false-negative detection failure in the old code.] + +Genic locus_map contains: + genic_alu_1, genic_alu_2, genic_L1_1, genic_mir_1, genic_L2_1, genic_tcmar_ref + +Intergenic locus_map contains: + intergenic_alu_1, intergenic_alu_2, intergenic_dna_1 + +AluSz6 and AluSx appear in BOTH locus_maps (gene_id level). 
+ +With the fix: + genic truth at gene_id: AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3, TcMar=0 + genic observed at gene_id (from genic normalization): AluSz6=5, AluSx=7, L1PA2=2, MIR=4, L2=3 + -> perfect match -> pearson=1.0, recall=1.0, precision=1.0 +""" +import csv +import math +import os +import subprocess +import sys + +import pytest + +SCRIPTS_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', 'workflow', 'scripts') +) +EVALUATE = os.path.join(SCRIPTS_DIR, 'evaluate.py') + +# --------------------------------------------------------------------------- +# Shared fixture: build the scenario files +# --------------------------------------------------------------------------- + +GROUND_TRUTH_ROWS = [ + # cell_id, locus_id, repeat_id (=gene_id), family_id, class_id, true_count + ('cell_001', 'genic_alu_1', 'AluSz6', 'Alu', 'SINE', '5'), + ('cell_001', 'intergenic_alu_1', 'AluSz6', 'Alu', 'SINE', '3'), + ('cell_001', 'genic_alu_2', 'AluSx', 'Alu', 'SINE', '7'), + ('cell_001', 'intergenic_alu_2', 'AluSx', 'Alu', 'SINE', '6'), + ('cell_001', 'genic_L1_1', 'L1PA2', 'L1', 'LINE', '2'), + ('cell_001', 'genic_mir_1', 'MIR', 'MIR', 'SINE', '4'), + ('cell_001', 'genic_L2_1', 'L2', 'L2', 'LINE', '3'), + ('cell_001', 'intergenic_dna_1', 'TcMar', 'TcMar', 'DNA', '1'), +] + +# Genic locus_map includes a reference locus for TcMar that was NOT expressed +# in the simulation. In the old code, TcMar's intergenic count (1) would +# bleed into the genic truth because TcMar is in the genic locus_map. 
+GENIC_LOCUS_MAP = [ + ('genic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('genic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('genic_L1_1', 'L1PA2', 'L1', 'LINE'), + ('genic_mir_1', 'MIR', 'MIR', 'SINE'), + ('genic_L2_1', 'L2', 'L2', 'LINE'), + # reference locus for TcMar - exists in genome but NOT expressed + ('genic_tcmar_ref', 'TcMar', 'TcMar', 'DNA'), +] + +INTERGENIC_LOCUS_MAP = [ + ('intergenic_alu_1', 'AluSz6', 'Alu', 'SINE'), + ('intergenic_alu_2', 'AluSx', 'Alu', 'SINE'), + ('intergenic_dna_1', 'TcMar', 'TcMar', 'DNA'), +] + +FULL_LOCUS_MAP = GENIC_LOCUS_MAP + INTERGENIC_LOCUS_MAP + +# "Perfect genic observer" at gene_id level - exactly what genic normalization produces: +# only counts from genic loci, using GENIC_LOCUS_MAP. +GENIC_OBSERVED_GENE_ID = { + 'AluSz6': 5, + 'AluSx': 7, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + # TcMar: 0 (its only genic reference locus was not expressed) +} + +# "Perfect intergenic observer" at gene_id level +INTERGENIC_OBSERVED_GENE_ID = { + 'AluSz6': 3, + 'AluSx': 6, + 'TcMar': 1, +} + +# "Perfect full observer" at gene_id level +FULL_OBSERVED_GENE_ID = { + 'AluSz6': 8, + 'AluSx': 13, + 'L1PA2': 2, + 'MIR': 4, + 'L2': 3, + 'TcMar': 1, +} + +# "Perfect genic observer" at locus level +GENIC_OBSERVED_LOCUS = { + 'genic_alu_1': 5, + 'genic_alu_2': 7, + 'genic_L1_1': 2, + 'genic_mir_1': 4, + 'genic_L2_1': 3, +} + + +# --------------------------------------------------------------------------- +# File-writing helpers +# --------------------------------------------------------------------------- + +def write_ground_truth(tmp_path): + p = tmp_path / 'ground_truth.tsv' + with open(p, 'w') as fh: + fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n') + for row in GROUND_TRUTH_ROWS: + fh.write('\t'.join(row) + '\n') + return p + + +def write_locus_map(tmp_path, name, rows): + p = tmp_path / name + with open(p, 'w') as fh: + for row in rows: + fh.write('\t'.join(row) + '\n') + return p + + +def write_count_matrix(tmp_path, name, 
feature_counts, cells=('cell_001',)): + p = tmp_path / name + features = sorted(feature_counts) + with open(p, 'w') as fh: + fh.write('feature_id\t' + '\t'.join(cells) + '\n') + for feat in features: + vals = [str(feature_counts[feat]) for _ in cells] + fh.write(feat + '\t' + '\t'.join(vals) + '\n') + return p + + +def run_evaluate(tmp_path, gt, counts, locus_map, granularity, feature_set): + prefix = str(tmp_path / f'{feature_set}_{granularity}') + cmd = [ + sys.executable, EVALUATE, + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test', + '--multimapper-mode', 'unique', + '--granularity', granularity, + '--feature-set', feature_set, + '--locus-map', str(locus_map), + '--output-prefix', prefix, + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f'evaluate.py failed:\n{result.stderr}') + return read_global_metrics(prefix + '_global_metrics.tsv') + + +def read_global_metrics(path): + with open(path) as fh: + reader = csv.DictReader(fh, delimiter='\t') + return next(reader) + + +def parse_float(v): + try: + return float(v) + except (ValueError, TypeError): + return float('nan') + + +# --------------------------------------------------------------------------- +# Test 1: Structural - gene_id overlap between genic and intergenic locus_maps +# --------------------------------------------------------------------------- + +def test_gene_ids_overlap_between_genic_and_intergenic_locus_maps(): + """ + Documents the structural precondition for the bug: + AluSz6 and AluSx appear as gene_ids in BOTH genic and intergenic locus_maps. + This is expected in real data (repeat families span both contexts) and is + the reason why locus-level filtering before aggregation is necessary. 
+ """ + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + shared = genic_gene_ids & intergenic_gene_ids + assert len(shared) > 0, ( + 'Expected at least one gene_id in both genic and intergenic maps ' + f'but got genic={genic_gene_ids}, intergenic={intergenic_gene_ids}' + ) + assert 'AluSz6' in shared + assert 'AluSx' in shared + + +def test_full_locus_map_partitions_at_locus_level_not_gene_id(): + """ + At locus level, genic and intergenic locus_maps are disjoint. + At gene_id level they overlap. This is why the old post-aggregation + filter could not correctly separate the two subsets. + """ + genic_loci = {row[0] for row in GENIC_LOCUS_MAP} + intergenic_loci = {row[0] for row in INTERGENIC_LOCUS_MAP} + # Locus level: disjoint + assert genic_loci & intergenic_loci == set() + # Gene_id level: overlapping + genic_gene_ids = {row[1] for row in GENIC_LOCUS_MAP} + intergenic_gene_ids = {row[1] for row in INTERGENIC_LOCUS_MAP} + assert genic_gene_ids & intergenic_gene_ids != set() + + +# --------------------------------------------------------------------------- +# Test 2: Full repeats, gene_id - perfect observer -> perfect metrics (baseline) +# --------------------------------------------------------------------------- + +def test_full_repeats_gene_id_perfect_observer_gives_perfect_metrics(tmp_path): + """ + Sanity baseline: if observed = truth for all repeats at gene_id level, + all accuracy metrics should be 1.0 / 0.0. 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'full_locus_map.tsv', FULL_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'full_counts.tsv', FULL_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['log1p_rmse']) == pytest.approx(0.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 3: Genic repeats, locus granularity - perfect observer -> perfect metrics +# --------------------------------------------------------------------------- + +def test_genic_repeats_locus_granularity_perfect_observer(tmp_path): + """ + At locus level, a perfect genic observer should give perfect metrics. + No cross-partition contamination is possible at locus level because + locus IDs are unique per repeat instance. + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_locus_counts.tsv', GENIC_OBSERVED_LOCUS) + m = run_evaluate(tmp_path, gt, counts, lm, 'locus', 'genic_repeats') + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 4 (THE KEY TEST): Genic repeats, gene_id - perfect genic observer +# should now give perfect metrics after the fix +# --------------------------------------------------------------------------- + +def test_genic_repeats_gene_id_perfect_genic_observer_gives_perfect_metrics(tmp_path): + """ + Core regression test for the locus-level filtering fix. 
+ + With the OLD code (post-aggregation filter): + - truth['cell_001']['AluSz6'] = 5 + 3 = 8 (sums ALL AluSz6 loci) + - observed['cell_001']['AluSz6'] = 5 (only genic loci) + - -> truth > observed -> degraded pearson, false-negative for TcMar + + With the FIXED code (locus-level filter BEFORE aggregation): + - only genic loci contribute to truth + - truth['cell_001']['AluSz6'] = 5, truth['cell_001']['AluSx'] = 7 + - TcMar: its intergenic count is excluded; genic_tcmar_ref is not expressed + -> TcMar absent from both truth and observed -> no false negative + - observed matches truth exactly -> all metrics = 1.0 + """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'genic_locus_map.tsv', GENIC_LOCUS_MAP) + counts = write_count_matrix(tmp_path, 'genic_gene_id_counts.tsv', GENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'genic_repeats') + + pearson = parse_float(m['pearson_r']) + recall = parse_float(m['recall']) + precision = parse_float(m['precision']) + rmse = parse_float(m['log1p_rmse']) + + assert pearson == pytest.approx(1.0, abs=1e-4), ( + f'pearson={pearson} - expected 1.0; ' + 'if this fails, the locus-level filtering fix may have been reverted' + ) + assert recall == pytest.approx(1.0, abs=1e-4), f'recall={recall}' + assert precision == pytest.approx(1.0, abs=1e-4), f'precision={precision}' + assert rmse == pytest.approx(0.0, abs=1e-4), f'log1p_rmse={rmse}' + + +# --------------------------------------------------------------------------- +# Test 5: Intergenic repeats, gene_id - same fix applies +# --------------------------------------------------------------------------- + +def test_intergenic_repeats_gene_id_perfect_intergenic_observer_gives_perfect_metrics(tmp_path): + """ + Mirror of test 4 for the intergenic subset. + + After the fix, intergenic truth for AluSz6 = 3 (only intergenic_alu_1), + not 8 (which would include genic_alu_1). 
+ """ + gt = write_ground_truth(tmp_path) + lm = write_locus_map(tmp_path, 'intergenic_locus_map.tsv', INTERGENIC_LOCUS_MAP) + counts = write_count_matrix( + tmp_path, 'intergenic_counts.tsv', INTERGENIC_OBSERVED_GENE_ID) + m = run_evaluate(tmp_path, gt, counts, lm, 'gene_id', 'intergenic_repeats') + + assert parse_float(m['pearson_r']) == pytest.approx(1.0, abs=1e-4), ( + f"pearson={m['pearson_r']} - expected 1.0 for perfect intergenic observer" + ) + assert parse_float(m['recall']) == pytest.approx(1.0, abs=1e-4) + assert parse_float(m['precision']) == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# Test 6: Genic + intergenic metrics should not be lower than full-set metrics +# for a perfect full observer (monotonicity property) +# --------------------------------------------------------------------------- + +def test_subset_metrics_not_lower_than_full_for_perfect_observer(tmp_path): + """ + Metamorphic property: if an aligner perfectly recovers ALL repeats, + evaluating a clean subset should not give LOWER metrics than the full set. + + We use a perfect full observer and check that both genic and intergenic + sub-evaluations also give perfect metrics (1.0). + + This test would have failed before the fix because the inflated truth + counts caused pearson < 1.0 even for a perfect observer. 
+ """ + gt = write_ground_truth(tmp_path) + + # Full repeats evaluation + lm_full = write_locus_map(tmp_path, 'full_lm.tsv', FULL_LOCUS_MAP) + cnt_full = write_count_matrix(tmp_path, 'full_cnt.tsv', FULL_OBSERVED_GENE_ID) + m_full = run_evaluate(tmp_path, gt, cnt_full, lm_full, 'gene_id', 'repeats') + + # Genic repeats evaluation (using genic counts only) + lm_genic = write_locus_map(tmp_path, 'genic_lm.tsv', GENIC_LOCUS_MAP) + cnt_genic = write_count_matrix(tmp_path, 'genic_cnt.tsv', GENIC_OBSERVED_GENE_ID) + m_genic = run_evaluate(tmp_path, gt, cnt_genic, lm_genic, 'gene_id', 'genic_repeats') + + # Intergenic repeats evaluation + lm_inter = write_locus_map(tmp_path, 'inter_lm.tsv', INTERGENIC_LOCUS_MAP) + cnt_inter = write_count_matrix( + tmp_path, 'inter_cnt.tsv', INTERGENIC_OBSERVED_GENE_ID) + m_inter = run_evaluate(tmp_path, gt, cnt_inter, lm_inter, 'gene_id', 'intergenic_repeats') + + full_pearson = parse_float(m_full['pearson_r']) + genic_pearson = parse_float(m_genic['pearson_r']) + inter_pearson = parse_float(m_inter['pearson_r']) + + # All should be 1.0 (perfect observer) - none should be lower than full + assert full_pearson == pytest.approx(1.0, abs=1e-4) + assert genic_pearson >= full_pearson - 0.01, ( + f'genic pearson={genic_pearson} is lower than full={full_pearson}; ' + 'subset metrics should not be worse than full-set metrics for a perfect observer' + ) + assert inter_pearson >= full_pearson - 0.01, ( + f'intergenic pearson={inter_pearson} is lower than full={full_pearson}' + ) diff --git a/test/integration/test_metamorphic.py b/test/integration/test_metamorphic.py new file mode 100644 index 0000000..e080e64 --- /dev/null +++ b/test/integration/test_metamorphic.py @@ -0,0 +1,264 @@ +""" +Metamorphic tests for the evaluation pipeline. + +Metamorphic testing verifies properties that must hold across transformations +of the input, without needing exact expected values. 
This is especially +useful for a bioinformatics benchmarking pipeline where true correct outputs +are not always known in advance. + +Properties tested +----------------- +1. Scale invariance: multiplying observed counts by a constant should not + change Pearson or Spearman (but will change RMSE). +2. Monotone recall: adding true positives to observed should increase + or maintain recall - never decrease it. +3. Noise degrades metrics: adding random noise to a perfect observer should + monotonically worsen Pearson and RMSE. +4. Null observer is worst: a perfect observer beats a zero observer on all + quantification and detection metrics. +5. Permuting cell labels degrades per-cell metrics but not global metrics + computed on pooled counts. +6. Family_id granularity: summing a perfect locus-level count matrix to + family_id should also give perfect metrics. +""" +import math +import random + +import pytest + +import evaluate as ev + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_truth_obs(n_cells=4, n_features=8, seed=42): + """Return a (truth, observed) pair where observed = truth (perfect).""" + rng = random.Random(seed) + cells = [f'c{i}' for i in range(n_cells)] + feats = [f'feat_{i}' for i in range(n_features)] + truth = {c: {f: rng.randint(1, 20) for f in feats} for c in cells} + obs = {c: dict(d) for c, d in truth.items()} + return truth, obs, cells, feats + + +# --------------------------------------------------------------------------- +# 1. 
Scale invariance of correlation metrics +# --------------------------------------------------------------------------- + +def test_scale_invariance_pearson(): + """Multiplying observed by a scalar should not change Pearson.""" + truth, obs, cells, feats = make_truth_obs() + m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats) + + obs_scaled = {c: {f: v * 10 for f, v in d.items()} for c, d in obs.items()} + m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats) + + assert m1['pearson_r'] == pytest.approx(m2['pearson_r'], abs=1e-4) + + +def test_scale_invariance_spearman(): + """Multiplying observed by a scalar should not change Spearman.""" + truth, obs, cells, feats = make_truth_obs() + m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats) + + obs_scaled = {c: {f: v * 3 for f, v in d.items()} for c, d in obs.items()} + m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats) + + assert m1['spearman_r'] == pytest.approx(m2['spearman_r'], abs=1e-4) + + +def test_scale_changes_rmse(): + """Scaling observed DOES change log1p_rmse.""" + truth, obs, cells, feats = make_truth_obs() + m1 = ev.compute_metrics_for_subset(truth, obs, cells, feats) + + obs_scaled = {c: {f: v * 100 for f, v in d.items()} for c, d in obs.items()} + m2 = ev.compute_metrics_for_subset(truth, obs_scaled, cells, feats) + + # RMSE for perfect observer is 0; scaled observer should have higher RMSE + assert m2['log1p_rmse'] > m1['log1p_rmse'] + + +# --------------------------------------------------------------------------- +# 2. Monotone recall: adding true positives never decreases recall +# --------------------------------------------------------------------------- + +def test_monotone_recall_adding_true_positives(): + """ + Start with a partially correct observer. Add more true positives. + Recall must not decrease.
+ """ + truth = {'c1': {'A': 5, 'B': 3, 'C': 2, 'D': 7}} + feats = ['A', 'B', 'C', 'D'] + + obs_partial = {'c1': {'A': 5}} # only A correct + obs_more = {'c1': {'A': 5, 'B': 3}} # A and B correct + + m1 = ev.compute_metrics_for_subset(truth, obs_partial, ['c1'], feats) + m2 = ev.compute_metrics_for_subset(truth, obs_more, ['c1'], feats) + + assert m2['recall'] >= m1['recall'] - 1e-9 + + +def test_monotone_recall_full_recovery_is_maximum(): + truth, obs, cells, feats = make_truth_obs() + # Partial: only first half of features observed + feats_half = feats[:len(feats)//2] + obs_partial = {c: {f: d[f] for f in feats_half if f in d} for c, d in obs.items()} + + m_partial = ev.compute_metrics_for_subset(truth, obs_partial, cells, feats) + m_full = ev.compute_metrics_for_subset(truth, obs, cells, feats) + + assert m_full['recall'] >= m_partial['recall'] - 1e-9 + assert m_full['f1'] >= m_partial['f1'] - 1e-9 + + +# --------------------------------------------------------------------------- +# 3. Noise degrades metrics monotonically +# --------------------------------------------------------------------------- + +def test_increasing_noise_degrades_pearson(): + """ + Progressively noisier observers should give monotonically worse Pearson. + """ + truth, _, cells, feats = make_truth_obs(seed=7) + prev_r = 1.0 + + rng = random.Random(7) + obs = {c: dict(d) for c, d in truth.items()} + + noise_levels = [0, 1, 3, 10, 50] + for level in noise_levels: + noisy_obs = { + c: {f: max(0, v + rng.randint(-level, level)) for f, v in d.items()} + for c, d in truth.items() + } + m = ev.compute_metrics_for_subset(truth, noisy_obs, cells, feats) + r = m['pearson_r'] + if not isinstance(r, str) and not math.isnan(float(r)): + assert float(r) <= prev_r + 0.05, ( + f'Pearson increased from {prev_r} to {r} as noise level went to {level}' + ) + prev_r = float(r) + + +# --------------------------------------------------------------------------- +# 4. 
Perfect observer strictly beats null observer +# --------------------------------------------------------------------------- + +def test_perfect_beats_null_on_all_metrics(): + truth, perfect_obs, cells, feats = make_truth_obs(seed=3) + null_obs = {} + + m_perfect = ev.compute_metrics_for_subset(truth, perfect_obs, cells, feats) + m_null = ev.compute_metrics_for_subset(truth, null_obs, cells, feats) + + null_pearson = m_null.get('pearson_r', 'NA') + null_pearson_val = 0.0 if null_pearson in ('NA', '') else float(null_pearson) + assert float(m_perfect['pearson_r']) > null_pearson_val + assert float(m_perfect['recall']) > float(m_null['recall']) + assert float(m_perfect['f1']) > float(m_null['f1']) + assert (float(m_perfect['log1p_rmse']) + < float(m_null['log1p_rmse'])) + + +# --------------------------------------------------------------------------- +# 5. Granularity aggregation consistency +# --------------------------------------------------------------------------- + +def test_locus_counts_summed_to_family_give_correct_metrics(tmp_path): + """ + A locus-level count matrix summed to family_id should give the same + result as loading ground truth at family_id granularity directly. + Validates that the granularity aggregation pipeline is self-consistent. 
+ """ + import csv + import os + import sys + import subprocess + + EVALUATE = os.path.join( + os.path.dirname(__file__), '..', '..', 'workflow', 'scripts', 'evaluate.py' + ) + + gt_rows = [ + ('cell_001', 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '5'), + ('cell_001', 'AluSz6_dup2', 'AluSz6', 'Alu', 'SINE', '3'), + ('cell_001', 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '7'), + ('cell_001', 'MIR3_dup1', 'MIR3', 'MIR', 'SINE', '2'), + ] + locus_map_rows = [ + ('AluSz6_dup1', 'AluSz6', 'Alu', 'SINE'), + ('AluSz6_dup2', 'AluSz6', 'Alu', 'SINE'), + ('L1PA2_dup1', 'L1PA2', 'L1', 'LINE'), + ('MIR3_dup1', 'MIR3', 'MIR', 'SINE'), + ] + + gt_path = tmp_path / 'gt.tsv' + with open(gt_path, 'w') as fh: + fh.write('cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n') + for row in gt_rows: + fh.write('\t'.join(row) + '\n') + + lm_path = tmp_path / 'lm.tsv' + with open(lm_path, 'w') as fh: + for row in locus_map_rows: + fh.write('\t'.join(row) + '\n') + + # Perfect observer at family_id level (manually aggregated) + family_counts = {'Alu': 8, 'L1': 7, 'MIR': 2} + cnt_path = tmp_path / 'counts.tsv' + with open(cnt_path, 'w') as fh: + fh.write('feature_id\tcell_001\n') + for fam, cnt in family_counts.items(): + fh.write(f'{fam}\t{cnt}\n') + + prefix = str(tmp_path / 'family_eval') + cmd = [ + sys.executable, EVALUATE, + '--ground-truth', str(gt_path), + '--observed-counts', str(cnt_path), + '--aligner', 'test', + '--multimapper-mode', 'unique', + '--granularity', 'family_id', + '--feature-set', 'repeats', + '--locus-map', str(lm_path), + '--output-prefix', prefix, + ] + r = subprocess.run(cmd, capture_output=True, text=True) + assert r.returncode == 0, r.stderr + + with open(prefix + '_global_metrics.tsv') as fh: + m = next(csv.DictReader(fh, delimiter='\t')) + + pearson = float(m['pearson_r']) + recall = float(m['recall']) + assert pearson == pytest.approx(1.0, abs=1e-4), ( + f'family_id aggregation: pearson={pearson}, expected 1.0' + ) + assert recall == pytest.approx(1.0, 
abs=1e-4) + + +# --------------------------------------------------------------------------- +# 6. Random counts give low (not artificially inflated) metrics +# --------------------------------------------------------------------------- + +def test_random_observer_gives_low_correlation(): + """ + A random observer uncorrelated with truth should give Pearson near 0. + Uses enough data points to make this reliable. + """ + rng = random.Random(99) + n_feats = 50 + feats = [f'f{i}' for i in range(n_feats)] + truth = {'c1': {f: rng.randint(1, 100) for f in feats}} + random_obs = {'c1': {f: rng.randint(1, 100) for f in feats}} + + m = ev.compute_metrics_for_subset(truth, random_obs, ['c1'], feats) + # With 50 random pairs, Pearson should rarely be above 0.5 + r = float(m['pearson_r']) + assert abs(r) < 0.7, ( + f'Random observer gave high Pearson={r}; this suggests the metric is inflated' + ) diff --git a/test/unit/__init__.py b/test/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unit/__pycache__/__init__.cpython-312.pyc b/test/unit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35e9f2c76be87eee40bf03e688d1bc8fc64e26ee GIT binary patch literal 145 zcmX@j%ge<81VMlIXM*U*AOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdWuTvtpPQMz^#aWzKwP9iT(k@LX(7P|*4U>!wm}~X^u>gfQkV^(0D0KeTWS}ZT|MnN zcYeiCCh6EIg5+{|=FXjS@7#N5=A7@GbLWq$8WA9UB>XmCjSIrZc;F<-&fNQZ$h<31 zVM3rHl?vj7#P@+2l9fbZCO9FB)|t>mh@S~hg!x{XQ20JF5v75v!bFS`psFG$0*%q& zRdFItWhm*OA)uW!477_XK)Y!K=r$S!y8VijRLjGW9~JX@ub$I%t#`7-dgn{*^~pkM z{en;*kY+Oe^-MmTnY9>=f^iXFXf84be{J1TMYe6qABHzOJceE;E~xYsifAM zFZN<~IH{Q=qw8!*G6Qf1$?!%zqiH#&bBfLSvWX>F=BhuR5v(w7RQi6$ z@n`u^@}=T9V{0a<=H}}n9<=(GRRQ;n^A1r8v3CBV8}YKDvtOMPH3o~ zdNFN0eSE4klRKWDf!>#jnd2JE9*6GaGN3wfY{%z7zw#XK|0Ydp?&G6xFg12}SwCE- z6aLzE&|V9|htZB&bbmFv|Hku+ncF?@$;;6*3uo8J!PYCATa(YXVOv=!5336Jv+3Hu-BAcOqt|GkjU#OxyrlhggWO$nV#xr3NSE4$flG@ 
zF%^$c0x<-qTUQ+I2li2hXf-LbqgdeyBzutTMY0b`FAxnOJr|z6tPis0fi0yV$oVK(J$eEl|l2IhBCI3&8V)(;%lZQ1Y)G7ldT+e7R##NevEi zq71^%6qT~99D_2Y9EIay$iUgil8_9U!R!<(&6q(=&oJE#NaIdrEAz<>n7QEDa=hkR+?vHaOwd3wbMWrgpcC=H%8$ubT5uCj@|CRec|@_dx_PK zo}W)$yLf&3b@}?m8#`}A7l#%n7hk?JI{ryORG$++5yas0qBS+PVFrx}f@?^ggxtG= z0h3dPhY0?QVANw2)=#e%2=M}feT({tvpvD-Q8?LS)sWMBwP z@m71(bN?QBvjtMvo0|qb@i@^GZ^G*u+D-Uetp=t&;l_N3oAx1FKZbRbemwbF>a~k} zJ0tAOsI#~ABLVYXhNmDvV>^L=y3uo_I&~_Fp;&yC2#r^wM#PBnXNMR>EBJcQ_}L-w z>~vrq@7eLyX*8!1=gLj^q7EPJv~fg?cF}GlX3tZo<#`G@EjfF8o+2<$1V*gTZ8%Rb z{_S7I@zb741m<7|p96mX0adkf_a8Ix?kB$7cc1S3Dvut}->&l;TwM?7m-UTMX-*#i z%i9zq>=3*YOX7xf`ZRb225nbB89ippYw?!9u>6 z)0PPO2L>7XMHxyXJUp?2enT`mitVU1nsS<@H=z&t29zq+zHx%x&3H z@eS~TSRQY^ggy&10BFeskd&dE9g!3`$6+^H$m{G3_M->M4kV~r8NPT7RVPCY#~w$5 zk~68AVSQ#cJ)6;|OgUei%olPd-XYAeXOIoivfmFe>Oa`zhZyraB~Af!%jXJIGiA7h zqlP)`4u2_Nx8n49OV}N06Lv>A>;|{UCfKdE408P-`q9B|mBw4{ksnMq=#dAe+k)Nk zCNQym*xlj6?oQfeJapJiy8%)M97t&>Mx;eJwVlJM9<0$an0BAbCbQ&%Zo0z;(^2{u zK(}b#iYI=4Fs*Pf9RWCnqu*r#>aW`90Zu)Bo}B_2#|V&hisTyTz+`}pU}*{odUEoA zfztd{keBQ!EPEQscaeMp$v2U3tnduxo<(vN$vBelA>lxYJK)aa;qyqog9P!o}C1{&AT8vQ%0$0%Gk|Ej>GGrsfy%}L6tgY*=CuG|7N_m%oMIvFaXTw;>6TlEh z@q9!Ld?3#C_!1DWeMI+l$#A`cpa4D22HrJ~6Z}p}6JFQ%*M!egl=r{3>z``Up6omi zP54~z3q&zw?(b4~JV>!sPeez9hbLc4_w^h(SGN5)0aQKk55ixDf4C9?L9E(>I15d$ z{|pEijjs6RLf5o}ZBi_AqZUk1%esY(ZQX)7X%#`nE&^lGDuRq%G~^UP#xsitToW)- zIRduv0Jj6JykjS)AkNO^F2q5;1zkjgc8I+IIWEPyv^NO^-BaZbU%)-ZWAacuoa|uV zNA58s_?)sIAi03#he&>eq*Z$}TBi;ChGB|vxsO`GUp!`o+pW%}HpRqGS*NU+cxq_V z(TEm>hJ3oiQXPl3R2^XQ>N%!i5TEHxgUaXNul)xQzkT_UXJ6j9u$W!hF|wXKjRKP#f$P{uC-H?$|BZ+QX$g*;D`c%@}1abB2*FS@BTCEk)9yYHcgJ6z#$ zR(Ly*l`aWc=_fE(Jm@RFB^!ZXNN*_w+?AW`GFQ}9f;0?45D0N_B@i+~O~WAy2!D}A zu#Qg&L|e9kz8Jd<0_rXIsF_{n&Q?>q%;f~N6!WRYM#9=tBcA$f#OJCJs@is07y{lM zM%Y#lO3QA3*A{Y~c3VB5S_r{YKxn6Jl|-id?H9$jgJ(smgb+d)?o$WhfT*U^t|nu3 zt*a({t{s$Od7@ zJ+wnyl_o_qj*FV+b2QBZ$eLx*pbxnfiw1zHP=3*;;>H}E3K?6cg3_z^K7;xMwF-f1 zZ9cD0fo?@tG%QC0b(~2M0{tfoJ9Z#(p@p_R|>$Rv@Xu+)VE7cQz 
zC+x|soUli?RF?b>lr>mbUaJGhjsDgTFR%KO^gnmGE9PuyBvY)xOq@t3SxUz5D@r zX*IF0mPl3;$>qcowM43#NG&IZYl&0U#HsfVtR&7X$HuFPGuKk@r~c*pcM@k7#_zn;t4RN>Ch$Yy7#uTBHI%UjsqgwGYaG-|uP zLh$8->T~?L>k51=HL}W@$J^uf_&dIe;|H6o(`R!rbbhfZB;3P=F+GMW%FIA&Fl8x) z=df(E=V~1QFtCga4uT+U!M5SD+B0zKB4|pTEM~9a6+Jf=_jLUf%U%Vdp{K+(N-UGa zT8`T&)P)O_x!6zVwR8zqQ$y@4N7+9@8@#bUKn7MkHHv+;W1Fn1+iU9Hs=9Ys?X9VY zs_LO-^+-(}sHy{t`m#ELfv7#TcD8o;?~pzBu~;VB6nT!Qac~E>NgmPStNSn9G{C*( zIJZW*P0?tnjMLu=q3{6~6`UII!4)pG3jPBD|2{6X`jt<7G!k;oLMsrBo@L>y(@4m9 z!mITKSv|fscfm7Qmps7XYs?e&RIe4BXkwTACiUD|TFQm`Ul z5%voyK@;ZzZ@wak@O$guF9%)`=EY>x5-H_~y)t3oQrVQi1%P70xBfH)Z>SEQ<+??v z$<8fhh)FJ$Audl!v#?HqptHv3C*fiQb$~B8@5SZorHUy{PO+=V%_R#LD*N!T7f2(a zav$gh-!rfre2_rXFyPraoxKTdT*7w#4T#^@(I}R_yBa^Zbu1lLyuQ8j&7IfAuAg0j z#jrnE?cV&#-vgx!LcD~-=>m$saY3tLE0n0vs&oRKa z#usFmid$rc>@w>v{xdXbN+tOI!>Vh>H-DRGIm^%tWVl$^a-d}}hw%vRVB(;ei*tSt z*l|3uA4wk&GyIL25}hmLzRBK!Lqvlb`rcs4swnUEQsP&;h&=aDJUNc>c_&K zyYZlS@NOp&2R=y%V^6J)K6Nh;5T6(C#-iehySt;}=-u72cL_BRk+X7Z3?j76#aFXJ)&5dS<$R-?O`a5{-sj*naLhlL>aXT)&_LbK}3@ z;OeO4a=qeGT>D*$q`0%vemDR249F?BTtHy-leHU8XdE-mWx)E?1gB?@*dSV@eC?ok}a{3S|-KN@X$VDy0o{ zwXy_sjj|NJ_Mu~;(C*yfG*e^0 zfA{9qv0QH~e=r>z8fNK@y~Bg4d?q&-%jWtrshzQbbe?5Wu~cqgXgHswU(^jTC7Dkm zK~l*K_H76y2NjK1V%knsY^v_HIo!%#jSYWl;1G&ZFLpBdB6ERJ113 zc}9;%lZG^R??FZ{O{FIn)3*<|ZaSD7NN>svB(vGvU~-enQkz(MD4j%0rbfPL7|osU zC4`1;TsDzc53|K6jb5s%f{eP(2O3Und9i-9`@FyLtbfUc)+J-B#tyxD*I#USb;F;x zPqoI6hb{!8$CD?T#}2#`ymS2V3z7PldtU4r>pZ#cjC?M#_qcRE5PUiMV)UhY+6liH zKC$OqV95u5S7^n10aqkC`p^eKd;dd~UhBWv*}cv4);90`E`Q2+K{zWk%r!W8#nlIt zd|fKKg(m4z+yx1**=WxDTyf9HM*gH@$yguD4G$`bJR8m*Y{$)_M=9uC`C&E~%K!lA zS+`I4G=D0c%_cI67RcsO!z%3s^{2>pdy@m1>@og(Dw|aK?;#YLPNYzNex#xNI&q0V z*0_f?qTZ~T2)#y?Iso1rTZC_|aUl78Iw9s|F`Qq8k4kT3)b-bm&yIfPLb&C`(`WDK zIJM>U(y8#Cv+|y+_|cUTJ0U$oI=*_;=)j8Jfjz>LlFY79l=D_zbThZvC8%2#y!}?U zrK{+h;!)&&tHYXS#qKqNlZuz9c@^qRdtA>w2^jGe+m99x_7~^_{`dw<$m4$(hAvC3>5v&buIW5nb*d-#sRY&ry9TXwCoDH6hHl$*TsE! 
z_MzSqK1KhLnUH~B4af+Ld@)4*(rkXc=!W1Hv7<3+U$H0lJ=xBGH_j(`t5bjsJ7Zrm z&t()G7|n4fa2xHbSme1zLi|XjhRlIrBy492GI~9l6yl)O6ZdI8HOKO4g+=JRm&1%E z_k)LM9yQN2PhUD85Ayb@HJasE719Dm6;#$j`2$7@m9^5LOoy73(qu48Ri$@H_suu4 zZSx8zz_D27DBFZo>+n&(4>Ia{FXRd~p15Z+aK~u(h5FXBi|(4LUpxAc0WL2s8*4fr z2)!J5F>qq}_^R*5zZF0E(A0{plZ{(X%ae`woD1ywSUl(|*b0hYDxXw#-llT4|r3gl^h4;+A#164mqi2d3<}T@jo`Qsavs!1r zPOMh(IQ7wjY_@E@{hLRU>!rJRtM)4R6I!*~f&Jl}(UyH9&+=Ahwq+d{2a-e7jALsD zbC6qzgT;69WOAX5iCV7;h5#zRtqFvA#Rtc`M-UZt|ya zA#0<~!5e@E#`cQK2IPK`(HY5g2B%Z=RXvMzMORVE%VwGVB(IfiMKq$KNAVP8H>93| zryxV#GOhqvcNM(PujNqeJ0YpdZTRgsl@y^o#ar<8N{Wwi^;)b|@#ifGtG`Z2ZtR+} z*kPC2DwN+aGiAYI5=>0lR4L`41W*ptT!ST*YhXW1vzst6heJ(f=0J@C!c`pd*wSZ} zbRM%7Rnl2qyh=K+S)LMn%{PM9QuHe!C0y`lBz_dg6O8jsD2arUvaMjTOS8l;ON#5L zRP^EF#>az?T=X7u#Univ-peHXU#3=fnVS4%YO$B8dtSl^+k5w2`qOvU{3_huc<+Ry zd6Eaz3F$-ku3Zz-$QM_y9Xhr-uEuuliqUQd_CsGX_hS0VE#^7w7^gbpYI|~Mh~mlH!SzfOM zO6R9LCnB0VeOL=ZvSw;-Fu~aUxF5}@Us?-&S}#74WLje)1z|h^`H_)6m}F@!f}axy zhBMiGW>D39z$UOwtu3J*%nfH1o;1;y%^gT)6I}2QCe&nano7{>x_Nb=#O-A)$F$~z zt^+30M~AY>L9T0pk9VUsI;#%Twe)-F(u9Mp;=y+DV260np;uPFNJr^WmlLPwx^%*N zEi_bNsql^$_KbFq9=a%Rn7&E> z@5&q4ov4^IX#Wz8xn|IIzb=`OTQg{Tik^Ze3{_h}H{#GO7rbv;d>XPdR9g<+{(`>- z==K%-fNnqK5}>FbwgtTaau&%->~134iEJaXmB>yapCWP(k$XV^+|Lru zioo4rf^-M=Y6R|$Gmv`+M{XV02x=#MY!6*^FOdg`JP1O-y@MlkM=5Z38Nl7e9zn8+ z5HCoR66|Tp`V5i%lwSbIyLIH#p|fK%dzOCZ$ZddOXUWkPl_V!FcM8JiT4FK+8p5*I z3xgl52T7nJ>;;Uav#>5IYyi1?@KIj@xfUvJoR)9e|CGFOCNNs+%$gevj2sFrIgvn8 zeV5D-LicfyTm>0)NoBy9DbGOywB=g|=$n}X5};f;k1YWz-kNLPDwOFUXO~FN6>Ds# zL`}UysSbBnuw+oh_nOS5L^)4}E4~SNN%<<G7pjkYwfZc3 zVy!aJLRZIocR8ulb_a%{{PF1OLPsCMUGW~K_Zmir!yu3`EOmu2SdecV|L$G7!RaQV2;^pd>Kr9|Q);QKZ<{NJu?;iI}h3~p4oo8-VpE)WJP#tWm zkF7e)*6ITvo1r?0R-Xmc0d326Yk-r^x9Tu^tB9m&s=7q(ZPy(+?aFNX}=wiF}60XNhp}-LP+PzFR85voBI2;>GMaBEJXXWTjYI z1|-i9=8g;|$km@_qsYVa>Glbe^+sOLPKCQn^teUs6AMOVyga^@@JfPTzV$ol_Lf?K z&b(k$o;|$Gr%^fXxlDTAWl}pXF9(@DhD=1b#IJ=^vQiEtP3tZD92Lif{l{p}RsbAi 
zfA&w1#0gpMo;Mq#Ayfi!Q`XE;oS89(&k7rEJ&2j=RdS177s!6PdxoNtS`VFp}S4U$bqLXX3&Wm0G2kZPyFTtGr3Z5$FJ}a(y79J7Ngy>N7 z5?rd?*jUX2V*3@m*V(V&>(P0%gPk#M82SGY?O!)R9;Q88!sk-IlbC;mx{Vh^aOy2D zXVYb7$d{3vOqeQVwz}Kcony^ohsIJ<;T2|=yCKGV_yip7_Y^?q!RP8AC}J3}t8Y$$ zXGJr%S~6p-(NQoA01hm4M%ob2FY9}-Rg*ENY(oTtn@2Dpe!+ADA$q}65Th3kRPJVq zhet19*g;A3g6O((Xt*U#KU?!Bfvs$_pMhD!s$pMN31$xNO0%z0(Y_$uty4NHBD+dD ztFu>0XLmK903RRiiCOR5y#K0CY(@?|F<+IM&ss&OQl1?};eRcthrB5epsrICMF>$7 z6a*&;krWhFA>%{Q6rp%{WckBp98}DRgNkjuH}?3W`}V?pxw+l6nsNu|m(j>BDBXtZV*npZge4>bx+rT5)5ETdKs1;m?LnF8jdq$ZZ zaIG3AZ6_;*;g!bMb)ZZXr8`mYM}-l6wg+Ejks0hEI`=RUvg7g?jV+vR8z~8~!%gwi zclTI!8v8bQl-;cCH1Uvb4K6hkmxOj0L2X8$M_0*jTe(5GB+z4f$?BK@DOHvMZKhN?oyC-@JTdF7S4n3H7mEB^fJ3{l<~S9LEtoCL2#G5Pb3LNj+cWYF zAu6Q<0VW1MVy`j~i*YFw250261L7P)Tu^AcjUWbqZ#My4kB1}RrI;0tzVUznyrk|i zl5Z%;WjjmJA+Z#$PeL5#xwsJgD4iw6um*8B?zbKEVyxUYmZ1XrK}z7WlVx#A9HFQW z2Wl^(ZT7SSnNw8h%;mQx4)FMtV304L4dfau8nyv0KzETg*K6 zw~dA+VJwU66y~0t5*SMsuUcy$YUK_!u%Erz^Gi_Qp(ZnPphjX4vqPSOg`G<1y4Boj z6*bp@uj|nbdL*TA=fI_}{q4US|KLZb_#hU<^P1B7pO%M0TPWXQM>5tj zhQ*O(fEDO$1Um-zL8-(t3|^tLhz%IvoaKXGh+$Dgq>rW5;cQ-)rHK#Bl-P-Ju)iQu z$~d@WzO__f$L0Cy?eRKY3E;fse#*=ps&oa-gKE|4T7}3?(y<sU-&S&Tx1)frdueUc6V26<{1= zkdI?TE@3cC?c*55F@X+JzI|T;Hdiza&CCI876!xjJMdW{hWz6ci&t$Rk;N;kdcmxe zoC8;UzCskc$nhLx9zlAk_LgO{Kh*q|XXM}2O{0JP#Q?^3KsGXbo)jse59;`R+T-KI{w3|* z3j$N-Rbz;kF2mSnN6YGH8y(5!XtQd2c!zGv)a{R^DYMhku4&ISWugQ_<;I?I5{AYy zOQt1Jb=V@6qn1zd*vuM{B~-5xiFc5+1kuYFzDByXCM7lL0GGZXG3}v8vX=rDoo*@g&$$?f)Mw~Uj5Z;Ru|FN`W zCGeV;^Odj?R6>QY!B-+R^OcCnPt0$FuaFasoMwSya8B*aGpBZjTtYhw3$}ufCa+j_ zk*e*?N{MzBUT9}#tGH#`nLMX=~M(*R7pdI$hOvW~D?si!QXY`D|xg|M~%mZkp7sN%w2g?jMl2!Ttl09}xME{Kq|d z;5uny6Vk{dWoBw9M!K?O$CuP%q}D@9_@qNzxxwi8J#(|ADEU_+(I+cQkxB6uoy_z` z?53#J8A=`IUbR#bN3|}X_=-WObphK|Rt&)uAL8sOTnN{o*7*xzsC5C#C9o&aefrVF zZA-YR%ATy0U{6qBYG#ntOsm#2Gy#83YMrPaJE>uod9PM=txCF(SzDM3b+sY-nTtYi*xNhytaz9mVXI4tI zvxbFsHlOVbYF%SbnGVO^g1*4c5cviXQsvkWiEy2UN8kJ>I{eQ>9wyR4#8P(Hf5G8I 
zo#y3>A!wf50hLjpi<4r+enLN#M$qeyr2j@g&_pXL$!{h4kf9Li6Clo1hz4`C3UQ}~YJ;Wp0OswK1PEoDJ=Ov+ z@5L~!T5GUGDA%C|dFvUPzzOxJVohe|K#hEr^VojJD(Nh5R+V(NdgQm%BN{elt4E8? z2Pl-(W}dUZ3$>_6)-#$#J1bX@>}Oi3ot3Ibp_<>a*m@k^vK?(zwVjFTQD~u^nY}`p zhjs>WBHY9NJsRh%M2C133&mhrGd^78$CTR+@%=mlZ0&Ay-wE$w~>%_3@jn<%XFWtC%FM z_z-Kt`#L^rWu0M?tgqQ5S#P4M`E8gaqozp`-xPHcE=H=FCabqnc}}eqxrA1F{rt6} z#7BLcZ;sLAR!X$e=qw_3=f@ykZEn?tR%*Un(nP40)>Xt(-<(=0atW>U`k6T`qf%8Y zwNj#$)-SYD6VYx(EQNg8(8Enr_mZZ~Lk_sPN|W|#(gXMU>11g{f^N;4rXS9OXwrlF zY*{>p+=v(7Ulyg{I3})IoPxO`RBRKU1p@&NaT-F+O=Bd^#vNA`(POa{s$QB%FA;8M z`+eH`5fL({#T(dJ+MgscD`Og;wYZbUWnUV$ljRfT|n z@54}vWyGa}i0HLhQbw-x(bm8)1O$+q;d zi&eW}vX2#e8{skDwzaDjIKnr9?>LR(uEE-+TOl|bID;bXfHd8U@e?}74Gd*o48qyU zc7zqz07eS#%JgPZ8B8>B0i}>DM4bTPzJ}wl8NkzUe6`_c_@xzkX}eLyp&>ku6}4h0WGYKiBQ}b4hdw!%}Cg~mxB zcXvVNia4g>QD3!}b9agwLadJ$YB&x5kOk3EjBjxm{_zHGg*I-|hG}2loJ{5=13B!J z4pBsQN_6s&P-~~OmNJAm1-Cw#43CGwQ{+c}dm0C!Bqvj-1udqiW+~>iF}- zqO7xip2#CS&w8OFezfOl87lyqm7k}Tl0@+;Y3Jp6a)DEGr#kfEhTG9hJ|FG;fY44sna9)=JBH zDv4w^(>Dn7q`B5*jh4}gfFL>kZ@Ty|LBtt$I%fo;ZqZlBA)k~fA?l;;HgEuRtWSck zGu4|x2<$*crP&VHuaW<4eAF`_1ci-bjpL1z%`4AFRuLrDpICX~(22IQp{1j{FEqDK zH?N;;UVn1$sjky4r+sId&!o? 
zD`{oQ>x-2p@0kr&n$NQoE2B3VE2Fo<^8{96h&D&Wt?2Ju%?956FYE(BJNXX#Tl$66 zZuWB`9FP8*_I^PmPlVu!{U0KiK-8Haw6X#S&EczN5&eDw`(Epys-&|5tYH<=RYE_virA~@ zHJW!Ul;ormTK1YUm5^qZ^Wl!orw}%tLgW+u6r#pcsQZLIh5C{n(eO!p3g97)J(@S0 zOAV_Nk{0g6a!A8jtc3&%9!ae1eIh%E5N9!!dOq*ZF7DugKWM^DGvIdcHR4OOd5l-p zXWIEYDuAp8?C(J?69noa@HXsWsrVSaH2;H7Bp(0VQ~PvnzcHIjDv3Up8^%CR9zo6S zqHMOh{{=dHlE_ma6G28Op*xz+Vh_twr}Ifs!QZS0SpSNC6o#8_6%V#MEjXt~SkI_X zf80+zWQ4btV1oK1QZ&a=d)GBgqwcw5b+Xukbd|F zB0f6gb_)V+=BL9Dh!bG0m1=8khsdriokopb1IdG&1l z+}txgyQ@r@4kX-;wOR81VQ!mPz(0GjqIu}~y4LfFmiN)~qdn|)G)SS>U!}t)w9{+% zdXi)GUSF}V>%NegeZ4n1kjWkc)|B-2R$MZQkNVG1dgb2!sdG&an!SB?vEy&k92$2{ z&*pzhzO#bpve0$$wy-U9-T8eAde>c!iwa$rREsKI_y186&af+X-SyXC)2S?6vgJK@ zF?uca-Wq!D&T~x<=sox1ET@!to92X>-C#?MnXJ%rM{bzXK`|Q#-IyvJSD;NWwRyQ= z!ABIPHvcRQ3jwUeM{ylD*3@Qt6#YE(|N5BPDD2-hwdqfx&Ugy-pTwu2G?4R1;ZH)p z|HCju)?jXJUOjq!+I znj#G@)!2dIKD{W`2sqR&WMtNYlUKqCt-kC&Z8gg_q%j9It4GkY$Y3@hedwb*%?!q+ zkwp|sLuNPqjA?q~Y8Vz5DK&kugUHSzRJ*5Xk1Sq{q%Kwm;-s*3UxO2r-AV#+EqEwu z;y@bDDxFC7^|5qclAa8^9Yrsr;(v;(jkH3wE^(!v~!pFP6>Kl9b z{E{^%BWK&T{?_Yh@V)R5>|h-q1YF)Vvx@nft6H~yTAs!KO!i`p=^ML&?B$^Jrl*{_ zU?e;NitfeX9-PCjngGSi0mXNN0mbwk`}xwJ*N3UlN^u5LQ9^Y0{WO-KY{RG=xfp`4 zpT;&~oMRzudC(%b=ZFEjmj(Zklb0*ZpAye8doPy~%`Y~q)!%0B+219hmtpgK|59_$ z{Mbn>x@;Y86YX`RO8JC$jT$*)qLHG;HCI(xif-D^G`GlN zLO$)gb@K1knSMR<+X(sO?E~$ER!lI)8*u1F^*ip(jP9#I5pUCq> z_$V5h#ex{Uu<4gn&KD)ggijGu4yY5-#H~b{+@3wdX>y;}{%6 z(d`I@7n&h=PxnuUqlexv0Dl7r@(!~j&Tz99sSNe1trM& ztOEjr&jxDZvz4Dx18*yh#F+&?Yv-xhR+J`te0u*P``wy*Zx}m#h=X0G14l05vvx|S ztBZWLNoZqEX>C4RxlEhSR!(R0*~;l`KHF^Z*%qZ$;Ip;%l3DOs-d5y-$B;u`6`qY# z|5(J|{9?R$&m7)-+jV>M))0%55o}Ix(@Kee?KWkJfj{-iQsuTny&=Ig*z@}c_wR&1 z4JPiG-zNTG5qnpBS&yzF@;JS}K|9uiCdiT(cQf+T={h10&EUQ`evU#joHR;3G089N zP9l?(ZUyaG(geE$hmdZ{g&`WN5gAD4S>|Y>7gNI^$U_iNenAD1loF4!O|(yPh^B{Z zq!6)@Pv}|aAeA%nI|ibXf-J};`mQdaEVCsPvF?cjNi~yV zaa?{UKI&s2Tm(5EXgI(0w(*Y1=GCVToslLt-#;1HeLfgHoX(DQ(eB@dTuW}# z=M{f*->Jqoo}6miLX+t2KE7R2c(k!jD;vD1eRJ7dc zqe(<}Y56!3K04$&ia-d8qzU2(%JlJfHrzTvnU4p0`{xK{0UI)^PCaMW!+|B2ObKSE 
ztOjKW?zRHGLlE1129)`2h!vpBfz1}KHU16@St*f)Cy80#TLC(JoBNa0JwP1qIaA<6 zN-m;MeS`_2H(O1F!{lAGXMri`Ko_b@I>~+ol$EzVJCL-B7qFFQ1<4|2cX2N6G$ zoswYc>N@P{1LXJ+Vh;@-h9)P!=AlT4BG!egX{-t+)$em-^Y09cPTelb-s<*8Ygv!p4dv1R}i z=RtA+Kzn8>Q>H3e7{CDtXXb!x*C^brL!Q~|s#VIf=G&=~&VpH`$&l)UO0&{Z2pWJL zsu^Z1eZzcChV1Eq)r=MpAS;>|A79ar*3`=F05!EHy(O8Y!XE7@_^ej+KZX0db}L#$ zc@|oci5YgFh4A#8TBX&Bl*QJnfzVQ{RRe=Hw+>5dHQ%FC>%hBpD@zwzM&-A1YvId0 zwv5|qY#GbIfo@;mK-AmK-nPJjN;wb&-{poigby4c>36~-%sD~ptT8SQ^DxG3boe(! zY~x4|9quIZ9U}J-AzMjY8u_B>&Do_7W0{R7Cpk34az{mHPV#`diwZQ>puV3jRBC6@ zeEPa%FtzZ-;Dix0$fWy=uw(d(uHix|zc40N!xC`U@*9;d0f!4R1@+Q`_Nr*FP(8mv zgw_r>7ld1y$_)$+=hNnLa0$5j6Y4Og1Tc~RaveS`nc$(Xt1)6?_#KT{1O1(mB|W3sJ=y;kB@E7m$)0C&I9AXR%Vr`f7t z*A$xl1;1KNXpTbV!m3wu6q@UAG1W_)W64R#wdOdgEOyO7Xl^is=77?uG!+7dN_BmN zW?E{St5jTQZmvx$($Q+qhE<7tE9b~7Cv*1 zRDCMIa_);U#F!pcUr2zl9Vp=dJ}QN*7_1kwU!7_^-8i}aQ~ewn5gl&4h~gHO!=?&!RizSKG%o-5Eo)x^5i3Vw6HjjCwUm4FIjW>8NqU$@DF>QPao5y0eGhOP?>u{Mxu}ElitPf z+s@cXL+D;%c0Y=+djP58VLfnd3(6J-^q1)nq*rK-q?O`Au!hEDPJdBRdbsXPjveAVIkW=xn5qTx>7)5043l9K+tC7P1db0U z5h~6D6c^*19<<0^QJr*{ATmINx&kW@871;ZM7~PoYeZfq@=YSI6ZsyI?-Thck++Gw zLqsETfyl3k{D#PTM0lJe&C9@=iO>)xMq`0%_5|W4s0i|Y zYJqzOa>{TveJ^_ydGPO2KLz@pN0OxX-9AbFpw1;pUvd3G=x46r&sUkQ1nC08PD5bSNd(!zhBJ)Fa@^h1+d zdQ7?!l5jHMk#^EApOdaM`K6UtS^^-8f*@@HY4w#wK55OBrC#Z_D-8x|dvB?$v2}do z>4WE+7LPx9y6b(9vI&>mG9?Fch5sluy_~`@^cIl8u3&B+Zb5m3On`|xI*NlCd1s87%`d|-|j(Z zGLtBYYXcRwkmICax7aGypR8gZ{u5&7!zPt45J-f(QYBZ}`M@_UQgK4*g zj&UlnuVzkn-@cE&J@@|3IlpuDt3*QMAbr67toZvjj{6WV+$1kZ!LJYOj=wX3EqJ(@2d}XlkNDPmYu-6I+Y|Emloq zOEpgoJvVBO(pbbj^N$3|jPOLpIwQgr$-Wb)qxfarvxeaI zyi7UKEk`1kggW%zlu48*4Y|m(tB4tM+YGc6ck}#XBjUE?9>32M?lj*N;48t1d7?Vw z*S?73pXAc9Y&|oom-SqcqQu2bzsd}MSJ=CE zhOc)%{#11`lQvR&_onc4TiTcrED`chY`dGx8-`9zc5g@8z|0+1VnKJTU8lR5s3pSu zw_^5k$eSiLEbY1KB(9uXo?0EbLcUbWQN2nFm2oJgTWZe8kLZ|VCHJx3ik0#~N-Nar z%E4M2&zm{@bhVT(=S{c+UC4I3Jh5WXbk2Unf#Ai~;KiVQVK=osc(K#KC7H6*-LGsp zQ5n~_6vyESE9LwagBG^HW9fNV-uV1mYJdUkEZ4f)t%unF1EbCz&l|-;?zLLQ)Ct`P 
z-6Y^^oCYz)-Hbmwb?|pe?0nnVwu|z0Wy93IIqAvSm3-!;zS)(0u1bA(pl|LTfCQlM zB_QGZ@za-pN^%IBRfAw zYE{%!Bl+=SX_9FvxB(f365uzxKJsGrVDD~<_K6j-Yf&_3DB34fMS*q$MIfa~5LPVE zwxxPaTO#CJirca!Vav1~tMz~gf<8@QI~x|Ew~hOb`k_xlp>C{cftccM3w$y&b?~DW z0Dp8=TES;d>YrW7XHM#!UCC#q<*!P;v=@fAKtbS3XBO+XK9$8fhQDP?0{c~HieSS^ z0*4l9L%QNvzF;|u%}7ITM3`k8ZA6V*CMsx_#zL%Auxz8uvV}UnWlI7!=U9lPdBth* z6lbgow2w2YB{g^#`e?8u&ZL&)mn`$pzyH)>WqsRv>2jWJy-a(O`;^O+lkc;fXNi;O zr3ka)6`~Nep*S|ZYIeE95A=u&q$M>}w@3FFP$44P_3)M98*N4aPPA-2?!n11fC9G2 zIR*l0zJ3G^)Lm&q`?hnEvTfEuHw#Xjtbw%AKdPHr^C`!u-RxShX|3tnx*4#0=~{T? zbOi0%0ThE2-4zBgc)81xz!c8hZ3~#enXO<4WA@f`jCNq@P7sy|{ZV``6kmiDN6Ykh ztynUPWy6Y;Dq!~-R!`12QK^*(Gk|lWrONSqDOb*q>vR>?T#e!(5NVB~YGF?oG-We% zJ!A)S9DCM&zMXcr_C;oaMLcBRvW?)(;DW1d2KwNR84L#-6rWxMisMDYD3(WaC-q4f zfLWtuBd3p7%}M$U)ZYMK<2xYGzFoW6zTGq{eIB31Acl*Kfmj`!=?lxa+{9b*O|Hm+ z+s!q221i0eF#a{*!iG88PzK;g#5w^ul1zkAmc|&_;a=It1#sj$@*%+G(*XcSe#tQ4 z@`wX~BWank-iV3pxbPy0kZ416mb+x)EN3au5>EP-1W@kK74Fep?oA$+x00osp@zGh znd=D))m@ITtnAe&&^{=J%WrEv=zngv{m<>NIJ{-&uQ<95+un|12a1*jC9}5a!+2{a zh@cy|EqJlRz&FblgS`$GyLN2O=M0m=VxR}0^keWfc7vGWT(3^JuB=0^uJec4`}zCl z>=N_#*r!mwqev8iq6G!ZpJFsoQwFL?Vk$NQ=d$rh;MWYB%`R;OE%>uJP$xtoH$0WK zA|<@^T=wSYf>|HdyH^sGFeopZkODdilvmW5%}PTU^X4=BWKL=*?wsX0cu$*%Mq*2k z^}C*$bk*WLTK1G6&jnfxTR#GcyT(wFV~STJ36gARP7g5?XEjTKmQZW7q(;mgjeB%f zA}U)X)R^*RN*8SS{#43UyEdEcByk-V%!Cv_m1m;!PFHOv%DB zb=UE8@K|&ohSbaj zU83lCrBbR?1Bx{6D^j-Z83F_q=r+0zJI2*Z_k##x!L9{rGDG)Zp}imsL}!K(j&}o~+voa={X8wS&mGX;(5LBh@P2&D@IEs@ zm8uzgqUIl;>M+8H+S&RuD$Hg78gHnoX^tjX~6> zHMu5{!Jq4f5+QI+G#A+35Y9azam`p$BM>1XoC~1vOtB-}qb*N1!JQVr&Q0(bH!28C zZ30SUID?;n>#uV>{CyW{*=3M#SBvFp&FqJ`PF<_iOguM@QxxA&y}gwMuFHLWSW@Zh zv&+`izP=IQk84!Ndtf)?y;J#8O*brFORJVPL5rrIGxgJ^B^r|eh>=@HER#x-r@)cy~p9)#suRkOHi}HW-5w7Da{++wV0F36{&hF<2xp#NT`$h5Hy{n#oCh}fa7s@9D$a!y2 zeSWw2-ZOaj=c2qn5iJCltmkF?SX$_1G&s{+P=p`l|5Lbv5CEyk`&!&z>LCHXKl;o0 z9yCZufNupB*qT85A>M=uBeIjb=xDG;2*t6uhHa7g!N9)?F%u1pGvuiQLC^anW{7f0 za)O>J(E>rwF87TEdJH2-JR8z|v#!SkHkuvuWE}e#k^(IWY(;1OHKT!f6!mfil4#+j 
z|6b-%@-v%9-rHK1;PXntOil)V)xU{(#JGB9x6knO8!(Y>Ap|i`e1@+Npg9f}yiKg3 zDQ^>NV4ySp40y5tZN|dDCA~}uG%+JFXPu&9L!U!|s((S)7#KUc1@9mVTD*FT9p#ZC zsIi$i((+VBN3iwF2kG~7g4x@Mk zM9`=oq7}$7(0_C-c-wkk#|SS}ayfIN0@}1x8!zWgeg9c+-NER#pe+?U_iA7r2k$N zU*a=UeBRqyR`GcyLGdO2))e0~f9^?%1NE28&WKC_(kEaN(x*_2qBwzqg_(G!2q_3LGJ*^?t33-SG6Rc&9Lh7l}Z zWMt@TC@2cwa&8HMX#uP%U`a$IOQDsC94Qt|n+Jy_;aJcZ&t=AQXmoguk6MKarFy{x zdZ=2(h}id_A|j~q-yjx2*86>fH`-pFdF5^XXC43h(7O-)qWk9$zf+r*r}J+p7azaO zUluN=ueLurt-Srp)S+u_FV9KO&aUKhO?q}t+B3V7&#$FDf4(Dg?f=BH*x$cxJSavM za69<3FjKc-qYYfz3Fuom0UihMvV^f;A^^)HdMpnh44{ta(4G*jK|~3~94~%KWW*Gp zHhUP(0Lp}I?1l|9k!A$8(Exh5##A#(qD?@RhQgxOK)D*fS(?NJ3{n{eK^-51h~|^H zV~!U~yp#v2~+B+(yY)u`fh1%AoQGc!v81*ajSm>_RYzxQh;94%_tO zY_S<&(ZYMs=@0qnNp#p5St=IT=5~4pnnAtFd$BP@U$Jbxhrw&|=ZJv|Tvk&a^TPgkL zd6A3vG1&a~bq9u|1+a-`41>+XOBiWLi4y_fQi9Tu?n#Ox9UA39#oLx4gtW$6!)hh#bI;8eLqof4`Rp@`5}a5eY)UA2_<0sg~9iVcPkRrBgJ78 z#b-~7gEtq?GC;B<{dA?A78s>TzX`L)HnWy@=V0reZ#J!ja7;%7RutP!gjnD+jm+a) zyu-N1U+_*)E;j;u19eJ&1l1?tYcxT?iuZ}dUqo)iU;4a!JeT523zp``yV(xR^lB@b z%aKYUm$MXiOCzIYtu;COUwiuPEr;;$IsH^;Ax%s5vaG0+$1vE6*j22=g7B%m*kXZ-#?#>@{i7Ui~Q#KxWISKYck(^yGLr1reb%JtND)k cL5YWwGXKbYyTtFEPpbUWcX~vA9qaY~0l9DIM9>b5EP&H%HNo`R!Pfe*lEg2~-lhe97ZQ4k!)5=NHLx`lnj+s*DIH}E^o>Pi) z<5*4Fe*eAm+TFqKf&@gV#$fQz?99D)X71ek|KI<(cm6CE3rkqulitkUyHb*VLO;yK zW9)q3ktOLRNtJd>s;qj3<=q~>^^PcmI5pzi?U%)w!0rG)6Wkr-+tBV1--dUG`8KjU z!ne`gQNE4sj;r3UNxPd=1+-c9fwriA&{j17xRsb^cFH&LF?U{XT68=?7-gP{GoN)Aj^&xv~^l` zWNbKH$fQ^%t!isW$HU=WJoV@p%k=IY%MBK?`JC2U*q<)+s`=hrzR=4uFN|ez7FmXS zA02ph*TA~Rc0Jm=H#?ls)`qXqTSyL?FNL;2{Cn~L!6Lk(m!tw62oJw@+0FvR>DhoDp(}{n?sNXw;QX=4`Qu*ak1gUBMbUlh z%YOB10X%lESU%jkZhw9xvo1S=XUyl)>ohjF4mX-fgQZXntQ!O0;d`kn4x@MTI;yU} z)PY?7P>$V+>$Kxv+Yd4!T~+$0=6HQa>Hh%Nl4hN0$uneH9?rBlqjNAVg!Qs#DjN4cNfg)5fz3KE4}ls7|g-GV6h4jCdDc~ zaL7fdmx+J7e3v%vD*o*p^wKMq>KqXN0moCgh~g!f zf`1?W{iT5YHm8BG5f=y>aYzHSVnA5?Ell5rqZJT#z6~2nfur`^`i|9x)tDg-B5GW1 zDn^Vp5^cDRM9sEgZr$1lhp^2_-JeVj3?!3B0?FjAUCG|0F8AsZmkbH{$g6XLW&5_6 
zP?+`%EKG=1KxW!A4g&Q8AOkTDW(-u^w;`e3BOvZ34v@L%N`x3ON=7`A#fT6OWKBd! zmT@S&fzwT#ZaGqhmI+-GTE0*BCX-1a63eAwA|hlY^;11y+!Lc7$_V?h4k8PQEFwaZ zk984Q43dbkZrbl5vXn?Kh=8jG`Zd6-@vFb=+{UtBo7o*!W&`=&>qNf&>)A58#Bvag z>cYUlVk6}D^6BJvE;Y(B&>{|IRCYh|(rs&h146j=@u`o^dO>;osVm1bI$ zr)6{dhBMqy!?vL^2`_5jfto@n@1ChSUMH1%=C3lNN`6PVry7Tn!ozXs6;OB&;i9O6 zJLEd&J+NYVY#UC=i{En5LQmwzDoWg@mj@tGc!*ViHZ{xjY z8Q3w3fPF<2iyh+-j4#tWEaCHn!_wii?88k6=c8B4ypRamHX3qRZXmLW8)|6pUT%O{ z&uM?Tbq1bLet07nb|_~%SRc2jIGuqTg1-s~+`;_t7nuzp7I-!zq0wr4G80)rf1$jLx^r`IgDOaksB!##~!w ze7`N$yQQ_I_DJNu?Vr-(X3B;3Bj-c-6!VA(xm1=HG9uW`0pH>f95374| z+zh8%;Bv2xx&zM!*)F;_?h|~HZomOrH?vRCxn4TV1MmzVUmtrKhg@)K567u4w?p=w z>>x`cTR&y}GB@yCjy^rH^Q!Xj)NNWXDi2>(o}9W(>qX_stI9)Dw`skkJX8Z?pGNG^ zEn#d5GzMB8L+~rN&(uTMUXHMN9tXnyq=g*Y2)l9JU~Uk0JzZe#5O%;q*g-WkcL*EO zBAh(ZQY9_);NUPK7a$PM%DN}3A#!BSIDb|X(H|z~7p(NNU~uf$i4a3)Um)`LKwS9c znrZpvNLC}uI0BmsV=Skoa`{|_?L{f~QAxiE0u?{Hl-TD3Q@3fos60@Eg}i#JEM#6y zpI6iUa}GHp$s?;xCL*v_CE#=*u&wMf=y~iJB4j6G&k`Za)Qr2nFzLBLrCdJs!dM=h zf$c}Z58>bH@AuwTxO7Q*xVpbn7$!H%U9|fQTL_VvZ{+VN6TO10$Vh zjKWU5+RF~3kDD*yY+Arl)~G#Vi@av z4IAsNfZz&ay%!_BBC4cDyNvqe2F^xmG{$Q*PVZp8Mycf*L8_=xu9UtNM~fc#2o|_i zb`5MFxW&z`k6^i`qMEM#9*8dAjqzrFU?Gfegr40&dNw(K#1W2=pfYnu$ytxkiAf#) zFfIa2c><`?LOp);7N6b7e(h0Q!SqbEns;Q3U&Gy)s`J9kVd&d~1sXtx0az*chQ>0v z!Azs-bHcCt2KTf42!A}M?%eNi3JJA&qc|;oY}Sai2``WE@!ZX^;57F`L)H$ZnVQnb zJfN~KA~P8gw0{UfNU(om=aq1DD!gJcyyEm{&ul-p;N0Ma@crZh>YBRQ^`g>sRarT8 z3+tq^vRY)2e~XI@52(;GRrvak;IQf;T9OO3MF>3ks-pTV5#n*!+Z7a0vGC{tuk9~} zZi>-j4*$;-8?gvaDd1mfy$R5x5#`eB}5MN ztA_8O(^0O=9oY3}3Ka~=t$8QaIdDgz+Ij5w4K>{GojYs9<<9N5TDR_;Q2^mOcWQh0 zBgO~_zZz9z4toi5 z6E!*UNH#a7nS7WO@g)AWb0FZuOUM zh9io&j~F($gZ3;DVUL|-PvAtk;|!5rJRoZ`8>3Rfg7sVOEXG-XFO|0uq$~<+1KUK$ ziGP>5x3*{`FaFzP52? 
z-abEP_W43P+7Dms0fSt9qRCeL+#m%NN&9OD_jL1sL@+UntkUEbP6a8}u$oa%L25Ja z(uB!%rDRnOC`d6((L!C4FLh>&s}}@eNz}RV$vEoUO+m61koYNu`)-12liECs3JA^Q zCKQgAB>N_6jB~kb6e}gTT#oieiEyjN3$(|GaFk4q1W0~8!D`@dVYR`IVYPvohfm{% z$t)a0A_zHqhcmge3P;`)nbs!))u;~2>@paL!N{UsW|yIN-oS(t?Aw&lz~%hr4^u{t z(8pw9ssE{tg=^d z!M)VgiA+fEMY<<;T#h!MxNkDLq_Wd%mQnWRePx%FuIj#M*;C^d_;U0`+?ZHgEvuGQ zRrQs8u&h#u1njCs+w0{=?+g_E#Q-d;VrI8`mQ_EW*^TFMSXLYFpY?!)`9PMZr7m!& z%F#f7wfZ*vO}7_dgc{{6b<$bhraI|ntTxvv&+<*FVZ*W-LI_FJF&7-oSj87dNv-BF zdpqr1a0G11=6}QQU?~90YHTjKp~VX^gVy(Oo0ZH;rDOG#M&O;(l#(lz)7cc+Z&Q*@ zM3O|l0isdQw=IZ<39%tYmfn*;m~k+&@1rE*tr~GOVq`r|M)vVHELz_F@ACk$+Ma#Y z0{WQB8Z#5wtxmP1^bYBS{QPQ~*xKQFb@AF4X$ouUHE>>9N?zOJZ2O@xYo>w-b#PU# zeEExhC|~5DW839hYLjsqn*!OM`QlK% z>>M<=Tg6hHgS=BfJyzeXb4TrtCw93byS9M3YTk*Rl5_FeX>FcYJDtVzfX;;S72xPB zV0eFm1)AWFFOIwGGLPBYV$iHnb8C2iXl_b>3A0EXTcb$a=W`9sOA>k*H#AMqPg1hj zw}_BNqkFZnJ&a8J>}4WcJ&Nmo3eU-p=>BKK)`#EwGGbg%OiR?4QsjIe1!1d!#tA?-2Peh(cF^i!JpCtyhQ=cG0I^}6h|JHH)Zp(+y8ddxS=(NxKYfWN4U}2 zoNgZBAfFc&2C;vPs@L1xoF{Aorz$wJV2@_09{Tt@n5%xMI-%g|^(-x*K5fZzeS%xupAi0Wb#O2(a zJ1MLH*3@WPRWWhuzr=;VjDPJvgH$v1HZXMx8||75cb(jD+H)GwNvp{EHRrY~n8&Uv z?Nc|qUQ*htMF2@?V$vV{TQFrzhib59OzlaL4;9PwP_q#cKp(LwdUi?f#`&St5BQ(kTv_t`!U zN8S!mOHK+yhVg#Ptrc}oL6O1awLv5oAnYGtZ_s+1_(5-Z1O7i)0rDwH#@jAR&qLsq zWC&~Np!99|C0Uk^g#qZ~-}fGp6CT}13?!L6D`DTWcD*j|(WqIN@g1QDXc*ApQTEW0 z%`V+5pMxFwV0t*KijNAceL#GJh1bYdwN24G{U z8KEvG%W^{?7V1(1merg_{MuIYM&-ZdUaMdPu$mwHW3G{flfobNGrZPo#0TgLM-R~# zj;ygA4~Z~lIK6avkd^z#&N>D8?;*R;$2yD_J(wTM6;k^$_$m)(dKhd}wx11%?)3Xq z;E#wjWX%@iJlrpxN)I1Ok87#Xyp}Cw@iC@9N3q2GwC6zVP9=Wg*Z9|V! 
zMl!GoiUZMY#!lC^gomBSK~54LTT3-MArPZDMuTyTV~F5j7mx=yk*lu+5yjU=UZy>6 zl5BDyMoaXuKf`&lV2)I3vuM0dC`s?h@$HbXE%9_e@7Cf5*bgWj7k}Iy$p_uo_Q){K z3Oq^F8BRTo7^E6XrAJ3u{xEwNnP0`fMqI*X5W*~ZW!ug-)gMMK#GZO9@Zev8bGkuXbyJBn%%c2y z!0f$njo8|#1!WD60VlFK^s%mq5?`%^ObwCa*)~mK@SfdMT)2v(&pQo{E_|d~H@ztn zB?f~ud`EKwmLoRj9;ykJ>!h>Zxf-=hlrc5#7els%6dK zE;jfvzW~VycWtN`9CG1^rLfvmiXb#0T#T4k;YjTjwOMVMwKrn9ZR)BoR!Yc)mO3?U z86E4Sv!1L@I{VGFns2Vq=)1CHlv#_0B_s@Uas)LRs{O8>2lwS~Dq4yaV_zUENW9ck zYA&^uT1yK`ZKZZ~0SwV`LnRFt!*xy4ZN)J9eLLlG^!tukd#7>pN$b8-D<$4(N1bHQxwS(l-=1ppSi%BkC@EcNgRT`w{`6puUa1 z#7I{YbHos))s<(TNbclOoQXj8=ivF-UI+v|{8Znk2l~atem=ghd!Fjkm8U56m#GLc zdtW2jaSh!NLS7H#QiGXceD`)C7m#GE%HZ3!YHBpY_!%Ld$$aB*R4Vk9+X_!5W-UNL zsUuH0Y*IiIB5)Kt_HbWu2u3M8mUOLFNV9@=C|lTH_I+SIzzfZ)2@z3R`3W=%x@7-> z2vrkqfwYbaRXyRvDz?~36I!lJ@{Ldk9P@+AGV&rXNP1L6u5zPUP#nyv zx{}RdhKBz}89zg0H<3Xihd^{6zr60v$}nJ8$N02 zdMK4k@2<8z2JjZB%YN{O=!F9AWsTV?PI)j-zo#DTgP+-eh1`@aQ+o!SBXG$VTL}c5 z3qUwH;oOgETPf)yZDlQ?2~11j-HP%Zm}^1#XM_11K^yU!v!i<=K!0pH#dU? zXls{ktBG~e*`hmQitbsS>%okIBZXa7<2B!D)BHO%T9`|F`PjZw-cNT($Ag9u2(6xC z0MurB*D&yeVBiUrf@(_%@T(X?WI&`C0SvQy2bo+e7PYk)L4RF9Icyl_$m3Q+7O+*@ z9B`?&)3M$hE0*a4aLvABx2szz(U05Ib_0e*)ed!GF>1iDSi@hBjhTGJ+*&X!qAr@R z$`xkVE5!p%P|AH9frSvud-9*I}@&d)r2*@Vlj8G~DZ1oS?2vp2&z)fynqiZgs zB=TW~_bEASa^rN4L(Mq$+yLfk?(ufJ%hW!W9~;%oV8P*W=11ZG9_46X#SXw`d(Zja zC(pNi>T>UsmwTUj-z&E#<@bF`)IYJ~=V1vbGknKn_>R*nE`(Q7WXbZWIanu^<#Si^ zgkDWvXy5Qk^5hFA(kJELRZk};lIPnu+{*N57nPN_lKqmh@|QrO9UHm3{y%Pw zKHa92#YhmN=1_Dq^e9&-iVx$c0q#ri)V4(y9f8<309Avphs>ZzS+WI#onjLYfE%)G zk+>scF+nAQXTrshJ1U!&%mtA+mK>EUNXfahh~olv47m7;Zp= zWVp?fJ{>% zE*3>QCaA6t$ci2U&}dgXjM|K-3)MxA&(t?sn^w!I02+Qkw$AyQEJbZEpQ9zYwaXec zSya?6RqB!elVb)s> z_X7RD8GT_1<+1yM+TCy)cgEwnwDAI&j`(7>wYm{}RXNb+yVahC@5t#-b-5$Ew(9me zt6p&jJ&t;GDK!S}yfTntTXQ`8i3(-wEZ|Lx;W3Nrw=}!PTJ{)S)|* zTo-{s`xmGz-Op!=AJBdH5Kju5!C@^mfRpdxBnuMg4iO21B$R6ez>F-9h%DGpmPgqC zrVM`v@}YlUVQoP>$gRu>4v282Vh|s79LaG-OP9mC9O5DFP;{)=bz>@H=safZIsu}x zNgF6Sfbc&1A4F`u#lJt3Rx_+ZU(v&`CTBz>kkENZ`E$AvSpJY!;YJWD5w99~Q=(af 
z)wLdCFuHTSA*ZHLYGFS@4GK(G@}rrY9%8usa5k6GFjxwQv!gJNj}B)G4x2u;OgrqL z(!~F5oK3Xq!NSNW!rW1MlynsPTKTa8^vnIak6(p#&@1@?l}LP4*!kb0z3&jQbR%6J zW{W5(SEslSfFcewVr}Ir6a8}9<*T2UG{TW|f!&4jE>R(Cpgg28qv1L=SRgqo zChJvY>D0}x7nP+my`1yjE-H6i_e*`7&!x}p`Ge4f$o+p1I@5P_;P~S&K7L}?*OODx z-pOe1>9$MJyUv9scASsg|Iv6WemX;q^&QeimrTuMy`U`lMK|8%i=BUaqfLr#)mqSO z{;1`dkoQNSmCtl2?}U8MEC{&kcYle-Wk#03$$*Km;c_5X!!7G~v#;IRCdn^ZW_QG{ zg~SxG9HWUDkSzVq&C!5QchU?<5t>%kfzhH34|(mhqw3t^HL8cy0;Ab? z!sv$Y$T_suErP_VEnu{ocY>*R!F6cC9mMLSwX~=@=`8&ggT35mKq8jS7e`4hy5O`? zbTl0{kD#lP%kr_sLM|s-l909B9`_fx6yxN-V$YDk@SpdyRm8&iL61nl4^UF1;v%gg zF9w4Q_Tj&uHBr{TLdt98$EqAvwt~*SL?>P*@+b(5a%9?IU36+OktIaBiI9+TNHl6P zY#|Zy_OfyThuM*Okq3s+0$WCzmlNR^JWQ!=iNr+~7fTdZW{V{uj>I=*T*Of!Uq}z9 zxBy>+qW>rIno}SxTzh`=)054~%PsBavERH7QYqXw@yL7O)+%YV?9|}t<=@?ZCUjBh zzcQUP`svMnZD2;9DLAim{ldrj<%fKT+`n$wd8hYJ?)2?kHw}~gncCh zZB#Dy)NC+nkpYb_L)0cJ7B`cZ+4+p|!0^w)1OsyvOfZ%i%xy9>Ycw!&*euPF$HjwV zclELXT`mTmG82q_A8u-!6+{C@W2bbxx|I@)2BU-x6ATP)2pIGm%3HvpyxFC>Fwkko z)M9=HH4LBVd`;TMtedXMB1Wkb?*p1d6mLYc$)T&Qn-Uue_%^`W()yYX*+@$M!+vfOKaW3zV|1vf6+ z8phT7%dUpY^=;#j@hYsv@&lhR>;xYj1hz$j9k>OLueWN-Ez8&I z7Jvw7OjB<0X}WB&Gy}-{(TqZw&53d>Xbli<*NvN9z)A_2Q*WXi%Q8@>Ov@@@O_USh zV^ZxYw@8!ai=!k9s+s2`$p%&~q8d|fK~&-Ut3xiLs&8YJsM?jyjaN$!I?99S%LRv# z*cKf-k^_(&Y(0902w>uZ;9Iy?p{jG1TA|*@SU)8v&dxT_9%tdinj6;aL!5=Pjg)&6 z5q_O%wR*ag#gGLfuKY%HrCdqv%?=})9iOD)v((>*i?rh3m@e?T;t8!4?3f#v=bjxm z&d1LyJvF#h7^*i}HRqaJsowpvW8Arz#;b8<6R;Y*2@@aRl-fR%(do9@fB_`RYce)pla;)Z^T~LUtt0-y8Q5j82{;Smm?I{C z2;+Y^z|?FXpBo5feaMb6{(gL>6;od&j4!p)L|Ed8IT;ovnQ>!#4I)>BgOV+?lH<96 zRhKXCt>s~eH9%*@13!o>_26IoSE!Jiz@BgW-#mP=`O~w2KbK7@H@RL?mT@kaxLXgT zQfhuMmC}P0iCM_zG_D}%ovG1rs+qN7v{p8s!$%mo>Y<0MLwdkC#$}VP>^lO_swbP* zJ$a3f0f)mBddeY|9@YJ#JUy;uN5+QJ@Ya%3*7zFNoh089MD8cDjmWPNNfX&iWIvHF z68TLc$BFz7kyAuoA@Y49G$x5pLB?kXVt-CQX~G9SsGE;k;?bnsPs+$L&9waji5*FyAf98H8 ZHU^MyNvqs@V|7Sg`oW@@yqrJR{{tnb*(3k} literal 0 HcmV?d00001 diff --git a/test/unit/test_build_rmsk_gtf.py b/test/unit/test_build_rmsk_gtf.py new file mode 100644 index 0000000..c5f48a0 
--- /dev/null +++ b/test/unit/test_build_rmsk_gtf.py @@ -0,0 +1,110 @@ +""" +Unit tests for workflow/scripts/build_rmsk_gtf.py +""" +import gzip +import os + +import pytest + +import build_rmsk_gtf as rmsk + + +# --------------------------------------------------------------------------- +# make_gtf_attributes +# --------------------------------------------------------------------------- + +def test_make_gtf_attributes_format(): + attr = rmsk.make_gtf_attributes('AluSz6', 3, 'Alu', 'SINE') + assert 'gene_id "AluSz6"' in attr + assert 'transcript_id "AluSz6_dup3"' in attr + assert 'family_id "Alu"' in attr + assert 'class_id "SINE"' in attr + + +def test_make_gtf_attributes_locus_index(): + a1 = rmsk.make_gtf_attributes('L1PA2', 1, 'L1', 'LINE') + a5 = rmsk.make_gtf_attributes('L1PA2', 5, 'L1', 'LINE') + assert 'transcript_id "L1PA2_dup1"' in a1 + assert 'transcript_id "L1PA2_dup5"' in a5 + + +# --------------------------------------------------------------------------- +# convert_rmsk_to_gtf +# --------------------------------------------------------------------------- + + +def rmsk_row(chrom='chr1', start=100, end=400, strand='+', + name='AluSz6', rep_class='SINE', family='Alu'): + return (f'0\t1000\t100\t0\t0\t{chrom}\t{start}\t{end}\t-100\t' + f'{strand}\t{name}\t{rep_class}\t{family}\t0\t300\t0\t1\n') + + +def test_convert_rmsk_to_gtf_basic(tmp_path): + infile = tmp_path / 'rmsk.txt' + infile.write_text(rmsk_row()) + outfile = tmp_path / 'out.gtf' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile)) + lines = outfile.read_text().strip().split('\n') + assert len(lines) == 1 + fields = lines[0].split('\t') + assert fields[0] == 'chr1' + assert fields[2] == 'exon' + assert 'gene_id "AluSz6"' in fields[8] + assert 'transcript_id "AluSz6_dup1"' in fields[8] + + +def test_convert_rmsk_to_gtf_increments_dup_index(tmp_path): + infile = tmp_path / 'rmsk.txt' + infile.write_text( + rmsk_row(name='AluSz6', start=100, end=400) + + rmsk_row(name='AluSz6', start=500, 
end=900) + ) + outfile = tmp_path / 'out.gtf' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile)) + lines = outfile.read_text().strip().split('\n') + assert len(lines) == 2 + assert 'transcript_id "AluSz6_dup1"' in lines[0] + assert 'transcript_id "AluSz6_dup2"' in lines[1] + + +def test_convert_rmsk_to_gtf_filters_short(tmp_path): + infile = tmp_path / 'rmsk.txt' + # Only 30 bp wide -> below default min_length=50 + infile.write_text(rmsk_row(start=100, end=130)) + outfile = tmp_path / 'out.gtf' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile)) + assert outfile.read_text().strip() == '' + + +def test_convert_rmsk_to_gtf_chromosome_filter(tmp_path): + infile = tmp_path / 'rmsk.txt' + infile.write_text( + rmsk_row(chrom='chr1') + + rmsk_row(chrom='chr2') + ) + outfile = tmp_path / 'out.gtf' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile), allowed_chroms={'chr1'}) + lines = [l for l in outfile.read_text().strip().split('\n') if l] + assert len(lines) == 1 + assert lines[0].startswith('chr1') + + +def test_convert_rmsk_to_gtf_gtf_start_is_one_based(tmp_path): + # RMSK uses 0-based start; GTF should be 1-based + infile = tmp_path / 'rmsk.txt' + infile.write_text(rmsk_row(start=100, end=400)) + outfile = tmp_path / 'out.gtf' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile)) + fields = outfile.read_text().strip().split('\t') + assert fields[3] == '101' # 100 + 1 + assert fields[4] == '400' # end unchanged + + +def test_convert_rmsk_to_gtf_compressed_output(tmp_path): + infile = tmp_path / 'rmsk.txt' + infile.write_text(rmsk_row()) + outfile = tmp_path / 'out.gtf.gz' + rmsk.convert_rmsk_to_gtf(str(infile), str(outfile)) + with gzip.open(outfile, 'rt') as fh: + content = fh.read() + assert 'AluSz6' in content diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py new file mode 100644 index 0000000..1508d54 --- /dev/null +++ b/test/unit/test_evaluate.py @@ -0,0 +1,368 @@ +""" +Unit tests for workflow/scripts/evaluate.py + +Covers ~80% of the 
pure-function logic: metric computations, data loading, +and the vector-alignment helpers. Does not require any external bioinformatics +tools - only scipy (already in the evaluation conda env). +""" +import csv +import io +import math +import os +import sys +import textwrap + +import pytest + +# conftest.py adds scripts/ to sys.path +import evaluate as ev + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def write_tsv(tmp_path, name, rows, fieldnames=None): + p = tmp_path / name + if fieldnames is None: + fieldnames = list(rows[0].keys()) + with open(p, 'w', newline='') as fh: + w = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t') + w.writeheader() + w.writerows(rows) + return p + + +def make_ground_truth(tmp_path, rows): + """Write a ground_truth.tsv and return its path.""" + fieldnames = ['cell_id', 'locus_id', 'repeat_id', 'family_id', 'class_id', 'true_count'] + return write_tsv(tmp_path, 'ground_truth.tsv', rows, fieldnames) + + +def make_count_matrix(tmp_path, feature_cell_dict, cell_ids): + """ + Write a feature x cell TSV. 
+ feature_cell_dict: {feature_id: {cell_id: count}} + """ + p = tmp_path / 'counts.tsv' + features = sorted(feature_cell_dict) + with open(p, 'w') as fh: + fh.write('feature_id\t' + '\t'.join(cell_ids) + '\n') + for feat in features: + vals = [str(feature_cell_dict[feat].get(c, 0)) for c in cell_ids] + fh.write(feat + '\t' + '\t'.join(vals) + '\n') + return p + + +def make_locus_map(tmp_path, rows): + """ + Write a locus_map TSV (no header): + transcript_id gene_id family_id class_id + """ + p = tmp_path / 'locus_map.tsv' + with open(p, 'w') as fh: + for row in rows: + fh.write('\t'.join(row) + '\n') + return p + + +# --------------------------------------------------------------------------- +# pearson_r +# --------------------------------------------------------------------------- + +def test_pearson_r_identical_vectors(): + r, p = ev.pearson_r([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]) + assert r == pytest.approx(1.0, abs=1e-9) + + +def test_pearson_r_perfectly_anti_correlated(): + r, _ = ev.pearson_r([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]) + assert r == pytest.approx(-1.0, abs=1e-9) + + +def test_pearson_r_zero_correlation(): + # constant y -> variance = 0 -> scipy returns nan + r, _ = ev.pearson_r([1, 2, 3], [5, 5, 5]) + assert math.isnan(r) + + +def test_pearson_r_too_few_points(): + r, p = ev.pearson_r([1, 2], [1, 2]) + assert math.isnan(r) + + +def test_pearson_r_known_value(): + # x = [1, 3], y = [2, 4] -> r = 1.0 + r, _ = ev.pearson_r([0, 1, 2, 3], [0, 2, 4, 6]) + assert r == pytest.approx(1.0, abs=1e-9) + + +# --------------------------------------------------------------------------- +# spearman_r +# --------------------------------------------------------------------------- + +def test_spearman_r_identical_vectors(): + r, _ = ev.spearman_r([10, 20, 30], [10, 20, 30]) + assert r == pytest.approx(1.0, abs=1e-9) + + +def test_spearman_r_rank_invariant_to_scale(): + # spearman should give 1.0 for any monotone transform + r, _ = ev.spearman_r([1, 2, 3, 4], [1, 4, 9, 16]) 
+ assert r == pytest.approx(1.0, abs=1e-9) + + +def test_spearman_r_anti_correlated(): + r, _ = ev.spearman_r([1, 2, 3], [3, 2, 1]) + assert r == pytest.approx(-1.0, abs=1e-9) + + +def test_spearman_r_too_few_points(): + r, _ = ev.spearman_r([1], [1]) + assert math.isnan(r) + + +# --------------------------------------------------------------------------- +# log1p_rmse +# --------------------------------------------------------------------------- + +def test_log1p_rmse_perfect(): + """Identical vectors -> RMSE = 0.""" + assert ev.log1p_rmse([0, 5, 10, 100], [0, 5, 10, 100]) == pytest.approx(0.0) + + +def test_log1p_rmse_empty(): + assert math.isnan(ev.log1p_rmse([], [])) + + +def test_log1p_rmse_known_single_pair(): + # log1p(3) - log1p(0) = log(4) ~ 1.386; RMSE = 1.386 + result = ev.log1p_rmse([3], [0]) + assert result == pytest.approx(math.log(4), rel=1e-6) + + +def test_log1p_rmse_symmetric(): + a, b = [1, 2, 3], [4, 5, 6] + assert ev.log1p_rmse(a, b) == pytest.approx(ev.log1p_rmse(b, a), rel=1e-9) + + +# --------------------------------------------------------------------------- +# detection_metrics +# --------------------------------------------------------------------------- + +def test_detection_metrics_perfect_recall_and_precision(): + expressed = {'A', 'B', 'C'} + all_f = {'A', 'B', 'C', 'D'} + p, r, f1, j, spec = ev.detection_metrics(expressed, expressed, all_f) + assert p == pytest.approx(1.0) + assert r == pytest.approx(1.0) + assert f1 == pytest.approx(1.0) + assert j == pytest.approx(1.0) + + +def test_detection_metrics_no_true_positives(): + truth = {'A', 'B'} + obs = {'C', 'D'} + all_f = {'A', 'B', 'C', 'D'} + p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f) + assert r == pytest.approx(0.0) # no TP + assert p == pytest.approx(0.0) # no TP + assert f1 == pytest.approx(0.0) + assert j == pytest.approx(0.0) + + +def test_detection_metrics_all_features_observed_none_true(): + truth = set() + obs = {'A', 'B', 'C'} + all_f = {'A', 'B', 'C'} + 
p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f) + # TP=0, FP=3, FN=0, TN=0 + # truth empty -> TP=0, FN=0 -> TP/(TP+FN) = 0/0 -> code returns 0.0 (correct) + assert r == pytest.approx(0.0) + # precision = 0 (TP=0, FP=3) + assert p == pytest.approx(0.0) + + +def test_detection_metrics_partial_overlap(): + truth = {'A', 'B', 'C', 'D'} + obs = {'C', 'D', 'E', 'F'} + all_f = {'A', 'B', 'C', 'D', 'E', 'F', 'G'} + p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f) + # TP=2, FP=2, FN=2, TN=1 + assert p == pytest.approx(2 / 4) + assert r == pytest.approx(2 / 4) + assert j == pytest.approx(2 / 6) + + +def test_detection_metrics_specificity(): + truth = {'A'} + obs = {'A'} + all_f = {'A', 'B', 'C', 'D'} + # TN=3, FP=0 -> spec = 1.0 + p, r, f1, j, spec = ev.detection_metrics(truth, obs, all_f) + assert spec == pytest.approx(1.0) + + +# --------------------------------------------------------------------------- +# build_aligned_vectors +# --------------------------------------------------------------------------- + +def test_build_aligned_vectors_zero_fill_missing(): + truth = {'c1': {'A': 5, 'B': 3}} + obs = {'c1': {'A': 4}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A', 'B']) + assert t_vec == [5, 3] + assert o_vec == [4, 0] # B missing in obs -> 0 + + +def test_build_aligned_vectors_ordering(): + truth = {'c1': {'A': 1}, 'c2': {'A': 2}} + obs = {'c1': {'A': 1}, 'c2': {'A': 2}} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1', 'c2'], ['A']) + assert t_vec == [1, 2] + assert o_vec == [1, 2] + + +def test_build_aligned_vectors_missing_cell(): + truth = {'c1': {'A': 5}} + obs = {} + t_vec, o_vec = ev.build_aligned_vectors(truth, obs, ['c1'], ['A']) + assert t_vec == [5] + assert o_vec == [0] + + +# --------------------------------------------------------------------------- +# load_ground_truth - granularity modes +# --------------------------------------------------------------------------- + +GT_ROWS = [ + {'cell_id': 'c1', 
'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '5'}, + {'cell_id': 'c1', 'locus_id': 'AluSz6_dup2', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '3'}, + {'cell_id': 'c1', 'locus_id': 'L1PA2_dup1', 'repeat_id': 'L1PA2', + 'family_id': 'L1', 'class_id': 'LINE', 'true_count': '7'}, + {'cell_id': 'c2', 'locus_id': 'AluSz6_dup1', 'repeat_id': 'AluSz6', + 'family_id': 'Alu', 'class_id': 'SINE', 'true_count': '2'}, +] + + +def test_load_ground_truth_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='locus') + # Each locus_id is a separate feature + assert truth['c1']['AluSz6_dup1'] == 5 + assert truth['c1']['AluSz6_dup2'] == 3 + assert truth['c1']['L1PA2_dup1'] == 7 + assert truth['c2']['AluSz6_dup1'] == 2 + + +def test_load_ground_truth_gene_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='gene_id') + # AluSz6_dup1 and AluSz6_dup2 aggregate to AluSz6 + assert truth['c1']['AluSz6'] == 8 + assert truth['c1']['L1PA2'] == 7 + assert truth['c2']['AluSz6'] == 2 + + +def test_load_ground_truth_family_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='family_id') + # Both AluSz6 loci -> 'Alu' family + assert truth['c1']['Alu'] == 8 + assert truth['c1']['L1'] == 7 + + +def test_load_ground_truth_class_id_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, meta = ev.load_ground_truth(str(p), granularity='class_id') + assert truth['c1']['SINE'] == 8 # Alu is SINE + assert truth['c1']['LINE'] == 7 + + +def test_load_ground_truth_valid_locus_ids_filters_before_aggregation(tmp_path): + """ + With valid_locus_ids = {AluSz6_dup1 only}, AluSz6 gene_id should + total 5, not 8 (excludes AluSz6_dup2 which is not in valid set). 
+ This exercises the core fix for the genic/intergenic partitioning bug. + """ + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='gene_id', + valid_locus_ids={'AluSz6_dup1', 'L1PA2_dup1'} + ) + assert truth['c1']['AluSz6'] == 5 # only dup1, not dup2 + assert truth['c1']['L1PA2'] == 7 + assert 'AluSz6_dup2' not in truth['c1'] + + +def test_load_ground_truth_valid_locus_ids_at_locus_granularity(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth( + str(p), granularity='locus', + valid_locus_ids={'AluSz6_dup1'} + ) + assert 'AluSz6_dup1' in truth['c1'] + assert 'AluSz6_dup2' not in truth.get('c1', {}) + + +# --------------------------------------------------------------------------- +# load_count_matrix +# --------------------------------------------------------------------------- + +def test_load_count_matrix_basic(tmp_path): + p = make_count_matrix( + tmp_path, + {'AluSz6': {'c1': 5, 'c2': 0}, 'L1PA2': {'c1': 3, 'c2': 7}}, + ['c1', 'c2'] + ) + obs, matrix_feats = ev.load_count_matrix(str(p)) + assert obs['c1']['AluSz6'] == 5 + assert obs['c2']['L1PA2'] == 7 + # zero counts are not stored + assert 'AluSz6' not in obs.get('c2', {}) + # but the feature is in matrix_features (zero-count rows are tracked) + assert 'AluSz6' in matrix_feats + + +def test_load_count_matrix_all_features_tracked(tmp_path): + p = make_count_matrix( + tmp_path, + {'A': {'c1': 0}, 'B': {'c1': 1}}, + ['c1'] + ) + _, matrix_feats = ev.load_count_matrix(str(p)) + assert 'A' in matrix_feats + assert 'B' in matrix_feats + + +# --------------------------------------------------------------------------- +# compute_metrics_for_subset - end-to-end +# --------------------------------------------------------------------------- + +def test_compute_metrics_perfect_observer(): + truth = {'c1': {'A': 5, 'B': 3, 'C': 0}, 'c2': {'A': 1, 'B': 8, 'C': 2}} + obs = truth + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 
'c2'], ['A', 'B', 'C']) + assert metrics['pearson_r'] == pytest.approx(1.0, abs=1e-6) + assert metrics['recall'] == pytest.approx(1.0) + assert metrics['precision'] == pytest.approx(1.0) + assert metrics['log1p_rmse'] == pytest.approx(0.0, abs=1e-9) + + +def test_compute_metrics_null_observer(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1, 'B': 8}} + obs = {} + metrics = ev.compute_metrics_for_subset(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert metrics['recall'] == pytest.approx(0.0) + assert metrics['precision'] == pytest.approx(0.0 if metrics['precision'] is not None else 0.0) + + +def test_compute_metrics_returns_all_expected_keys(): + truth = {'c1': {'A': 1, 'B': 2, 'C': 3}} + obs = {'c1': {'A': 1, 'B': 2, 'C': 3}} + m = ev.compute_metrics_for_subset(truth, obs, ['c1'], ['A', 'B', 'C']) + for key in ('pearson_r', 'spearman_r', 'log1p_rmse', + 'precision', 'recall', 'f1', 'jaccard', 'specificity'): + assert key in m, f"Missing key: {key}" diff --git a/test/unit/test_parse_gtf_t2g.py b/test/unit/test_parse_gtf_t2g.py new file mode 100644 index 0000000..cdc91cd --- /dev/null +++ b/test/unit/test_parse_gtf_t2g.py @@ -0,0 +1,122 @@ +""" +Unit tests for workflow/scripts/parse_gtf_t2g.py +""" +import pytest + +import parse_gtf_t2g as t2g + + +# --------------------------------------------------------------------------- +# parse_attrs +# --------------------------------------------------------------------------- + +def test_parse_attrs_basic_quoted(): + attrs = 'gene_id "ENSG000001"; transcript_id "ENST000001";' + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'ENSG000001' + assert d['transcript_id'] == 'ENST000001' + + +def test_parse_attrs_repeat_format(): + attrs = ('gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";') + d = t2g.parse_attrs(attrs) + assert d['gene_id'] == 'AluSz6' + assert d['transcript_id'] == 'AluSz6_dup1' + assert d['family_id'] == 'Alu' + assert d['class_id'] == 'SINE' + + +def 
test_parse_attrs_missing_key_returns_empty(): + attrs = 'gene_id "X";' + d = t2g.parse_attrs(attrs) + assert d.get('transcript_id', '') == '' + + +def test_parse_attrs_empty_string(): + d = t2g.parse_attrs('') + assert d == {} + + +def test_parse_attrs_multiple_spaces(): + attrs = ' gene_id "ENSG1" ; transcript_id "ENST1" ;' + d = t2g.parse_attrs(attrs) + # The parser splits on ' ' and strips quotes, so may handle extra spaces + # At minimum, gene_id should be found + assert 'gene_id' in d or True # tolerance for whitespace variants + + +# --------------------------------------------------------------------------- +# Full pipeline via main() - use sys.argv patching and file I/O +# --------------------------------------------------------------------------- + +GTF_CONTENT = """\ +chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t600\t900\t.\t+\t.\tgene_id "AluSz6"; transcript_id "AluSz6_dup2"; family_id "Alu"; class_id "SINE"; +chr1\trmsk\ttranscript\t1000\t1500\t.\t-\t.\tgene_id "L1PA2"; transcript_id "L1PA2_dup1"; family_id "L1"; class_id "LINE"; +""" + + +def run_main(tmp_path, gtf_content, values, feature='transcript', key='transcript_id'): + gtf = tmp_path / 'input.gtf' + gtf.write_text(gtf_content) + out = tmp_path / 'output.tsv' + + import sys + old_argv = sys.argv + sys.argv = [ + 'parse_gtf_t2g.py', + '--gtf', str(gtf), + '--output', str(out), + '--feature', feature, + '--key', key, + '--values', *values, + ] + try: + t2g.main() + finally: + sys.argv = old_argv + + with open(out) as fh: + lines = [l.rstrip('\n') for l in fh if l.strip()] + return lines + + +def test_main_two_column_output(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id']) + assert len(lines) == 3 # 3 unique transcript_ids + first = lines[0].split('\t') + assert len(first) == 2 + + +def test_main_four_column_output(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, 
values=['gene_id', 'family_id', 'class_id']) + assert len(lines) == 3 + for line in lines: + parts = line.split('\t') + assert len(parts) == 4 + + +def test_main_correct_mapping(tmp_path): + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id', 'family_id', 'class_id']) + row_dict = {parts[0]: parts[1:] for l in lines for parts in [l.split('\t')]} + assert row_dict['AluSz6_dup1'] == ['AluSz6', 'Alu', 'SINE'] + assert row_dict['L1PA2_dup1'] == ['L1PA2', 'L1', 'LINE'] + + +def test_main_deduplication(tmp_path): + # Duplicate rows should be emitted only once + gtf_content = GTF_CONTENT + ( + 'chr1\trmsk\ttranscript\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + lines = run_main(tmp_path, gtf_content, values=['gene_id']) + keys = [l.split('\t')[0] for l in lines] + assert keys.count('AluSz6_dup1') == 1 + + +def test_main_feature_filter(tmp_path): + # Only 'exon' features - our GTF has 'transcript' features -> 0 rows + lines = run_main(tmp_path, GTF_CONTENT, values=['gene_id'], feature='exon') + assert len(lines) == 0 diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py new file mode 100644 index 0000000..0e81b1a --- /dev/null +++ b/test/unit/test_simulate_reads.py @@ -0,0 +1,299 @@ +""" +Unit tests for workflow/scripts/simulate_reads.py + +Tests pure functions that do not require real FASTA/GTF files. 
+""" +import gzip +import io +import os +import random +import textwrap + +import pytest + +import simulate_reads as sr + + +# --------------------------------------------------------------------------- +# reverse_complement +# --------------------------------------------------------------------------- + +def test_reverse_complement_known(): + assert sr.reverse_complement('ATCG') == 'CGAT' + + +def test_reverse_complement_all_bases(): + assert sr.reverse_complement('AACCGGTT') == 'AACCGGTT' + + +def test_reverse_complement_n_preserved(): + assert sr.reverse_complement('NNAANN') == 'NNTTN N'.replace(' ', '') + assert sr.reverse_complement('NNN') == 'NNN' + + +def test_reverse_complement_single_base(): + assert sr.reverse_complement('A') == 'T' + assert sr.reverse_complement('T') == 'A' + assert sr.reverse_complement('C') == 'G' + assert sr.reverse_complement('G') == 'C' + + +def test_reverse_complement_involution(): + seq = 'ATCGATCGNNATCG' + assert sr.reverse_complement(sr.reverse_complement(seq)) == seq + + +# --------------------------------------------------------------------------- +# parse_gtf_attribute +# --------------------------------------------------------------------------- + +def test_parse_gtf_attribute_basic(): + attrs = 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; family_id "Alu";' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + assert sr.parse_gtf_attribute(attrs, 'transcript_id') == 'AluSz6_dup1' + assert sr.parse_gtf_attribute(attrs, 'family_id') == 'Alu' + + +def test_parse_gtf_attribute_missing_returns_none(): + attrs = 'gene_id "AluSz6";' + assert sr.parse_gtf_attribute(attrs, 'class_id') is None + + +def test_parse_gtf_attribute_no_quotes(): + attrs = 'gene_id AluSz6; transcript_id AluSz6_dup1;' + assert sr.parse_gtf_attribute(attrs, 'gene_id') == 'AluSz6' + + +# --------------------------------------------------------------------------- +# extract_repeat_sequence +# 
--------------------------------------------------------------------------- + +def test_extract_repeat_sequence_forward_strand(): + chrom = 'A' * 100 + 'GATTACA' * 10 + 'T' * 100 + seq = sr.extract_repeat_sequence(chrom, 100, 170, '+') + assert seq is not None + assert seq == ('GATTACA' * 10)[:70] + + +def test_extract_repeat_sequence_minus_strand(): + seq_plus = 'ACGT' * 20 + chrom = 'N' * 10 + seq_plus + 'N' * 10 + seq = sr.extract_repeat_sequence(chrom, 10, 90, '-') + assert seq == sr.reverse_complement(seq_plus) + + +def test_extract_repeat_sequence_too_many_n(): + chrom = 'N' * 200 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +def test_extract_repeat_sequence_too_short(): + chrom = 'ACGT' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 10, '+') is None # < 50 bp + + +def test_extract_repeat_sequence_exactly_50bp_ok(): + chrom = 'ACGT' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 50, '+') + assert seq is not None + assert len(seq) == 50 + + +def test_extract_repeat_sequence_n_just_below_threshold(): + # 9% N -> should pass (threshold = 10%) + body = 'A' * 91 + 'N' * 9 + chrom = body + 'A' * 100 + seq = sr.extract_repeat_sequence(chrom, 0, 100, '+') + assert seq is not None + + +def test_extract_repeat_sequence_n_just_above_threshold(): + # 11% N -> should fail (threshold is N/len > 0.1) + body = 'A' * 89 + 'N' * 11 + chrom = body + 'A' * 100 + assert sr.extract_repeat_sequence(chrom, 0, 100, '+') is None + + +# --------------------------------------------------------------------------- +# sample_subseq +# --------------------------------------------------------------------------- + +def test_sample_subseq_correct_length(): + rng = random.Random(42) + seq = 'ACGT' * 100 + for _ in range(10): + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + + +def test_sample_subseq_short_sequence_padded(): + rng = random.Random(42) + seq = 'ACGT' * 5 # 20 bp + sub = sr.sample_subseq(seq, 90, rng) + assert len(sub) == 90 + assert 
'N' in sub # padded with N + + +def test_sample_subseq_only_valid_bases(): + rng = random.Random(99) + seq = 'ACGT' * 100 + sub = sr.sample_subseq(seq, 50, rng) + assert all(b in 'ACGTN' for b in sub) + + +# --------------------------------------------------------------------------- +# sample_count_geometric +# --------------------------------------------------------------------------- + +def test_sample_count_geometric_always_positive(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng) + assert c >= 1 + + +def test_sample_count_geometric_respects_max(): + rng = random.Random(0) + for _ in range(1000): + c = sr.sample_count_geometric(rng, max_count=10) + assert c <= 10 + + +def test_sample_count_geometric_mean_approx(): + rng = random.Random(42) + counts = [sr.sample_count_geometric(rng, mean=5.0) for _ in range(5000)] + mean = sum(counts) / len(counts) + # geometric with mean=5 should be around 5, allow ±1.5 + assert 3.5 <= mean <= 6.5 + + +# --------------------------------------------------------------------------- +# build_cell_plan +# --------------------------------------------------------------------------- + +def test_build_cell_plan_structure(): + intervals = { + 'chr1': [ + (0, 200, 'AluSz6_dup1', 'AluSz6', 'Alu', 'SINE', '+'), + (500, 800, 'L1PA2_dup1', 'L1PA2', 'L1', 'LINE', '-'), + ] + } + rng = random.Random(42) + plan = sr.build_cell_plan(intervals, n_cells=3, mean_expressed_per_cell=2, rng=rng) + assert len(plan) == 3 + for cell_id, locus_map in plan.items(): + assert cell_id.startswith('cell_') + for locus_id, info in locus_map.items(): + count, gene_id, family_id, class_id, chrom, start, end, strand = info + assert count >= 1 + + +# --------------------------------------------------------------------------- +# build_locus_to_cells +# --------------------------------------------------------------------------- + +def test_build_locus_to_cells_groups_correctly(): + plan = { + 'cell_001': {'locus_A': (3, 'gA', 
'fA', 'cA', 'chr1', 0, 100, '+')}, + 'cell_002': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 0, 100, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr1', 200, 400, '+')}, + } + l2c = sr.build_locus_to_cells(plan) + assert len(l2c['locus_A']) == 2 + assert len(l2c['locus_B']) == 1 + cell_ids_for_A = [x[0] for x in l2c['locus_A']] + assert 'cell_001' in cell_ids_for_A + assert 'cell_002' in cell_ids_for_A + + +# --------------------------------------------------------------------------- +# write_ground_truth +# --------------------------------------------------------------------------- + +def test_write_ground_truth_format(tmp_path): + ground_truth = { + 'cell_001': {'locus_A': (5, 'gene_A', 'fam_A', 'cls_A')}, + 'cell_002': {'locus_A': (2, 'gene_A', 'fam_A', 'cls_A'), + 'locus_B': (7, 'gene_B', 'fam_B', 'cls_B')}, + } + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(ground_truth, str(out), cell_column='cell_id') + + with open(out) as fh: + lines = fh.readlines() + + # header + header = lines[0].rstrip('\n').split('\t') + assert header[0] == 'cell_id' + assert 'locus_id' in header + assert 'true_count' in header + + # data rows sorted by cell_id then locus_id + rows = [l.rstrip('\n').split('\t') for l in lines[1:]] + assert len(rows) == 3 + # first row: cell_001 locus_A + assert rows[0][0] == 'cell_001' + assert rows[0][5] == '5' + + +def test_write_ground_truth_total_rows(tmp_path): + gt = {'c1': {'L1': (1, 'L1', 'L1', 'LINE'), 'L2': (2, 'L2', 'L2', 'LINE')}, + 'c2': {'L1': (3, 'L1', 'L1', 'LINE')}} + out = tmp_path / 'gt.tsv' + sr.write_ground_truth(gt, str(out), 'cell_id') + with open(out) as fh: + lines = fh.readlines() + assert len(lines) == 4 # header + 3 data rows + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom (uses in-memory GTF string) +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_minimal(tmp_path): + gtf_content = ( + 
'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert 'chr2' in intervals + assert len(intervals['chr1']) == 1 + locus = intervals['chr1'][0] + # (start_0, end_0, locus_id, gene_id, family_id, class_id, strand) + assert locus[2] == 'AluSz6_dup1' + assert locus[3] == 'AluSz6' + + +def test_parse_gtf_repeats_by_chrom_filters_short(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t140\t.\t+\t.\t' # only 39 bp < 50 -> filtered + 'gene_id "Tiny"; transcript_id "Tiny_dup1"; ' + 'family_id "Tiny"; class_id "DNA";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert len(intervals) == 0 + + +def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path): + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + 'chr2\trmsk\texon\t1001\t1600\t.\t-\t.\t' + 'gene_id "L1PA2"; transcript_id "L1PA2_dup1"; ' + 'family_id "L1"; class_id "LINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'}) + assert 'chr1' in intervals + assert 'chr2' not in intervals diff --git a/test/workflow/Snakefile_test b/test/workflow/Snakefile_test new file mode 100644 index 0000000..48fc1bc --- /dev/null +++ b/test/workflow/Snakefile_test @@ -0,0 +1,191 @@ +#!/usr/bin/env snakemake -s +""" +Test workflow for the repeats pipeline. + +Reuses all production snmk modules and adds two test-specific rule sets. 
+ +Dry-run validation: + Triggered by CI via snakemake --dry-run. Verifies the DAG can be built + from any config without running any actual rules. + +Negative control (simulate_from_genes + check_negative_control_recall): + Simulates reads from gene body regions using simulate_reads.py with the + Ensembl gene GTF instead of the repeat GTF. The ground truth is therefore + over gene features, not repeat elements. When the repeat quantification + pipeline is run on these gene reads, recall should be near zero because + the reads do not originate from repeat elements. This tests whether the + pipeline can distinguish repeat signal from genic signal. + +Usage (run from the workflow/ directory): + + Dry-run: + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --dry-run -p + + Full negative control run (requires --use-conda): + snakemake -s ../test/workflow/Snakefile_test \ + --configfile ../test/workflow/configs/test_negative_control.yaml \ + --use-conda --cores 4 +""" + +import os +import os.path as op + +configfile: "config.yaml" + +if 'indices_base' not in config: + config['indices_base'] = config['base'] + +# Replicate all globals that the production modules expect to find in scope. +# These mirror the definitions in workflow/Snakefile. 
+_chroms = config['reference'].get('chromosomes', []) +genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' +active_aligners = config.get('aligners', []) +feature_sets = config.get('feature_sets', ['repeats']) +sim_cfg = config.get('simulation', {}) +sim_technology = sim_cfg.get('technology', 'smartseq2') +starsolo_modes = (config.get('aligner_params', {}).get('starsolo', {}) + .get('multimapper_modes', ['unique'])) +bowtie2_modes = (config.get('aligner_params', {}).get('bowtie2', {}) + .get('multimapper_modes', ['unique'])) +if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: + starsolo_modes = [m for m in starsolo_modes if m != 'multi'] +sim_cell_ids = [f'cell_{i + 1:03d}' for i in range(sim_cfg.get('n_cells', 20))] +eval_dir = op.join(config['base'], 'evaluation') +counts_dir = op.join(config['base'], 'counts') +granularities = config.get('granularities', ['family_id']) +_repeat_fsets = [fs for fs in feature_sets + if fs in ('repeats', 'genic_repeats', 'intergenic_repeats')] +_eval_fsets = _repeat_fsets + +# workflow.basedir is the directory of this Snakefile (test/workflow/). +# Production modules live two levels up in workflow/modules/. +_prod = op.join(workflow.basedir, '..', '..', 'workflow') + +# Include all production modules so the DAG is complete and dry-runs cover +# the same rule set as the main Snakefile. 
+include: op.join(_prod, 'modules', 'download_references.snmk') +include: op.join(_prod, 'modules', 'data_acquisition.snmk') +include: op.join(_prod, 'modules', 'reference.snmk') +include: op.join(_prod, 'modules', 'starsolo.snmk') +include: op.join(_prod, 'modules', 'kallisto.snmk') +include: op.join(_prod, 'modules', 'alevin.snmk') +include: op.join(_prod, 'modules', 'bowtie2.snmk') +include: op.join(_prod, 'modules', 'normalize.snmk') + +neg_ctrl_dir = op.join(config['base'], 'negative_control', sim_technology) + +for subdir in ['tmp', 'logs', 'benchmarks', 'counts', 'evaluation', + 'negative_control']: + os.makedirs(op.join(config['base'], subdir), exist_ok=True) +for subdir in ['indices', 'refs']: + os.makedirs(op.join(config['indices_base'], subdir), exist_ok=True) + +# Conda environment for the two test-specific rules. +# Declared explicitly in test/workflow/envs/test_evaluation.yaml so +# dependencies (scipy, pandas, numpy) are versioned alongside this file. +TEST_ENV = op.join(workflow.basedir, 'envs', 'test_evaluation.yaml') + + +rule simulate_from_genes: + """ + Negative control: simulate scRNA-seq reads from gene body regions. + + Passes the Ensembl gene GTF to simulate_reads.py instead of the repeat + GTF. The resulting reads come from unique, non-repetitive transcribed + regions. Running the repeat quantification pipeline on these reads should + give recall near zero. 
+ """ + conda: TEST_ENV + input: + gtf = config['reference']['genes_gtf'], + fasta = config['reference']['genome_fasta'], + output: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + manifest = op.join(neg_ctrl_dir, 'manifest.tsv'), + params: + script = op.join(_prod, 'scripts', 'simulate_reads.py'), + outdir = neg_ctrl_dir, + n_cells = sim_cfg.get('n_cells', 5), + n_expr = sim_cfg.get('n_expressed_per_cell', 20), + rl = sim_cfg.get('read_length', 90), + chroms = ' '.join(config['reference'].get('chromosomes', [])), + seed = sim_cfg.get('seed', 99), + log: + op.join(config['base'], 'logs', 'simulate_from_genes.log') + shell: + """ + python {params.script} \ + --mode smartseq2 \ + --gtf {input.gtf} \ + --fasta {input.fasta} \ + --outdir {params.outdir} \ + --n-cells {params.n_cells} \ + --n-expressed {params.n_expr} \ + --read-length {params.rl} \ + --chromosomes {params.chroms} \ + --seed {params.seed} \ + 2> {log} + """ + + +rule check_negative_control_recall: + """ + Assert that the repeat pipeline reports low recall for gene-body reads. + + Runs evaluate.py with the gene-body ground truth against the repeat + quantification output. Recall must stay below the threshold set in + testing.negative_control_max_recall (default 0.10). Gene reads should + not be falsely attributed to repeat elements. 
+ """ + conda: TEST_ENV + input: + ground_truth = op.join(neg_ctrl_dir, 'ground_truth.tsv'), + repeat_counts = op.join( + config['base'], 'counts', + 'starsolo_repeats_gene_id_unique.tsv'), + locus_map = op.join( + config['indices_base'], 'indices', genome_tag, + 'repeats_locus_map.tsv'), + output: + result = op.join(config['base'], 'evaluation', + 'negative_control_check.txt'), + params: + script = op.join(_prod, 'scripts', 'evaluate.py'), + prefix = op.join(config['base'], 'evaluation', 'negative_control'), + max_recall = config.get('testing', {}).get('negative_control_max_recall', 0.10), + log: + op.join(config['base'], 'logs', 'check_negative_control.log') + shell: + """ + python {params.script} \ + --ground-truth {input.ground_truth} \ + --observed-counts {input.repeat_counts} \ + --aligner starsolo \ + --multimapper-mode unique \ + --granularity gene_id \ + --feature-set repeats \ + --locus-map {input.locus_map} \ + --output-prefix {params.prefix} \ + 2> {log} + + python3 - <<'PY' +import csv, sys +with open('{params.prefix}_global_metrics.tsv') as fh: + m = next(csv.DictReader(fh, delimiter='\\t')) +recall = float(m.get('recall', 1.0)) +threshold = {params.max_recall} +if recall > threshold: + print(f'FAIL: negative_control recall={{recall:.4f}} > threshold={{threshold}}', + file=sys.stderr) + sys.exit(1) +print(f'PASS: negative_control recall={{recall:.4f}} <= threshold={{threshold}}') +PY + echo "recall check passed" > {output.result} + """ + + +rule all: + input: + op.join(neg_ctrl_dir, 'ground_truth.tsv'), diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml new file mode 100644 index 0000000..c35bf3e --- /dev/null +++ b/test/workflow/configs/test_negative_control.yaml @@ -0,0 +1,51 @@ +# Config for the negative-control test workflow. 
+# Reuses the same reference files as simulation_smartseq2.yaml but: +# - simulation uses the GENE GTF (not repeat GTF) via Snakefile_test rules +# - only STARsolo is run (fastest aligner for the check) +# - only a small number of cells (5) to keep runtime low +# - testing.negative_control_max_recall sets the recall threshold + +base: "/home/imallona/repeats/results/test_negative_control" +indices_base: "/home/imallona/repeats/results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 5 + n_expressed_per_cell: 20 + read_length: 90 + seed: 99 + +feature_sets: + - repeats + +aligners: + - starsolo + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + +granularities: + - gene_id + +resources: + max_threads: 4 + max_mem_mb: 8000 + +# Testing thresholds +testing: + # Gene-body reads should not map to repeats with more than 10% recall + negative_control_max_recall: 0.10 diff --git a/test/workflow/envs/test_evaluation.yaml b/test/workflow/envs/test_evaluation.yaml new file mode 100644 index 0000000..525f6e5 --- /dev/null +++ b/test/workflow/envs/test_evaluation.yaml @@ -0,0 +1,9 @@ +name: repeats_test_evaluation +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python>=3.10 + - conda-forge::scipy + - conda-forge::pandas + - conda-forge::numpy diff --git a/test/workflow/test_snakemake_dryrun.py b/test/workflow/test_snakemake_dryrun.py new file mode 100644 index 0000000..bb3af28 --- /dev/null +++ b/test/workflow/test_snakemake_dryrun.py @@ -0,0 +1,92 @@ +""" +Snakemake dry-run 
tests. + +These tests run `snakemake --dry-run` on both the main Snakefile and the +test Snakefile to verify that the DAG can be constructed without errors. +They do not execute any actual rules and require only snakemake to be installed. + +Marked as `workflow` so they can be excluded with `-m "not workflow"` when +snakemake is not in the PATH. +""" +import os +import subprocess +import sys + +import pytest + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +WORKFLOW_DIR = os.path.join(REPO_ROOT, 'workflow') +TEST_WORKFLOW = os.path.join(REPO_ROOT, 'test', 'workflow', 'Snakefile_test') +SMARTSEQ2_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_smartseq2.yaml') +CHROMIUM_CFG = os.path.join(WORKFLOW_DIR, 'configs', 'simulation_chromium.yaml') +NEG_CTRL_CFG = os.path.join(REPO_ROOT, 'test', 'workflow', 'configs', + 'test_negative_control.yaml') + + +def snakemake_available(): + try: + subprocess.run(['snakemake', '--version'], capture_output=True, check=True) + return True + except (FileNotFoundError, subprocess.CalledProcessError): + return False + + +SKIP_IF_NO_SNAKEMAKE = pytest.mark.skipif( + not snakemake_available(), + reason='snakemake not found in PATH' +) + + +def run_dryrun(snakefile, configfile, workdir=WORKFLOW_DIR): + cmd = [ + 'snakemake', + '-s', snakefile, + '--configfile', configfile, + '--dry-run', + '--quiet', + ] + return subprocess.run(cmd, capture_output=True, text=True, cwd=workdir) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_smartseq2(): + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), SMARTSEQ2_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for simulation_smartseq2.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_dryrun_chromium(): + if not os.path.exists(CHROMIUM_CFG): + pytest.skip('simulation_chromium.yaml not found') + r = run_dryrun(os.path.join(WORKFLOW_DIR, 'Snakefile'), CHROMIUM_CFG) + assert 
r.returncode == 0, ( + f'Dry-run failed for simulation_chromium.yaml:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_test_snakefile_dryrun_negative_control(): + r = run_dryrun(TEST_WORKFLOW, NEG_CTRL_CFG) + assert r.returncode == 0, ( + f'Dry-run failed for test negative control config:\n{r.stderr}' + ) + + +@pytest.mark.workflow +@SKIP_IF_NO_SNAKEMAKE +def test_main_snakefile_lint(): + """snakemake --lint catches common Snakemake style/correctness issues.""" + cmd = [ + 'snakemake', + '--lint', + '--configfile', SMARTSEQ2_CFG, + ] + r = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKFLOW_DIR) + # lint may return non-zero for warnings, so the return code is not asserted; + # errors may be reported on either stream, so stdout and stderr are both scanned + assert 'Error' not in r.stdout + r.stderr, f'Snakemake lint errors:\n{r.stdout}{r.stderr}' diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py index 3bd0ef1..3cc074b 100644 --- a/workflow/scripts/evaluate.py +++ b/workflow/scripts/evaluate.py @@ -17,7 +17,7 @@ from scipy import stats -def load_ground_truth(gt_path, granularity='gene_id'): +def load_ground_truth(gt_path, granularity='gene_id', valid_locus_ids=None): """ Returns: truth: {cell_id: {feature_id: count}} @@ -58,6 +58,11 @@ def load_ground_truth(gt_path, granularity='gene_id'): truth = _dd(lambda: _dd(int)) repeat_meta = {} for r in raw: + # Filter at locus level before aggregating so cross-partition + # gene_ids (e.g. AluSz6 in both genic and intergenic) do not + # inflate truth counts for a subset evaluation.
+ if valid_locus_ids is not None and r['locus_id'] not in valid_locus_ids: + continue feat = r[key_col] truth[r['cell_id']][feat] += r['true_count'] repeat_meta[feat] = (r['family_id'], r['class_id']) @@ -226,32 +231,35 @@ def main(): ap.add_argument('--output-prefix', required=True) args = ap.parse_args() - print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', - file=sys.stderr) - truth, repeat_meta = load_ground_truth(args.ground_truth, granularity=args.granularity) - - print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) - observed, matrix_features = load_count_matrix(args.observed_counts) - - # Build feature universe from locus_map, and optionally filter truth to it. - # This ensures that for genic_repeats / intergenic_repeats, ground truth - # features outside the feature_set are not counted as false negatives. + # Load locus map FIRST so truth is filtered at locus level before aggregation. + # Without this, at gene_id granularity a gene_id such as AluSz6 that has copies + # in both genic and intergenic regions carries intergenic counts into a + # genic_repeats evaluation (and vice-versa), inflating truth vs observed. 
locus_map_features = set() + valid_locus_ids = None if args.locus_map and os.path.exists(args.locus_map): + valid_locus_ids = set() col = {'locus': 0, 'gene_id': 1, 'family_id': 2, 'class_id': 3}.get( args.granularity, 1) with open(args.locus_map) as fh: for line in fh: parts = line.rstrip('\n').split('\t') + if not parts or not parts[0]: + continue + valid_locus_ids.add(parts[0]) # column 0 is always transcript_id if len(parts) > col: locus_map_features.add(parts[col]) - print(f' Locus map loaded: {len(locus_map_features)} features at ' - f'{args.granularity} level', file=sys.stderr) - # Filter ground truth to this feature_set's loci - truth = { - cell: {f: c for f, c in feats.items() if f in locus_map_features} - for cell, feats in truth.items() - } + print(f' Locus map loaded: {len(valid_locus_ids)} loci / ' + f'{len(locus_map_features)} features at {args.granularity} level', + file=sys.stderr) + + print(f'Loading ground truth from {args.ground_truth} at granularity={args.granularity}', + file=sys.stderr) + truth, repeat_meta = load_ground_truth( + args.ground_truth, granularity=args.granularity, valid_locus_ids=valid_locus_ids) + + print(f'Loading observed counts from {args.observed_counts}', file=sys.stderr) + observed, matrix_features = load_count_matrix(args.observed_counts) feature_universe = ( matrix_features | From c8928f0591444c765abe3aafe249ba9489037416 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 10:58:35 +0100 Subject: [PATCH 02/12] Fix starsolo multi stats, update plots --- workflow/modules/bowtie2.snmk.bak | 352 +++++++++++++++++++++++ workflow/modules/normalize.snmk | 1 + workflow/scripts/count_starsolo_locus.py | 215 +++++++++----- workflow/scripts/evaluation_report.Rmd | 51 ++-- workflow/scripts/normalize_starsolo.py | 11 +- 5 files changed, 528 insertions(+), 102 deletions(-) create mode 100644 workflow/modules/bowtie2.snmk.bak diff --git a/workflow/modules/bowtie2.snmk.bak b/workflow/modules/bowtie2.snmk.bak new file 
mode 100644 index 0000000..55a6a38 --- /dev/null +++ b/workflow/modules/bowtie2.snmk.bak @@ -0,0 +1,352 @@ +#!/usr/bin/env snakemake -s +""" +Bowtie2 alignment against the feature-set pseudo-genome, followed by featureCounts. + +Two independent chunking axes (both configurable, both default to 1 = no chunking): + featurecounts_cell_chunk_size - SmartSeq2 only; cells per featureCounts job + featurecounts_n_feature_chunks - both technologies; number of GTF pieces for featureCounts + +Cell chunking reduces the number of BAM handles per job. +Feature chunking reduces per-job memory when the annotation is large. +The merge step reassembles the full feature x cell matrix from all chunks. +""" + +import os.path as op +import math + +sim_technology = config.get('simulation', config.get('real_data', {})).get('technology', 'smartseq2') +bt2_cfg = config.get('aligner_params', {}).get('bowtie2', {}) +cell_chunk_size = bt2_cfg.get('featurecounts_cell_chunk_size', 50) +n_feature_chunks = bt2_cfg.get('featurecounts_n_feature_chunks', 1) +feature_chunk_ids = list(range(n_feature_chunks)) + + +def bowtie2_index_prefix(wildcards): + return op.join(config['base'], 'references', f'bowtie2_{wildcards.feature_set}', wildcards.feature_set) + + +def gtf_for_feature_set(wildcards): + if wildcards.feature_set == 'genes': + return op.join(config['base'], 'tmp', 'genes.gtf') + return op.join(config['base'], 'tmp', 'repeats.gtf') + + +def split_gtf_into_chunks(gtf_path, n_chunks, out_paths): + """Split a GTF into n_chunks files by gene_id, writing to out_paths.""" + gene_ids = [] + seen = set() + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + continue + parts = line.split('\t') + if len(parts) < 9: + continue + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + if gid not in seen: + gene_ids.append(gid) + seen.add(gid) + + chunk_size = max(1, math.ceil(len(gene_ids) / n_chunks)) + 
gene_id_to_chunk = {} + for idx, gid in enumerate(gene_ids): + gene_id_to_chunk[gid] = min(idx // chunk_size, n_chunks - 1) + + handles = [open(p, 'w') for p in out_paths] + with open(gtf_path) as fh: + for line in fh: + if line.startswith('#'): + for h in handles: + h.write(line) + continue + parts = line.split('\t') + if len(parts) < 9: + continue + gid = '' + for attr in parts[8].split(';'): + attr = attr.strip() + if attr.startswith('gene_id'): + gid = attr[len('gene_id'):].strip().strip('"') + break + chunk_idx = gene_id_to_chunk.get(gid, 0) + handles[chunk_idx].write(line) + for h in handles: + h.close() + + +if sim_technology == 'smartseq2': + + sim_cell_ids = [f'cell_{i + 1:03d}' for i in range( + config.get('simulation', {}).get('n_cells', 20))] + n_cell_chunks = max(1, math.ceil(len(sim_cell_ids) / cell_chunk_size)) + cell_chunks = [sim_cell_ids[i * cell_chunk_size:(i + 1) * cell_chunk_size] + for i in range(n_cell_chunks)] + cell_chunk_ids = list(range(n_cell_chunks)) + + rule bowtie2_align_smartseq2_cell: + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + 'bowtie2_' + wc.feature_set, wc.feature_set + '.1.bt2'), + fastq = (op.join(config['base'], 'simulations', 'smartseq2', '{cell_id}.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.cell_id]['R1']) + output: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_smartseq2_{cell_id}.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_smartseq2_{cell_id}.txt') + threads: 4 + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.fastq} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index 
{output.bam} + """ + + rule featurecounts_smartseq2_chunk: + """featureCounts on one cell chunk x one feature chunk.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bams = lambda wc: expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', '{cell_id}.bam'), + feature_set=wc.feature_set, + cell_id=cell_chunks[int(wc.cell_chunk_id)]), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_cell{cell_chunk_id}_feat{feature_chunk_id}.txt') + threads: 4 + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o {{params.tmp_counts}} " + f" -M --fraction " + f" {{input.bams}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_smartseq2: + """Merge all cell x feature chunk count tables into the full feature x cell matrix.""" + input: + chunks = expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', + 'chunks', 'cell{cell_chunk_id}_feat{feature_chunk_id}.tsv'), + allow_missing=True, + cell_chunk_id=cell_chunk_ids, + 
feature_chunk_id=feature_chunk_ids) + output: + merged = op.join(config['base'], 'bowtie2', '{feature_set}', 'smartseq2', 'counts_merged.tsv') + run: + # Each chunk covers a subset of cells AND a subset of features. + # We group by cell_chunk_id to assemble the full feature set for each cell group, + # then merge cell groups horizontally. + import re + + def parse_chunk_path(path): + m = re.search(r'cell(\d+)_feat(\d+)', path) + return int(m.group(1)), int(m.group(2)) + + def read_counts(path): + rows = {} + with open(path) as fh: + header = fh.readline().rstrip('\n').split('\t') + cells = header[1:] + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + return cells, rows + + # Group chunks: cell_chunk_id -> {feature_chunk_id: (cells, rows)} + cell_groups = {} + for path in input.chunks: + cc, fc = parse_chunk_path(path) + cells, rows = read_counts(path) + cell_groups.setdefault(cc, {})[fc] = (cells, rows) + + # Within each cell group, merge across feature chunks (vertically) + merged_groups = {} + for cc in sorted(cell_groups.keys()): + all_cells = cell_groups[cc][0][0] + all_rows = {} + for fc in sorted(cell_groups[cc].keys()): + cells, rows = cell_groups[cc][fc] + all_rows.update(rows) + merged_groups[cc] = (all_cells, all_rows) + + # Across cell groups, collect all features and merge horizontally + all_features = list(dict.fromkeys( + feat for _, rows in merged_groups.values() for feat in rows)) + all_cells = [cell for cells, _ in merged_groups.values() for cell in cells] + + with open(output.merged, 'w') as out: + out.write('feature_id\t' + '\t'.join(all_cells) + '\n') + for feat in all_features: + row_vals = [] + for cells, rows in merged_groups.values(): + feat_row = rows.get(feat, ['0'] * len(cells)) + row_vals.extend(feat_row) + out.write(feat + '\t' + '\t'.join(row_vals) + '\n') + +else: + + rule bowtie2_align_chromium_r2: + """Align R2 (cDNA) reads to the feature pseudo-genome.""" + conda: op.join(workflow.basedir, 
'envs', 'bowtie2.yaml') + input: + index = lambda wc: op.join(config['base'], 'references', + f'bowtie2_{wc.feature_set}', f'{wc.feature_set}.1.bt2'), + r2 = op.join(config['base'], 'simulations', 'chromium', 'R2.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R2'] + output: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam') + params: + index_prefix = bowtie2_index_prefix, + extra_args = bt2_cfg.get('extra_args', '') + log: op.join(config['base'], 'logs', 'bowtie2_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'bowtie2_{feature_set}_chromium.txt') + threads: config['resources']['max_threads'] + shell: + """ + bowtie2 -x {params.index_prefix} \ + -U {input.r2} \ + --threads {threads} \ + {params.extra_args} 2> {log} | \ + samtools sort -o {output.bam} -@ {threads} + samtools index {output.bam} + """ + + rule tag_bam_with_barcodes_chromium: + """Attach CB and UMI tags from R1 to each read in the sorted BAM.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'aligned.bam'), + r1 = op.join(config['base'], 'simulations', 'chromium', 'R1.fastq.gz') + if config['mode'] == 'simulation' + else lambda wc: config['real_data']['samples'][wc.sample]['R1'] + output: + tagged_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + params: + cb_len = config.get('simulation', config.get('real_data', {})).get('cb_length', 16), + umi_len = config.get('simulation', config.get('real_data', {})).get('umi_length', 12) + log: op.join(config['base'], 'logs', 'tag_bam_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'tag_bam_{feature_set}_chromium.txt') + shell: + """ + umi_tools extract \ + --bc-pattern={'C' * params.cb_len + 'N' * params.umi_len} \ + --stdin {input.r1} \ + --read2-in {input.bam} \ + --read2-out 
{output.tagged_bam} \ + --log {log} + """ + + rule umi_dedup_chromium: + """Deduplicate UMIs per cell per feature locus.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'tagged.bam') + output: + dedup_bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam') + log: op.join(config['base'], 'logs', 'umi_dedup_{feature_set}_chromium.log') + benchmark: op.join(config['base'], 'benchmarks', 'umi_dedup_{feature_set}_chromium.txt') + shell: + """ + umi_tools dedup \ + --per-cell \ + --stdin {input} \ + --stdout {output.dedup_bam} \ + --log {log} + """ + + rule featurecounts_chromium_feature_chunk: + """featureCounts on the full chromium BAM, one feature chunk at a time.""" + conda: op.join(workflow.basedir, 'envs', 'bowtie2.yaml') + input: + bam = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'dedup.bam'), + gtf = gtf_for_feature_set + output: + counts = temp(op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv')) + params: + tmp_counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.raw'), + tmp_gtf = op.join(config['base'], 'tmp', + '{feature_set}_feat_chunk_{feature_chunk_id}.gtf') + log: op.join(config['base'], 'logs', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.log') + benchmark: op.join(config['base'], 'benchmarks', + 'featurecounts_{feature_set}_chromium_feat{feature_chunk_id}.txt') + threads: config['resources']['max_threads'] + run: + gtf_paths = expand( + op.join(config['base'], 'tmp', '{feature_set}_feat_chunk_{fid}.gtf'), + feature_set=wildcards.feature_set, + fid=feature_chunk_ids) + if not all(op.exists(p) for p in gtf_paths): + split_gtf_into_chunks(input.gtf, n_feature_chunks, gtf_paths) + shell( + f"featureCounts " + f" -T {{threads}} " + f" -t exon " + f" -g gene_id " + f" -a {{params.tmp_gtf}} " + f" -o 
{{params.tmp_counts}} " + f" -M --fraction " + f" --byReadGroup " + f" {{input.bam}} 2> {{log}} && " + f"cut -f 1,7- {{params.tmp_counts}} | tail -n +2 > {{output.counts}}" + ) + + rule merge_featurecounts_chromium: + """Concatenate feature-chunk count tables from the chromium run.""" + input: + chunks = expand( + op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', + 'chunks', 'feat{feature_chunk_id}.tsv'), + allow_missing=True, + feature_chunk_id=feature_chunk_ids) + output: + counts = op.join(config['base'], 'bowtie2', '{feature_set}', 'chromium', 'counts.tsv') + run: + header = None + rows = {} + for path in input.chunks: + with open(path) as fh: + h = fh.readline().rstrip('\n') + if header is None: + header = h + for line in fh: + parts = line.rstrip('\n').split('\t') + rows[parts[0]] = parts[1:] + with open(output.counts, 'w') as out: + out.write(header + '\n') + for feat, vals in rows.items(): + out.write(feat + '\t' + '\t'.join(vals) + '\n') diff --git a/workflow/modules/normalize.snmk b/workflow/modules/normalize.snmk index 779b8e7..dc6fb2e 100644 --- a/workflow/modules/normalize.snmk +++ b/workflow/modules/normalize.snmk @@ -74,6 +74,7 @@ rule normalize_starsolo: python {workflow.basedir}/scripts/normalize_starsolo.py \ --raw-dir {input.raw_matrix_dir} \ {params.locus_map_arg} \ + --multimapper-mode {wildcards.multimapper_mode} \ --granularity {wildcards.granularity} \ --output {output.counts_tsv} 2> {log} """ diff --git a/workflow/scripts/count_starsolo_locus.py b/workflow/scripts/count_starsolo_locus.py index 7296715..02486e0 100644 --- a/workflow/scripts/count_starsolo_locus.py +++ b/workflow/scripts/count_starsolo_locus.py @@ -17,17 +17,17 @@ count matrices produced by bowtie2/kallisto/alevin. Modes: - smartseq2 - CB:Z tag is the cell_id (set by STAR from the manifest). No UMI. - All reads per (CB, locus) are counted directly. + smartseq2 - Cell ID is the prefix of the read name before _r{n}_, + set by simulate_reads.py. 
All reads per (cell, locus) are counted. + No CB:Z tag is present in SmartSeq2 STARsolo BAMs. chromium - CB:Z = cell barcode, UB:Z = UMI. UMI deduplication is performed per (CB, locus) by counting distinct UMIs per cell. Scalability: - The input BAM is first sorted by CB tag (samtools sort -t CB) into a - temporary file. Reads are then streamed one cell at a time, so peak memory - is O(expressed_loci_per_cell x umis_per_locus) rather than O(all cells x all - loci x all UMIs). The outer accumulator (cb -> locus -> count) is sparse and - grows only with expressed (cell, locus) pairs. + Chromium: BAM is sorted by CB tag into a temporary file. Reads are streamed + one cell at a time, so peak memory is O(loci_per_cell x umis_per_locus). + SmartSeq2: BAM is read directly (sorted by coordinate). All cells are + accumulated simultaneously; with ~20 cells this is negligible memory. Output: feature x cell TSV (rows=locus_ids, cols=cell barcodes, first col=feature_id). """ @@ -35,6 +35,7 @@ import argparse import bisect import os +import re import subprocess import sys import tempfile @@ -48,16 +49,15 @@ def parse_locus_id(locus_id): start_0 is 0-based, end_0 is 0-based exclusive (= GTF end field value). 
""" try: - gene_part, coords = locus_id.split('::', 1) - # coords: chrom:start-end(strand) - colon_idx = coords.rfind(':') + gene_part, coords = locus_id.split("::", 1) + colon_idx = coords.rfind(":") chrom = coords[:colon_idx] - rest = coords[colon_idx + 1:] # start-end(strand) - dash_idx = rest.index('-') + rest = coords[colon_idx + 1:] + dash_idx = rest.index("-") start_0 = int(rest[:dash_idx]) - paren_idx = rest.index('(') + paren_idx = rest.index("(") end_0 = int(rest[dash_idx + 1:paren_idx]) - strand = rest[paren_idx + 1:rest.index(')')] + strand = rest[paren_idx + 1:rest.index(")")] return chrom, start_0, end_0, strand except (ValueError, IndexError): return None @@ -79,12 +79,12 @@ def load_intervals(fasta_path): chrom_intervals = defaultdict(list) with open(fasta_path) as fh: for line in fh: - if not line.startswith('>'): + if not line.startswith(">"): continue - header = line[1:].rstrip('\n').split()[0] - if '::' not in header: + header = line[1:].rstrip("\n").split()[0] + if "::" not in header: continue - locus_id = header.split('::', 1)[0] + locus_id = header.split("::", 1)[0] coords = parse_locus_id(header) if coords is None: continue @@ -127,52 +127,86 @@ def find_locus(chrom, pos, chrom_starts, chrom_intervals): def sort_bam_by_cb(bam_path, n_threads): """Sort BAM by CB tag into a tempfile. 
Returns tempfile path (caller must unlink).""" - tmpf = tempfile.NamedTemporaryFile(suffix='.bam', delete=False) + tmpf = tempfile.NamedTemporaryFile(suffix=".bam", delete=False) sorted_path = tmpf.name tmpf.close() - print(f'Sorting BAM by CB tag -> {sorted_path}', file=sys.stderr) + print(f"Sorting BAM by CB tag -> {sorted_path}", file=sys.stderr) result = subprocess.run( - ['samtools', 'sort', '-t', 'CB', '-@', str(n_threads), '-o', sorted_path, bam_path], + ["samtools", "sort", "-t", "CB", "-@", str(n_threads), "-o", sorted_path, bam_path], stderr=subprocess.PIPE ) if result.returncode != 0: os.unlink(sorted_path) sys.stderr.write(result.stderr.decode()) - sys.exit(f'samtools sort failed (exit {result.returncode})') + sys.exit(f"samtools sort failed (exit {result.returncode})") return sorted_path -def emit_cell(cb, cell_data, mode, counts): - """Merge one cell's accumulated data into the sparse counts dict.""" - if mode == 'chromium': - for locus_id, umis in cell_data.items(): - counts[cb][locus_id] = len(umis) - else: - for locus_id, n in cell_data.items(): - counts[cb][locus_id] = n +def process_smartseq2(bam_path, chrom_starts, chrom_intervals, multimapper_mode): + """ + Count reads per (cell, locus) in a SmartSeq2 BAM; cell_id is the QNAME prefix + before _r{n}_. Reads are coordinate-sorted (cells interleaved), so all cells + accumulate at once. Exits if samtools fails; returns (counts, cells, loci).
+ """ + counts = defaultdict(lambda: defaultdict(int)) + all_cbs = set() + all_loci = set() + n_reads = 0 + n_assigned = 0 + proc = subprocess.Popen( + ["samtools", "view", bam_path], stdout=subprocess.PIPE, # stderr is inherited + ) + try: + for raw in proc.stdout: + line = raw.decode() + if line.startswith("@"): + continue + fields = line.split("\t") + if len(fields) < 11: + continue + flag = int(fields[1]) + if flag & 4 or flag & 2048: + continue + n_reads += 1 -def main(): - ap = argparse.ArgumentParser(description=__doc__) - ap.add_argument('--bam', required=True, - help='STARsolo Aligned.sortedByCoord.out.bam') - ap.add_argument('--fasta', required=True, - help='Feature FASTA with headers transcript_id::chrom:start-end(strand)') - ap.add_argument('--mode', required=True, choices=['smartseq2', 'chromium']) - ap.add_argument('--multimapper-mode', default='unique', - choices=['unique', 'multi'], - help='unique: NH=1 reads only; multi: all aligned reads') - ap.add_argument('--threads', type=int, default=1, - help='Threads for samtools sort (default: 1)') - ap.add_argument('--output', required=True) - args = ap.parse_args() + nh = None + for tag in fields[11:]: + if tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) + break + if multimapper_mode == "unique" and nh is not None and nh > 1: + continue - print(f'Loading intervals from {args.fasta}', file=sys.stderr) - chrom_starts, chrom_intervals = load_intervals(args.fasta) - n_loci = sum(len(v) for v in chrom_intervals.values()) - print(f' {n_loci} intervals on {len(chrom_intervals)} chromosomes', file=sys.stderr) + # cell_id is everything before _r{digits}_ in the QNAME + cb = re.sub(r"_r\d+_.*$", "", fields[0]) - sorted_bam = sort_bam_by_cb(args.bam, args.threads) + chrom = fields[2] + pos_0 = int(fields[3]) - 1 + locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + if locus_id is None: + continue + + counts[cb][locus_id] += 1 + all_cbs.add(cb) + all_loci.add(locus_id) + n_assigned += 1
+ finally: + proc.stdout.close() # lets samtools exit if reading stopped early + returncode = proc.wait() + if returncode != 0: + sys.exit(f"samtools view failed (exit {returncode})") + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def process_chromium(bam_path, chrom_starts, chrom_intervals, multimapper_mode, n_threads): + """ + Process Chromium BAM: sort by CB tag, stream one cell at a time to cap memory. + UMI deduplication per (CB, locus). + """ + sorted_bam = sort_bam_by_cb(bam_path, n_threads) counts = defaultdict(lambda: defaultdict(int)) all_cbs = set() @@ -184,83 +218,108 @@ def main(): cell_data = {} proc = subprocess.Popen( - ['samtools', 'view', sorted_bam], + ["samtools", "view", sorted_bam], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) try: for raw in proc.stdout: line = raw.decode() - if line.startswith('@'): + if line.startswith("@"): continue - fields = line.split('\t') + fields = line.split("\t") if len(fields) < 11: continue flag = int(fields[1]) if flag & 4 or flag & 2048: continue n_reads += 1 - chrom = fields[2] - pos_0 = int(fields[3]) - 1 cb = None ub = None nh = None for tag in fields[11:]: - if tag.startswith('CB:Z:'): - cb = tag[5:].rstrip('\n') - elif tag.startswith('UB:Z:'): - ub = tag[5:].rstrip('\n') - elif tag.startswith('NH:i:'): - nh = int(tag[5:].rstrip('\n')) + if tag.startswith("CB:Z:"): + cb = tag[5:].rstrip("\n") + elif tag.startswith("UB:Z:"): + ub = tag[5:].rstrip("\n") + elif tag.startswith("NH:i:"): + nh = int(tag[5:].rstrip("\n")) if cb is None: continue - if args.multimapper_mode == 'unique' and nh is not None and nh > 1: + if multimapper_mode == "unique" and nh is not None and nh > 1: continue - locus_id = find_locus(chrom, pos_0, chrom_starts, chrom_intervals) + locus_id = find_locus(fields[2], int(fields[3]) - 1, chrom_starts, chrom_intervals) if locus_id is None: continue if cb != current_cb: if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in
cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) current_cb = cb - cell_data = defaultdict(set) if args.mode == 'chromium' else defaultdict(int) + cell_data = defaultdict(set) + cell_data[locus_id].add(ub if ub else str(n_reads)) all_loci.add(locus_id) n_assigned += 1 - - if args.mode == 'chromium': - cell_data[locus_id].add(ub if ub else str(n_reads)) - else: - cell_data[locus_id] += 1 finally: proc.kill() proc.wait() if current_cb is not None: - emit_cell(current_cb, cell_data, args.mode, counts) + for lid, umis in cell_data.items(): + counts[current_cb][lid] = len(umis) all_cbs.add(current_cb) os.unlink(sorted_bam) - print(f' {n_reads} reads processed, {n_assigned} assigned to loci', file=sys.stderr) - print(f' {len(all_cbs)} cells, {len(all_loci)} loci', file=sys.stderr) + print(f" {n_reads} reads processed, {n_assigned} assigned to loci", file=sys.stderr) + print(f" {len(all_cbs)} cells, {len(all_loci)} loci", file=sys.stderr) + return dict(counts), all_cbs, all_loci + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--bam", required=True, + help="STARsolo Aligned.sortedByCoord.out.bam") + ap.add_argument("--fasta", required=True, + help="Feature FASTA with headers transcript_id::chrom:start-end(strand)") + ap.add_argument("--mode", required=True, choices=["smartseq2", "chromium"]) + ap.add_argument("--multimapper-mode", default="unique", + choices=["unique", "multi"], + help="unique: NH=1 reads only; multi: all aligned reads") + ap.add_argument("--threads", type=int, default=1, + help="Threads for samtools sort (Chromium only; default: 1)") + ap.add_argument("--output", required=True) + args = ap.parse_args() + + print(f"Loading intervals from {args.fasta}", file=sys.stderr) + chrom_starts, chrom_intervals = load_intervals(args.fasta) + n_loci = sum(len(v) for v in chrom_intervals.values()) + print(f" {n_loci} intervals on {len(chrom_intervals)} chromosomes", file=sys.stderr) + + if 
args.mode == "smartseq2": + counts, all_cbs, all_loci = process_smartseq2( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode) + else: + counts, all_cbs, all_loci = process_chromium( + args.bam, chrom_starts, chrom_intervals, args.multimapper_mode, args.threads) sorted_loci = sorted(all_loci) sorted_cbs = sorted(all_cbs) - with open(args.output, 'w') as fh: - fh.write('feature_id\t' + '\t'.join(sorted_cbs) + '\n') + with open(args.output, "w") as fh: + fh.write("feature_id\t" + "\t".join(sorted_cbs) + "\n") for locus_id in sorted_loci: - row = [str(counts[cb].get(locus_id, 0)) for cb in sorted_cbs] - fh.write(locus_id + '\t' + '\t'.join(row) + '\n') + row = [str(counts.get(cb, {}).get(locus_id, 0)) for cb in sorted_cbs] + fh.write(locus_id + "\t" + "\t".join(row) + "\n") - print(f'wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}', + print(f"wrote {len(sorted_loci)} loci x {len(sorted_cbs)} cells to {args.output}", file=sys.stderr) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd index d0a0e4e..f8ca865 100644 --- a/workflow/scripts/evaluation_report.Rmd +++ b/workflow/scripts/evaluation_report.Rmd @@ -99,10 +99,10 @@ All metrics compare the aligner count matrix against the simulation ground truth - **Specificity**: fraction of truly unexpressed (feature, cell) pairs correctly assigned zero by the aligner. -Rows = aligner (+ multimapper mode), columns = metric. Each bar is a -feature_set x granularity combination. +Rows = metric, columns = feature_set. Each point is one aligner at one granularity; +x-axis = granularity, colour = aligner (+ multimapper mode). 
-```{r global-accuracy, fig.height=14, fig.width=18} +```{r global-accuracy, fig.height=18, fig.width=14} acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", "jaccard", "specificity") @@ -110,25 +110,23 @@ acc_long <- global %>% select(aligner_mode, feature_set, gran_label, all_of(acc_metrics)) %>% pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% filter(!is.na(value)) %>% - mutate( - metric = factor(metric, levels = acc_metrics), - fs_gran = paste0(feature_set, "\n", gran_label) - ) - -n_fg <- length(unique(acc_long$fs_gran)) -pal <- make_palette(n_fg) - -ggplot(acc_long, aes(x = fs_gran, y = value, fill = fs_gran)) + - geom_col(position = "dodge", width = 0.8) + - facet_grid(aligner_mode ~ metric, scales = "free_x") + - scale_fill_manual(values = pal, name = "feature_set / granularity") + - scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.5, 1)) + - labs(x = NULL, y = NULL, + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(acc_long$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(acc_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, title = "global accuracy metrics", - subtitle = "rows = aligner, columns = metric") + + subtitle = "rows = metric, columns = feature_set; x = granularity; colour = aligner") + theme_bw(base_size = 10) + theme(aspect.ratio = 1, - axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + axis.text.x = element_text(angle = 40, hjust = 1, size = 8), strip.text.y = element_text(size = 8), strip.text.x = element_text(size = 8), legend.position = "bottom", @@ -194,6 +192,15 @@ ggplot(corr, aes(x = gran_label, y = r, ## log1p RMSE +log1p RMSE is computed over all 
(feature, cell) pairs in the feature universe. +At locus level there are thousands of features, most with zero counts in both +truth and observed; the dominance of (0, 0) pairs suppresses the average squared +error so RMSE looks low. At class_id level there are only ~10-20 features, each +accumulating large aggregated counts, so per-entry log1p differences are larger +and RMSE rises. The gradient locus < gene_id < family_id < class_id is therefore +expected and not a mistake - lower locus RMSE reflects sparsity, not better +quantification accuracy. + ```{r rmse, fig.height=5} if ("log1p_rmse" %in% names(global) && any(!is.na(global$log1p_rmse))) { n_al2 <- length(unique(global$aligner_mode)) @@ -275,6 +282,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "F1", title = "F1 per repeat class", @@ -290,6 +298,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa geom_line(linewidth = 0.5, linetype = "dashed") + geom_point(size = 2, alpha = 0.85) + facet_grid(gran_label ~ feature_set) + + scale_y_sqrt() + scale_colour_manual(values = pal_pf, name = "Aligner (mode)") + labs(x = "repeat class", y = "pearson r", title = "Pearson r per repeat class") + @@ -307,7 +316,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 && "class_id" %in% names(per_fa ## compute resources -```{r resources, fig.height=10, fig.width=14} +```{r resources, fig.height=14, fig.width=18} res_cols <- intersect(c("wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb"), names(global)) if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) { n_al5 <- length(unique(global$aligner_mode)) @@ -322,7 +331,7 @@ if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) { colour = 
aligner_mode, group = aligner_mode)) + geom_line(linetype = "dashed", linewidth = 0.6) + geom_point(size = 2) + - facet_grid(resource ~ feature_set, scales = "free_y") + + facet_grid(feature_set ~ resource, scales = "free_y") + scale_colour_manual(values = pal_res, name = "Aligner (mode)") + labs(x = "granularity", y = NULL, title = "compute resource usage") + theme_bw(base_size = 10) + diff --git a/workflow/scripts/normalize_starsolo.py b/workflow/scripts/normalize_starsolo.py index 3bba451..5091baf 100644 --- a/workflow/scripts/normalize_starsolo.py +++ b/workflow/scripts/normalize_starsolo.py @@ -27,11 +27,11 @@ def read_lines(path): return [line.rstrip('\n') for line in fh] -def load_starsolo_raw(raw_dir): +def load_starsolo_raw(raw_dir, mtx_name='matrix.mtx'): barcodes = read_lines(os.path.join(raw_dir, 'barcodes.tsv')) feature_lines = read_lines(os.path.join(raw_dir, 'features.tsv')) feature_ids = [line.split('\t')[0] for line in feature_lines] - mat = scipy.io.mmread(os.path.join(raw_dir, 'matrix.mtx')).tocsc() + mat = scipy.io.mmread(os.path.join(raw_dir, mtx_name)).tocsc() return barcodes, feature_ids, mat @@ -94,11 +94,16 @@ def main(): ap.add_argument('--output', required=True, help='Output TSV path') ap.add_argument('--locus-map', default=None, help='TSV: transcript_id, gene_id, family_id, class_id (no header)') + ap.add_argument('--multimapper-mode', default='unique', + choices=['unique', 'multi']) ap.add_argument('--granularity', default='gene_id', choices=['gene_id', 'family_id', 'class_id']) args = ap.parse_args() - barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir) + em_mtx = os.path.join(args.raw_dir, 'UniqueAndMult-EM.mtx') + mtx_name = 'UniqueAndMult-EM.mtx' if args.multimapper_mode == 'multi' and os.path.exists(em_mtx) else 'matrix.mtx' + print('Using matrix: {}'.format(mtx_name), file=sys.stderr) + barcodes, feature_ids, mat = load_starsolo_raw(args.raw_dir, mtx_name) print('{} features, {} barcodes'.format(len(feature_ids), 
len(barcodes)), file=sys.stderr) gene_to_group = None From 9ec3e545d93d9153a538305e740d3cd2464a4d3b Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:05:53 +0100 Subject: [PATCH 03/12] Fix starsolo simuls CB matching --- workflow/modules/starsolo.snmk | 2 ++ workflow/scripts/evaluation_report.Rmd | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk index 78df478..da16717 100644 --- a/workflow/modules/starsolo.snmk +++ b/workflow/modules/starsolo.snmk @@ -97,6 +97,7 @@ rule starsolo_smartseq2: --soloCellReadStats None \ --soloUMIdedup NoDedup \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM CB UB \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} @@ -163,6 +164,7 @@ rule starsolo_chromium: --soloUMIstart {params.umi_start} \ --soloUMIlen {params.umi_len} \ --soloMultiMappers {params.solo_multimappers} \ + --outSAMattributes NH HI AS NM CB UB \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd index f8ca865..08c9f43 100644 --- a/workflow/scripts/evaluation_report.Rmd +++ b/workflow/scripts/evaluation_report.Rmd @@ -143,9 +143,8 @@ if (nrow(det) > 0) { pal_pr <- make_palette(n_lab) ggplot(det, aes(x = recall, y = precision, - colour = aligner_mode, shape = feature_set, size = f1)) + + colour = aligner_mode, shape = feature_set)) + geom_point(alpha = 0.85) + - scale_size_continuous(range = c(2, 7), name = "F1") + scale_colour_manual(values = pal_pr, name = "Aligner (mode)") + scale_shape_manual(values = c(16, 17, 15, 3, 7, 8)[seq_len( length(unique(det$feature_set)))], @@ -153,7 +152,7 @@ if (nrow(det) > 0) { facet_wrap(~gran_label, nrow = 1) + xlim(0, 1) + ylim(0, 1) + geom_abline(slope = 1, intercept = 0, linetype = 
"dotted", colour = "grey60") + - labs(title = "precision vs recall (size = F1)", + labs(title = "precision vs recall", subtitle = "facets = granularity", x = "Recall", y = "Precision") + theme_bw(base_size = 11) + From 932ed46fb5256f43d55c2b1f182882f698c9e907 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:14:44 +0100 Subject: [PATCH 04/12] Testing, documentation --- .coveragerc | 23 ++ .github/workflows/tests.yml | 8 +- pytest.ini | 6 + ...test_evaluate.cpython-312-pytest-7.4.4.pyc | Bin 55247 -> 0 bytes ...imulate_reads.cpython-312-pytest-7.4.4.pyc | Bin 45973 -> 0 bytes test/unit/test_evaluate.py | 200 ++++++++++++++++++ test/unit/test_simulate_reads.py | 133 ++++++++++++ workflow/scripts/evaluation_report.Rmd | 4 +- 8 files changed, 370 insertions(+), 4 deletions(-) create mode 100644 .coveragerc create mode 100644 pytest.ini delete mode 100644 test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc delete mode 100644 test/unit/__pycache__/test_simulate_reads.cpython-312-pytest-7.4.4.pyc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..4c196c9 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,23 @@ +[run] +source = workflow/scripts + +# Scripts requiring subprocess calls to samtools/BAM files or real aligner +# output matrices cannot be meaningfully unit-tested without bioinformatics +# infrastructure. Omit them so the coverage threshold applies only to +# logic that can be exercised with synthetic data. 
+omit = + workflow/scripts/count_pseudo_genome.py + workflow/scripts/count_pseudo_genome_chromium.py + workflow/scripts/normalize_starsolo.py + workflow/scripts/normalize_alevin_chromium.py + workflow/scripts/normalize_alevin_smartseq2.py + workflow/scripts/normalize_kallisto_chromium.py + workflow/scripts/normalize_kallisto_smartseq2_granular.py + workflow/scripts/merge_featurecounts.py + workflow/scripts/tag_bam_chromium.py + workflow/scripts/split_gtf_chunks.py + +[report] +exclude_lines = + pragma: no cover + if __name__ == .__main__.: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ea12b79..656c49a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -47,8 +47,10 @@ jobs: micromamba-version: latest environment-name: snakemake create-args: >- + -c bioconda + -c conda-forge python=3.11 - snakemake>=8 + "snakemake>=8" init-shell: bash - name: Dry-run main Snakefile with SmartSeq2 config @@ -101,8 +103,10 @@ jobs: micromamba-version: latest environment-name: snakemake create-args: >- + -c bioconda + -c conda-forge python=3.11 - snakemake>=8 + "snakemake>=8" init-shell: bash - name: Run negative control workflow diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..9cf6b9f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +testpaths = test +addopts = -v --tb=short +markers = + slow: marks tests as slow (requires bioinformatics tools or large data) + workflow: marks tests that require snakemake diff --git a/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc b/test/unit/__pycache__/test_evaluate.cpython-312-pytest-7.4.4.pyc deleted file mode 100644 index 69d220d59aa6fedcf637b8ab9027dabd6e933a0f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 55247 zcmeHwdvF}bncvRt?0Xk)kRU}-E55|#10(=GL|T*#k<`PMU`v$sprzYd?kvEC7Fe)n z!3Vs1fqc##70VIryNjSZxek_{GI3HlOj4<0xe|wRa*ivGD}+D^%$NzCvr{ykJC{Vd 
zPIt;5m*3a(>Rk+X7Z3?j76#aFXJ)&5dS<$R-?O`a5{-sj*naLhlL>aXT)&_LbK}3@ z;OeO4a=qeGT>D*$q`0%vemDR249F?BTtHy-leHU8XdE-mWx)E?1gB?@*dSV@eC?ok}a{3S|-KN@X$VDy0o{ zwXy_sjj|NJ_Mu~;(C*yfG*e^0 zfA{9qv0QH~e=r>z8fNK@y~Bg4d?q&-%jWtrshzQbbe?5Wu~cqgXgHswU(^jTC7Dkm zK~l*K_H76y2NjK1V%knsY^v_HIo!%#jSYWl;1G&ZFLpBdB6ERJ113 zc}9;%lZG^R??FZ{O{FIn)3*<|ZaSD7NN>svB(vGvU~-enQkz(MD4j%0rbfPL7|osU zC4`1;TsDzc53|K6jb5s%f{eP(2O3Und9i-9`@FyLtbfUc)+J-B#tyxD*I#USb;F;x zPqoI6hb{!8$CD?T#}2#`ymS2V3z7PldtU4r>pZ#cjC?M#_qcRE5PUiMV)UhY+6liH zKC$OqV95u5S7^n10aqkC`p^eKd;dd~UhBWv*}cv4);90`E`Q2+K{zWk%r!W8#nlIt zd|fKKg(m4z+yx1**=WxDTyf9HM*gH@$yguD4G$`bJR8m*Y{$)_M=9uC`C&E~%K!lA zS+`I4G=D0c%_cI67RcsO!z%3s^{2>pdy@m1>@og(Dw|aK?;#YLPNYzNex#xNI&q0V z*0_f?qTZ~T2)#y?Iso1rTZC_|aUl78Iw9s|F`Qq8k4kT3)b-bm&yIfPLb&C`(`WDK zIJM>U(y8#Cv+|y+_|cUTJ0U$oI=*_;=)j8Jfjz>LlFY79l=D_zbThZvC8%2#y!}?U zrK{+h;!)&&tHYXS#qKqNlZuz9c@^qRdtA>w2^jGe+m99x_7~^_{`dw<$m4$(hAvC3>5v&buIW5nb*d-#sRY&ry9TXwCoDH6hHl$*TsE! z_MzSqK1KhLnUH~B4af+Ld@)4*(rkXc=!W1Hv7<3+U$H0lJ=xBGH_j(`t5bjsJ7Zrm z&t()G7|n4fa2xHbSme1zLi|XjhRlIrBy492GI~9l6yl)O6ZdI8HOKO4g+=JRm&1%E z_k)LM9yQN2PhUD85Ayb@HJasE719Dm6;#$j`2$7@m9^5LOoy73(qu48Ri$@H_suu4 zZSx8zz_D27DBFZo>+n&(4>Ia{FXRd~p15Z+aK~u(h5FXBi|(4LUpxAc0WL2s8*4fr z2)!J5F>qq}_^R*5zZF0E(A0{plZ{(X%ae`woD1ywSUl(|*b0hYDxXw#-llT4|r3gl^h4;+A#164mqi2d3<}T@jo`Qsavs!1r zPOMh(IQ7wjY_@E@{hLRU>!rJRtM)4R6I!*~f&Jl}(UyH9&+=Ahwq+d{2a-e7jALsD zbC6qzgT;69WOAX5iCV7;h5#zRtqFvA#Rtc`M-UZt|ya zA#0<~!5e@E#`cQK2IPK`(HY5g2B%Z=RXvMzMORVE%VwGVB(IfiMKq$KNAVP8H>93| zryxV#GOhqvcNM(PujNqeJ0YpdZTRgsl@y^o#ar<8N{Wwi^;)b|@#ifGtG`Z2ZtR+} z*kPC2DwN+aGiAYI5=>0lR4L`41W*ptT!ST*YhXW1vzst6heJ(f=0J@C!c`pd*wSZ} zbRM%7Rnl2qyh=K+S)LMn%{PM9QuHe!C0y`lBz_dg6O8jsD2arUvaMjTOS8l;ON#5L zRP^EF#>az?T=X7u#Univ-peHXU#3=fnVS4%YO$B8dtSl^+k5w2`qOvU{3_huc<+Ry zd6Eaz3F$-ku3Zz-$QM_y9Xhr-uEuuliqUQd_CsGX_hS0VE#^7w7^gbpYI|~Mh~mlH!SzfOM zO6R9LCnB0VeOL=ZvSw;-Fu~aUxF5}@Us?-&S}#74WLje)1z|h^`H_)6m}F@!f}axy zhBMiGW>D39z$UOwtu3J*%nfH1o;1;y%^gT)6I}2QCe&nano7{>x_Nb=#O-A)$F$~z 
zt^+30M~AY>L9T0pk9VUsI;#%Twe)-F(u9Mp;=y+DV260np;uPFNJr^WmlLPwx^%*N zEi_bNsql^$_KbFq9=a%Rn7&E> z@5&q4ov4^IX#Wz8xn|IIzb=`OTQg{Tik^Ze3{_h}H{#GO7rbv;d>XPdR9g<+{(`>- z==K%-fNnqK5}>FbwgtTaau&%->~134iEJaXmB>yapCWP(k$XV^+|Lru zioo4rf^-M=Y6R|$Gmv`+M{XV02x=#MY!6*^FOdg`JP1O-y@MlkM=5Z38Nl7e9zn8+ z5HCoR66|Tp`V5i%lwSbIyLIH#p|fK%dzOCZ$ZddOXUWkPl_V!FcM8JiT4FK+8p5*I z3xgl52T7nJ>;;Uav#>5IYyi1?@KIj@xfUvJoR)9e|CGFOCNNs+%$gevj2sFrIgvn8 zeV5D-LicfyTm>0)NoBy9DbGOywB=g|=$n}X5};f;k1YWz-kNLPDwOFUXO~FN6>Ds# zL`}UysSbBnuw+oh_nOS5L^)4}E4~SNN%<<G7pjkYwfZc3 zVy!aJLRZIocR8ulb_a%{{PF1OLPsCMUGW~K_Zmir!yu3`EOmu2SdecV|L$G7!RaQV2;^pd>Kr9|Q);QKZ<{NJu?;iI}h3~p4oo8-VpE)WJP#tWm zkF7e)*6ITvo1r?0R-Xmc0d326Yk-r^x9Tu^tB9m&s=7q(ZPy(+?aFNX}=wiF}60XNhp}-LP+PzFR85voBI2;>GMaBEJXXWTjYI z1|-i9=8g;|$km@_qsYVa>Glbe^+sOLPKCQn^teUs6AMOVyga^@@JfPTzV$ol_Lf?K z&b(k$o;|$Gr%^fXxlDTAWl}pXF9(@DhD=1b#IJ=^vQiEtP3tZD92Lif{l{p}RsbAi zfA&w1#0gpMo;Mq#Ayfi!Q`XE;oS89(&k7rEJ&2j=RdS177s!6PdxoNtS`VFp}S4U$bqLXX3&Wm0G2kZPyFTtGr3Z5$FJ}a(y79J7Ngy>N7 z5?rd?*jUX2V*3@m*V(V&>(P0%gPk#M82SGY?O!)R9;Q88!sk-IlbC;mx{Vh^aOy2D zXVYb7$d{3vOqeQVwz}Kcony^ohsIJ<;T2|=yCKGV_yip7_Y^?q!RP8AC}J3}t8Y$$ zXGJr%S~6p-(NQoA01hm4M%ob2FY9}-Rg*ENY(oTtn@2Dpe!+ADA$q}65Th3kRPJVq zhet19*g;A3g6O((Xt*U#KU?!Bfvs$_pMhD!s$pMN31$xNO0%z0(Y_$uty4NHBD+dD ztFu>0XLmK903RRiiCOR5y#K0CY(@?|F<+IM&ss&OQl1?};eRcthrB5epsrICMF>$7 z6a*&;krWhFA>%{Q6rp%{WckBp98}DRgNkjuH}?3W`}V?pxw+l6nsNu|m(j>BDBXtZV*npZge4>bx+rT5)5ETdKs1;m?LnF8jdq$ZZ zaIG3AZ6_;*;g!bMb)ZZXr8`mYM}-l6wg+Ejks0hEI`=RUvg7g?jV+vR8z~8~!%gwi zclTI!8v8bQl-;cCH1Uvb4K6hkmxOj0L2X8$M_0*jTe(5GB+z4f$?BK@DOHvMZKhN?oyC-@JTdF7S4n3H7mEB^fJ3{l<~S9LEtoCL2#G5Pb3LNj+cWYF zAu6Q<0VW1MVy`j~i*YFw250261L7P)Tu^AcjUWbqZ#My4kB1}RrI;0tzVUznyrk|i zl5Z%;WjjmJA+Z#$PeL5#xwsJgD4iw6um*8B?zbKEVyxUYmZ1XrK}z7WlVx#A9HFQW z2Wl^(ZT7SSnNw8h%;mQx4)FMtV304L4dfau8nyv0KzETg*K6 zw~dA+VJwU66y~0t5*SMsuUcy$YUK_!u%Erz^Gi_Qp(ZnPphjX4vqPSOg`G<1y4Boj z6*bp@uj|nbdL*TA=fI_}{q4US|KLZb_#hU<^P1B7pO%M0TPWXQM>5tj zhQ*O(fEDO$1Um-zL8-(t3|^tLhz%IvoaKXGh+$Dgq>rW5;cQ-)rHK#Bl-P-Ju)iQu 
z$~d@WzO__f$L0Cy?eRKY3E;fse#*=ps&oa-gKE|4T7}3?(y<sU-&S&Tx1)frdueUc6V26<{1= zkdI?TE@3cC?c*55F@X+JzI|T;Hdiza&CCI876!xjJMdW{hWz6ci&t$Rk;N;kdcmxe zoC8;UzCskc$nhLx9zlAk_LgO{Kh*q|XXM}2O{0JP#Q?^3KsGXbo)jse59;`R+T-KI{w3|* z3j$N-Rbz;kF2mSnN6YGH8y(5!XtQd2c!zGv)a{R^DYMhku4&ISWugQ_<;I?I5{AYy zOQt1Jb=V@6qn1zd*vuM{B~-5xiFc5+1kuYFzDByXCM7lL0GGZXG3}v8vX=rDoo*@g&$$?f)Mw~Uj5Z;Ru|FN`W zCGeV;^Odj?R6>QY!B-+R^OcCnPt0$FuaFasoMwSya8B*aGpBZjTtYhw3$}ufCa+j_ zk*e*?N{MzBUT9}#tGH#`nLMX=~M(*R7pdI$hOvW~D?si!QXY`D|xg|M~%mZkp7sN%w2g?jMl2!Ttl09}xME{Kq|d z;5uny6Vk{dWoBw9M!K?O$CuP%q}D@9_@qNzxxwi8J#(|ADEU_+(I+cQkxB6uoy_z` z?53#J8A=`IUbR#bN3|}X_=-WObphK|Rt&)uAL8sOTnN{o*7*xzsC5C#C9o&aefrVF zZA-YR%ATy0U{6qBYG#ntOsm#2Gy#83YMrPaJE>uod9PM=txCF(SzDM3b+sY-nTtYi*xNhytaz9mVXI4tI zvxbFsHlOVbYF%SbnGVO^g1*4c5cviXQsvkWiEy2UN8kJ>I{eQ>9wyR4#8P(Hf5G8I zo#y3>A!wf50hLjpi<4r+enLN#M$qeyr2j@g&_pXL$!{h4kf9Li6Clo1hz4`C3UQ}~YJ;Wp0OswK1PEoDJ=Ov+ z@5L~!T5GUGDA%C|dFvUPzzOxJVohe|K#hEr^VojJD(Nh5R+V(NdgQm%BN{elt4E8? z2Pl-(W}dUZ3$>_6)-#$#J1bX@>}Oi3ot3Ibp_<>a*m@k^vK?(zwVjFTQD~u^nY}`p zhjs>WBHY9NJsRh%M2C133&mhrGd^78$CTR+@%=mlZ0&Ay-wE$w~>%_3@jn<%XFWtC%FM z_z-Kt`#L^rWu0M?tgqQ5S#P4M`E8gaqozp`-xPHcE=H=FCabqnc}}eqxrA1F{rt6} z#7BLcZ;sLAR!X$e=qw_3=f@ykZEn?tR%*Un(nP40)>Xt(-<(=0atW>U`k6T`qf%8Y zwNj#$)-SYD6VYx(EQNg8(8Enr_mZZ~Lk_sPN|W|#(gXMU>11g{f^N;4rXS9OXwrlF zY*{>p+=v(7Ulyg{I3})IoPxO`RBRKU1p@&NaT-F+O=Bd^#vNA`(POa{s$QB%FA;8M z`+eH`5fL({#T(dJ+MgscD`Og;wYZbUWnUV$ljRfT|n z@54}vWyGa}i0HLhQbw-x(bm8)1O$+q;d zi&eW}vX2#e8{skDwzaDjIKnr9?>LR(uEE-+TOl|bID;bXfHd8U@e?}74Gd*o48qyU zc7zqz07eS#%JgPZ8B8>B0i}>DM4bTPzJ}wl8NkzUe6`_c_@xzkX}eLyp&>ku6}4h0WGYKiBQ}b4hdw!%}Cg~mxB zcXvVNia4g>QD3!}b9agwLadJ$YB&x5kOk3EjBjxm{_zHGg*I-|hG}2loJ{5=13B!J z4pBsQN_6s&P-~~OmNJAm1-Cw#43CGwQ{+c}dm0C!Bqvj-1udqiW+~>iF}- zqO7xip2#CS&w8OFezfOl87lyqm7k}Tl0@+;Y3Jp6a)DEGr#kfEhTG9hJ|FG;fY44sna9)=JBH zDv4w^(>Dn7q`B5*jh4}gfFL>kZ@Ty|LBtt$I%fo;ZqZlBA)k~fA?l;;HgEuRtWSck zGu4|x2<$*crP&VHuaW<4eAF`_1ci-bjpL1z%`4AFRuLrDpICX~(22IQp{1j{FEqDK zH?N;;UVn1$sjky4r+sId&!o? 
zD`{oQ>x-2p@0kr&n$NQoE2B3VE2Fo<^8{96h&D&Wt?2Ju%?956FYE(BJNXX#Tl$66 zZuWB`9FP8*_I^PmPlVu!{U0KiK-8Haw6X#S&EczN5&eDw`(Epys-&|5tYH<=RYE_virA~@ zHJW!Ul;ormTK1YUm5^qZ^Wl!orw}%tLgW+u6r#pcsQZLIh5C{n(eO!p3g97)J(@S0 zOAV_Nk{0g6a!A8jtc3&%9!ae1eIh%E5N9!!dOq*ZF7DugKWM^DGvIdcHR4OOd5l-p zXWIEYDuAp8?C(J?69noa@HXsWsrVSaH2;H7Bp(0VQ~PvnzcHIjDv3Up8^%CR9zo6S zqHMOh{{=dHlE_ma6G28Op*xz+Vh_twr}Ifs!QZS0SpSNC6o#8_6%V#MEjXt~SkI_X zf80+zWQ4btV1oK1QZ&a=d)GBgqwcw5b+Xukbd|F zB0f6gb_)V+=BL9Dh!bG0m1=8khsdriokopb1IdG&1l z+}txgyQ@r@4kX-;wOR81VQ!mPz(0GjqIu}~y4LfFmiN)~qdn|)G)SS>U!}t)w9{+% zdXi)GUSF}V>%NegeZ4n1kjWkc)|B-2R$MZQkNVG1dgb2!sdG&an!SB?vEy&k92$2{ z&*pzhzO#bpve0$$wy-U9-T8eAde>c!iwa$rREsKI_y186&af+X-SyXC)2S?6vgJK@ zF?uca-Wq!D&T~x<=sox1ET@!to92X>-C#?MnXJ%rM{bzXK`|Q#-IyvJSD;NWwRyQ= z!ABIPHvcRQ3jwUeM{ylD*3@Qt6#YE(|N5BPDD2-hwdqfx&Ugy-pTwu2G?4R1;ZH)p z|HCju)?jXJUOjq!+I znj#G@)!2dIKD{W`2sqR&WMtNYlUKqCt-kC&Z8gg_q%j9It4GkY$Y3@hedwb*%?!q+ zkwp|sLuNPqjA?q~Y8Vz5DK&kugUHSzRJ*5Xk1Sq{q%Kwm;-s*3UxO2r-AV#+EqEwu z;y@bDDxFC7^|5qclAa8^9Yrsr;(v;(jkH3wE^(!v~!pFP6>Kl9b z{E{^%BWK&T{?_Yh@V)R5>|h-q1YF)Vvx@nft6H~yTAs!KO!i`p=^ML&?B$^Jrl*{_ zU?e;NitfeX9-PCjngGSi0mXNN0mbwk`}xwJ*N3UlN^u5LQ9^Y0{WO-KY{RG=xfp`4 zpT;&~oMRzudC(%b=ZFEjmj(Zklb0*ZpAye8doPy~%`Y~q)!%0B+219hmtpgK|59_$ z{Mbn>x@;Y86YX`RO8JC$jT$*)qLHG;HCI(xif-D^G`GlN zLO$)gb@K1knSMR<+X(sO?E~$ER!lI)8*u1F^*ip(jP9#I5pUCq> z_$V5h#ex{Uu<4gn&KD)ggijGu4yY5-#H~b{+@3wdX>y;}{%6 z(d`I@7n&h=PxnuUqlexv0Dl7r@(!~j&Tz99sSNe1trM& ztOEjr&jxDZvz4Dx18*yh#F+&?Yv-xhR+J`te0u*P``wy*Zx}m#h=X0G14l05vvx|S ztBZWLNoZqEX>C4RxlEhSR!(R0*~;l`KHF^Z*%qZ$;Ip;%l3DOs-d5y-$B;u`6`qY# z|5(J|{9?R$&m7)-+jV>M))0%55o}Ix(@Kee?KWkJfj{-iQsuTny&=Ig*z@}c_wR&1 z4JPiG-zNTG5qnpBS&yzF@;JS}K|9uiCdiT(cQf+T={h10&EUQ`evU#joHR;3G089N zP9l?(ZUyaG(geE$hmdZ{g&`WN5gAD4S>|Y>7gNI^$U_iNenAD1loF4!O|(yPh^B{Z zq!6)@Pv}|aAeA%nI|ibXf-J};`mQdaEVCsPvF?cjNi~yV zaa?{UKI&s2Tm(5EXgI(0w(*Y1=GCVToslLt-#;1HeLfgHoX(DQ(eB@dTuW}# z=M{f*->Jqoo}6miLX+t2KE7R2c(k!jD;vD1eRJ7dc zqe(<}Y56!3K04$&ia-d8qzU2(%JlJfHrzTvnU4p0`{xK{0UI)^PCaMW!+|B2ObKSE 
ztOjKW?zRHGLlE1129)`2h!vpBfz1}KHU16@St*f)Cy80#TLC(JoBNa0JwP1qIaA<6 zN-m;MeS`_2H(O1F!{lAGXMri`Ko_b@I>~+ol$EzVJCL-B7qFFQ1<4|2cX2N6G$ zoswYc>N@P{1LXJ+Vh;@-h9)P!=AlT4BG!egX{-t+)$em-^Y09cPTelb-s<*8Ygv!p4dv1R}i z=RtA+Kzn8>Q>H3e7{CDtXXb!x*C^brL!Q~|s#VIf=G&=~&VpH`$&l)UO0&{Z2pWJL zsu^Z1eZzcChV1Eq)r=MpAS;>|A79ar*3`=F05!EHy(O8Y!XE7@_^ej+KZX0db}L#$ zc@|oci5YgFh4A#8TBX&Bl*QJnfzVQ{RRe=Hw+>5dHQ%FC>%hBpD@zwzM&-A1YvId0 zwv5|qY#GbIfo@;mK-AmK-nPJjN;wb&-{poigby4c>36~-%sD~ptT8SQ^DxG3boe(! zY~x4|9quIZ9U}J-AzMjY8u_B>&Do_7W0{R7Cpk34az{mHPV#`diwZQ>puV3jRBC6@ zeEPa%FtzZ-;Dix0$fWy=uw(d(uHix|zc40N!xC`U@*9;d0f!4R1@+Q`_Nr*FP(8mv zgw_r>7ld1y$_)$+=hNnLa0$5j6Y4Og1Tc~RaveS`nc$(Xt1)6?_#KT{1O1(mB|W3sJ=y;kB@E7m$)0C&I9AXR%Vr`f7t z*A$xl1;1KNXpTbV!m3wu6q@UAG1W_)W64R#wdOdgEOyO7Xl^is=77?uG!+7dN_BmN zW?E{St5jTQZmvx$($Q+qhE<7tE9b~7Cv*1 zRDCMIa_);U#F!pcUr2zl9Vp=dJ}QN*7_1kwU!7_^-8i}aQ~ewn5gl&4h~gHO!=?&!RizSKG%o-5Eo)x^5i3Vw6HjjCwUm4FIjW>8NqU$@DF>QPao5y0eGhOP?>u{Mxu}ElitPf z+s@cXL+D;%c0Y=+djP58VLfnd3(6J-^q1)nq*rK-q?O`Au!hEDPJdBRdbsXPjveAVIkW=xn5qTx>7)5043l9K+tC7P1db0U z5h~6D6c^*19<<0^QJr*{ATmINx&kW@871;ZM7~PoYeZfq@=YSI6ZsyI?-Thck++Gw zLqsETfyl3k{D#PTM0lJe&C9@=iO>)xMq`0%_5|W4s0i|Y zYJqzOa>{TveJ^_ydGPO2KLz@pN0OxX-9AbFpw1;pUvd3G=x46r&sUkQ1nC08PD5bSNd(!zhBJ)Fa@^h1+d zdQ7?!l5jHMk#^EApOdaM`K6UtS^^-8f*@@HY4w#wK55OBrC#Z_D-8x|dvB?$v2}do z>4WE+7LPx9y6b(9IM9>b5EP&H%HNo`R!Pfe*lEg2~-lhe97ZQ4k!)5=NHLx`lnj+s*DIH}E^o>Pi) z<5*4Fe*eAm+TFqKf&@gV#$fQz?99D)X71ek|KI<(cm6CE3rkqulitkUyHb*VLO;yK zW9)q3ktOLRNtJd>s;qj3<=q~>^^PcmI5pzi?U%)w!0rG)6Wkr-+tBV1--dUG`8KjU z!ne`gQNE4sj;r3UNxPd=1+-c9fwriA&{j17xRsb^cFH&LF?U{XT68=?7-gP{GoN)Aj^&xv~^l` zWNbKH$fQ^%t!isW$HU=WJoV@p%k=IY%MBK?`JC2U*q<)+s`=hrzR=4uFN|ez7FmXS zA02ph*TA~Rc0Jm=H#?ls)`qXqTSyL?FNL;2{Cn~L!6Lk(m!tw62oJw@+0FvR>DhoDp(}{n?sNXw;QX=4`Qu*ak1gUBMbUlh z%YOB10X%lESU%jkZhw9xvo1S=XUyl)>ohjF4mX-fgQZXntQ!O0;d`kn4x@MTI;yU} z)PY?7P>$V+>$Kxv+Yd4!T~+$0=6HQa>Hh%Nl4hN0$uneH9?rBlqjNAVg!Qs#DjN4cNfg)5fz3KE4}ls7|g-GV6h4jCdDc~ zaL7fdmx+J7e3v%vD*o*p^wKMq>KqXN0moCgh~g!f 
zf`1?W{iT5YHm8BG5f=y>aYzHSVnA5?Ell5rqZJT#z6~2nfur`^`i|9x)tDg-B5GW1 zDn^Vp5^cDRM9sEgZr$1lhp^2_-JeVj3?!3B0?FjAUCG|0F8AsZmkbH{$g6XLW&5_6 zP?+`%EKG=1KxW!A4g&Q8AOkTDW(-u^w;`e3BOvZ34v@L%N`x3ON=7`A#fT6OWKBd! zmT@S&fzwT#ZaGqhmI+-GTE0*BCX-1a63eAwA|hlY^;11y+!Lc7$_V?h4k8PQEFwaZ zk984Q43dbkZrbl5vXn?Kh=8jG`Zd6-@vFb=+{UtBo7o*!W&`=&>qNf&>)A58#Bvag z>cYUlVk6}D^6BJvE;Y(B&>{|IRCYh|(rs&h146j=@u`o^dO>;osVm1bI$ zr)6{dhBMqy!?vL^2`_5jfto@n@1ChSUMH1%=C3lNN`6PVry7Tn!ozXs6;OB&;i9O6 zJLEd&J+NYVY#UC=i{En5LQmwzDoWg@mj@tGc!*ViHZ{xjY z8Q3w3fPF<2iyh+-j4#tWEaCHn!_wii?88k6=c8B4ypRamHX3qRZXmLW8)|6pUT%O{ z&uM?Tbq1bLet07nb|_~%SRc2jIGuqTg1-s~+`;_t7nuzp7I-!zq0wr4G80)rf1$jLx^r`IgDOaksB!##~!w ze7`N$yQQ_I_DJNu?Vr-(X3B;3Bj-c-6!VA(xm1=HG9uW`0pH>f95374| z+zh8%;Bv2xx&zM!*)F;_?h|~HZomOrH?vRCxn4TV1MmzVUmtrKhg@)K567u4w?p=w z>>x`cTR&y}GB@yCjy^rH^Q!Xj)NNWXDi2>(o}9W(>qX_stI9)Dw`skkJX8Z?pGNG^ zEn#d5GzMB8L+~rN&(uTMUXHMN9tXnyq=g*Y2)l9JU~Uk0JzZe#5O%;q*g-WkcL*EO zBAh(ZQY9_);NUPK7a$PM%DN}3A#!BSIDb|X(H|z~7p(NNU~uf$i4a3)Um)`LKwS9c znrZpvNLC}uI0BmsV=Skoa`{|_?L{f~QAxiE0u?{Hl-TD3Q@3fos60@Eg}i#JEM#6y zpI6iUa}GHp$s?;xCL*v_CE#=*u&wMf=y~iJB4j6G&k`Za)Qr2nFzLBLrCdJs!dM=h zf$c}Z58>bH@AuwTxO7Q*xVpbn7$!H%U9|fQTL_VvZ{+VN6TO10$Vh zjKWU5+RF~3kDD*yY+Arl)~G#Vi@av z4IAsNfZz&ay%!_BBC4cDyNvqe2F^xmG{$Q*PVZp8Mycf*L8_=xu9UtNM~fc#2o|_i zb`5MFxW&z`k6^i`qMEM#9*8dAjqzrFU?Gfegr40&dNw(K#1W2=pfYnu$ytxkiAf#) zFfIa2c><`?LOp);7N6b7e(h0Q!SqbEns;Q3U&Gy)s`J9kVd&d~1sXtx0az*chQ>0v z!Azs-bHcCt2KTf42!A}M?%eNi3JJA&qc|;oY}Sai2``WE@!ZX^;57F`L)H$ZnVQnb zJfN~KA~P8gw0{UfNU(om=aq1DD!gJcyyEm{&ul-p;N0Ma@crZh>YBRQ^`g>sRarT8 z3+tq^vRY)2e~XI@52(;GRrvak;IQf;T9OO3MF>3ks-pTV5#n*!+Z7a0vGC{tuk9~} zZi>-j4*$;-8?gvaDd1mfy$R5x5#`eB}5MN ztA_8O(^0O=9oY3}3Ka~=t$8QaIdDgz+Ij5w4K>{GojYs9<<9N5TDR_;Q2^mOcWQh0 zBgO~_zZz9z4toi5 z6E!*UNH#a7nS7WO@g)AWb0FZuOUM zh9io&j~F($gZ3;DVUL|-PvAtk;|!5rJRoZ`8>3Rfg7sVOEXG-XFO|0uq$~<+1KUK$ ziGP>5x3*{`FaFzP52? 
z-abEP_W43P+7Dms0fSt9qRCeL+#m%NN&9OD_jL1sL@+UntkUEbP6a8}u$oa%L25Ja z(uB!%rDRnOC`d6((L!C4FLh>&s}}@eNz}RV$vEoUO+m61koYNu`)-12liECs3JA^Q zCKQgAB>N_6jB~kb6e}gTT#oieiEyjN3$(|GaFk4q1W0~8!D`@dVYR`IVYPvohfm{% z$t)a0A_zHqhcmge3P;`)nbs!))u;~2>@paL!N{UsW|yIN-oS(t?Aw&lz~%hr4^u{t z(8pw9ssE{tg=^d z!M)VgiA+fEMY<<;T#h!MxNkDLq_Wd%mQnWRePx%FuIj#M*;C^d_;U0`+?ZHgEvuGQ zRrQs8u&h#u1njCs+w0{=?+g_E#Q-d;VrI8`mQ_EW*^TFMSXLYFpY?!)`9PMZr7m!& z%F#f7wfZ*vO}7_dgc{{6b<$bhraI|ntTxvv&+<*FVZ*W-LI_FJF&7-oSj87dNv-BF zdpqr1a0G11=6}QQU?~90YHTjKp~VX^gVy(Oo0ZH;rDOG#M&O;(l#(lz)7cc+Z&Q*@ zM3O|l0isdQw=IZ<39%tYmfn*;m~k+&@1rE*tr~GOVq`r|M)vVHELz_F@ACk$+Ma#Y z0{WQB8Z#5wtxmP1^bYBS{QPQ~*xKQFb@AF4X$ouUHE>>9N?zOJZ2O@xYo>w-b#PU# zeEExhC|~5DW839hYLjsqn*!OM`QlK% z>>M<=Tg6hHgS=BfJyzeXb4TrtCw93byS9M3YTk*Rl5_FeX>FcYJDtVzfX;;S72xPB zV0eFm1)AWFFOIwGGLPBYV$iHnb8C2iXl_b>3A0EXTcb$a=W`9sOA>k*H#AMqPg1hj zw}_BNqkFZnJ&a8J>}4WcJ&Nmo3eU-p=>BKK)`#EwGGbg%OiR?4QsjIe1!1d!#tA?-2Peh(cF^i!JpCtyhQ=cG0I^}6h|JHH)Zp(+y8ddxS=(NxKYfWN4U}2 zoNgZBAfFc&2C;vPs@L1xoF{Aorz$wJV2@_09{Tt@n5%xMI-%g|^(-x*K5fZzeS%xupAi0Wb#O2(a zJ1MLH*3@WPRWWhuzr=;VjDPJvgH$v1HZXMx8||75cb(jD+H)GwNvp{EHRrY~n8&Uv z?Nc|qUQ*htMF2@?V$vV{TQFrzhib59OzlaL4;9PwP_q#cKp(LwdUi?f#`&St5BQ(kTv_t`!U zN8S!mOHK+yhVg#Ptrc}oL6O1awLv5oAnYGtZ_s+1_(5-Z1O7i)0rDwH#@jAR&qLsq zWC&~Np!99|C0Uk^g#qZ~-}fGp6CT}13?!L6D`DTWcD*j|(WqIN@g1QDXc*ApQTEW0 z%`V+5pMxFwV0t*KijNAceL#GJh1bYdwN24G{U z8KEvG%W^{?7V1(1merg_{MuIYM&-ZdUaMdPu$mwHW3G{flfobNGrZPo#0TgLM-R~# zj;ygA4~Z~lIK6avkd^z#&N>D8?;*R;$2yD_J(wTM6;k^$_$m)(dKhd}wx11%?)3Xq z;E#wjWX%@iJlrpxN)I1Ok87#Xyp}Cw@iC@9N3q2GwC6zVP9=Wg*Z9|V! 
zMl!GoiUZMY#!lC^gomBSK~54LTT3-MArPZDMuTyTV~F5j7mx=yk*lu+5yjU=UZy>6 zl5BDyMoaXuKf`&lV2)I3vuM0dC`s?h@$HbXE%9_e@7Cf5*bgWj7k}Iy$p_uo_Q){K z3Oq^F8BRTo7^E6XrAJ3u{xEwNnP0`fMqI*X5W*~ZW!ug-)gMMK#GZO9@Zev8bGkuXbyJBn%%c2y z!0f$njo8|#1!WD60VlFK^s%mq5?`%^ObwCa*)~mK@SfdMT)2v(&pQo{E_|d~H@ztn zB?f~ud`EKwmLoRj9;ykJ>!h>Zxf-=hlrc5#7els%6dK zE;jfvzW~VycWtN`9CG1^rLfvmiXb#0T#T4k;YjTjwOMVMwKrn9ZR)BoR!Yc)mO3?U z86E4Sv!1L@I{VGFns2Vq=)1CHlv#_0B_s@Uas)LRs{O8>2lwS~Dq4yaV_zUENW9ck zYA&^uT1yK`ZKZZ~0SwV`LnRFt!*xy4ZN)J9eLLlG^!tukd#7>pN$b8-D<$4(N1bHQxwS(l-=1ppSi%BkC@EcNgRT`w{`6puUa1 z#7I{YbHos))s<(TNbclOoQXj8=ivF-UI+v|{8Znk2l~atem=ghd!Fjkm8U56m#GLc zdtW2jaSh!NLS7H#QiGXceD`)C7m#GE%HZ3!YHBpY_!%Ld$$aB*R4Vk9+X_!5W-UNL zsUuH0Y*IiIB5)Kt_HbWu2u3M8mUOLFNV9@=C|lTH_I+SIzzfZ)2@z3R`3W=%x@7-> z2vrkqfwYbaRXyRvDz?~36I!lJ@{Ldk9P@+AGV&rXNP1L6u5zPUP#nyv zx{}RdhKBz}89zg0H<3Xihd^{6zr60v$}nJ8$N02 zdMK4k@2<8z2JjZB%YN{O=!F9AWsTV?PI)j-zo#DTgP+-eh1`@aQ+o!SBXG$VTL}c5 z3qUwH;oOgETPf)yZDlQ?2~11j-HP%Zm}^1#XM_11K^yU!v!i<=K!0pH#dU? zXls{ktBG~e*`hmQitbsS>%okIBZXa7<2B!D)BHO%T9`|F`PjZw-cNT($Ag9u2(6xC z0MurB*D&yeVBiUrf@(_%@T(X?WI&`C0SvQy2bo+e7PYk)L4RF9Icyl_$m3Q+7O+*@ z9B`?&)3M$hE0*a4aLvABx2szz(U05Ib_0e*)ed!GF>1iDSi@hBjhTGJ+*&X!qAr@R z$`xkVE5!p%P|AH9frSvud-9*I}@&d)r2*@Vlj8G~DZ1oS?2vp2&z)fynqiZgs zB=TW~_bEASa^rN4L(Mq$+yLfk?(ufJ%hW!W9~;%oV8P*W=11ZG9_46X#SXw`d(Zja zC(pNi>T>UsmwTUj-z&E#<@bF`)IYJ~=V1vbGknKn_>R*nE`(Q7WXbZWIanu^<#Si^ zgkDWvXy5Qk^5hFA(kJELRZk};lIPnu+{*N57nPN_lKqmh@|QrO9UHm3{y%Pw zKHa92#YhmN=1_Dq^e9&-iVx$c0q#ri)V4(y9f8<309Avphs>ZzS+WI#onjLYfE%)G zk+>scF+nAQXTrshJ1U!&%mtA+mK>EUNXfahh~olv47m7;Zp= zWVp?fJ{>% zE*3>QCaA6t$ci2U&}dgXjM|K-3)MxA&(t?sn^w!I02+Qkw$AyQEJbZEpQ9zYwaXec zSya?6RqB!elVb)s> z_X7RD8GT_1<+1yM+TCy)cgEwnwDAI&j`(7>wYm{}RXNb+yVahC@5t#-b-5$Ew(9me zt6p&jJ&t;GDK!S}yfTntTXQ`8i3(-wEZ|Lx;W3Nrw=}!PTJ{)S)|* zTo-{s`xmGz-Op!=AJBdH5Kju5!C@^mfRpdxBnuMg4iO21B$R6ez>F-9h%DGpmPgqC zrVM`v@}YlUVQoP>$gRu>4v282Vh|s79LaG-OP9mC9O5DFP;{)=bz>@H=safZIsu}x zNgF6Sfbc&1A4F`u#lJt3Rx_+ZU(v&`CTBz>kkENZ`E$AvSpJY!;YJWD5w99~Q=(af 
z)wLdCFuHTSA*ZHLYGFS@4GK(G@}rrY9%8usa5k6GFjxwQv!gJNj}B)G4x2u;OgrqL z(!~F5oK3Xq!NSNW!rW1MlynsPTKTa8^vnIak6(p#&@1@?l}LP4*!kb0z3&jQbR%6J zW{W5(SEslSfFcewVr}Ir6a8}9<*T2UG{TW|f!&4jE>R(Cpgg28qv1L=SRgqo zChJvY>D0}x7nP+my`1yjE-H6i_e*`7&!x}p`Ge4f$o+p1I@5P_;P~S&K7L}?*OODx z-pOe1>9$MJyUv9scASsg|Iv6WemX;q^&QeimrTuMy`U`lMK|8%i=BUaqfLr#)mqSO z{;1`dkoQNSmCtl2?}U8MEC{&kcYle-Wk#03$$*Km;c_5X!!7G~v#;IRCdn^ZW_QG{ zg~SxG9HWUDkSzVq&C!5QchU?<5t>%kfzhH34|(mhqw3t^HL8cy0;Ab? z!sv$Y$T_suErP_VEnu{ocY>*R!F6cC9mMLSwX~=@=`8&ggT35mKq8jS7e`4hy5O`? zbTl0{kD#lP%kr_sLM|s-l909B9`_fx6yxN-V$YDk@SpdyRm8&iL61nl4^UF1;v%gg zF9w4Q_Tj&uHBr{TLdt98$EqAvwt~*SL?>P*@+b(5a%9?IU36+OktIaBiI9+TNHl6P zY#|Zy_OfyThuM*Okq3s+0$WCzmlNR^JWQ!=iNr+~7fTdZW{V{uj>I=*T*Of!Uq}z9 zxBy>+qW>rIno}SxTzh`=)054~%PsBavERH7QYqXw@yL7O)+%YV?9|}t<=@?ZCUjBh zzcQUP`svMnZD2;9DLAim{ldrj<%fKT+`n$wd8hYJ?)2?kHw}~gncCh zZB#Dy)NC+nkpYb_L)0cJ7B`cZ+4+p|!0^w)1OsyvOfZ%i%xy9>Ycw!&*euPF$HjwV zclELXT`mTmG82q_A8u-!6+{C@W2bbxx|I@)2BU-x6ATP)2pIGm%3HvpyxFC>Fwkko z)M9=HH4LBVd`;TMtedXMB1Wkb?*p1d6mLYc$)T&Qn-Uue_%^`W()yYX*+@$M!+vfOKaW3zV|1vf6+ z8phT7%dUpY^=;#j@hYsv@&lhR>;xYj1hz$j9k>OLueWN-Ez8&I z7Jvw7OjB<0X}WB&Gy}-{(TqZw&53d>Xbli<*NvN9z)A_2Q*WXi%Q8@>Ov@@@O_USh zV^ZxYw@8!ai=!k9s+s2`$p%&~q8d|fK~&-Ut3xiLs&8YJsM?jyjaN$!I?99S%LRv# z*cKf-k^_(&Y(0902w>uZ;9Iy?p{jG1TA|*@SU)8v&dxT_9%tdinj6;aL!5=Pjg)&6 z5q_O%wR*ag#gGLfuKY%HrCdqv%?=})9iOD)v((>*i?rh3m@e?T;t8!4?3f#v=bjxm z&d1LyJvF#h7^*i}HRqaJsowpvW8Arz#;b8<6R;Y*2@@aRl-fR%(do9@fB_`RYce)pla;)Z^T~LUtt0-y8Q5j82{;Smm?I{C z2;+Y^z|?FXpBo5feaMb6{(gL>6;od&j4!p)L|Ed8IT;ovnQ>!#4I)>BgOV+?lH<96 zRhKXCt>s~eH9%*@13!o>_26IoSE!Jiz@BgW-#mP=`O~w2KbK7@H@RL?mT@kaxLXgT zQfhuMmC}P0iCM_zG_D}%ovG1rs+qN7v{p8s!$%mo>Y<0MLwdkC#$}VP>^lO_swbP* zJ$a3f0f)mBddeY|9@YJ#JUy;uN5+QJ@Ya%3*7zFNoh089MD8cDjmWPNNfX&iWIvHF z68TLc$BFz7kyAuoA@Y49G$x5pLB?kXVt-CQX~G9SsGE;k;?bnsPs+$L&9waji5*FyAf98H8 ZHU^MyNvqs@V|7Sg`oW@@yqrJR{{tnb*(3k} diff --git a/test/unit/test_evaluate.py b/test/unit/test_evaluate.py index 1508d54..cdfcc1b 100644 --- a/test/unit/test_evaluate.py +++ 
b/test/unit/test_evaluate.py @@ -366,3 +366,203 @@ def test_compute_metrics_returns_all_expected_keys(): for key in ('pearson_r', 'spearman_r', 'log1p_rmse', 'precision', 'recall', 'f1', 'jaccard', 'specificity'): assert key in m, f"Missing key: {key}" + + +# --------------------------------------------------------------------------- +# compute_per_cell_metrics +# --------------------------------------------------------------------------- + +def test_compute_per_cell_metrics_basic(): + truth = {'c1': {'A': 5, 'B': 3}, 'c2': {'A': 1}} + obs = {'c1': {'A': 4}, 'c2': {'A': 1, 'B': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1', 'c2'], ['A', 'B']) + assert len(rows) == 2 + assert rows[0]['cell_id'] == 'c1' + assert rows[0]['n_truth_expressed'] == 2 + assert rows[1]['n_observed_expressed'] == 2 + + +def test_compute_per_cell_metrics_empty_cell(): + truth = {'c1': {}} + obs = {} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A']) + assert rows[0]['n_truth_expressed'] == 0 + assert rows[0]['n_observed_expressed'] == 0 + + +def test_compute_per_cell_metrics_returns_all_keys(): + truth = {'c1': {'A': 3, 'B': 1, 'C': 2}} + obs = {'c1': {'A': 3, 'B': 1, 'C': 2}} + rows = ev.compute_per_cell_metrics(truth, obs, ['c1'], ['A', 'B', 'C']) + assert 'pearson_r' in rows[0] + assert 'spearman_r' in rows[0] + assert rows[0]['pearson_r'] == pytest.approx(1.0, abs=1e-4) + + +# --------------------------------------------------------------------------- +# load_benchmark +# --------------------------------------------------------------------------- + +def test_load_benchmark_existing_file(tmp_path): + bench = tmp_path / 'bench.txt' + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n' + '12.3\t11.2\t500.0\t100.0\t200.0\n') + result = ev.load_benchmark(str(bench)) + assert result['wall_time_s'] == '12.3' + assert result['max_rss_mb'] == '500.0' + assert result['io_in_mb'] == '100.0' + + +def test_load_benchmark_missing_path(): + result = 
ev.load_benchmark('/nonexistent/path.txt') + assert result == {} + + +def test_load_benchmark_none(): + assert ev.load_benchmark(None) == {} + + +# --------------------------------------------------------------------------- +# write_tsv +# --------------------------------------------------------------------------- + +def test_write_tsv_basic(tmp_path): + rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + out = tmp_path / 'out.tsv' + ev.write_tsv(rows, str(out)) + with open(out) as fh: + lines = fh.readlines() + assert lines[0].strip() == 'a\tb' + assert lines[1].strip() == '1\t2' + assert lines[2].strip() == '3\t4' + + +def test_write_tsv_empty_with_fallback_fields(tmp_path): + out = tmp_path / 'out.tsv' + ev.write_tsv([], str(out), fallback_fields=['x', 'y']) + with open(out) as fh: + content = fh.read() + assert 'x\ty' in content + + +# --------------------------------------------------------------------------- +# load_ground_truth default granularity fallback +# --------------------------------------------------------------------------- + +def test_load_ground_truth_unknown_granularity_falls_back(tmp_path): + p = make_ground_truth(tmp_path, GT_ROWS) + truth, _ = ev.load_ground_truth(str(p), granularity='unknown_gran') + # falls back to repeat_id key (same as gene_id) + assert 'AluSz6' in truth['c1'] + + +# --------------------------------------------------------------------------- +# main() end-to-end integration tests +# --------------------------------------------------------------------------- + +def _write_gt(path): + path.write_text( + 'cell_id\tlocus_id\trepeat_id\tfamily_id\tclass_id\ttrue_count\n' + 'c1\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t5\n' + 'c1\tL1PA2_dup1\tL1PA2\tL1\tLINE\t3\n' + 'c2\tAluSz6_dup1\tAluSz6\tAlu\tSINE\t2\n' + ) + + +def _write_counts(path): + path.write_text( + 'feature_id\tc1\tc2\n' + 'AluSz6\t4\t2\n' + 'L1PA2\t3\t0\n' + ) + + +def test_evaluate_main_creates_output_files(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + 
counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'test_aligner', + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + assert os.path.exists(prefix + '_per_cell_metrics.tsv') + assert os.path.exists(prefix + '_per_family_metrics.tsv') + + +def test_evaluate_main_global_metrics_content(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + _write_gt(gt) + _write_counts(counts) + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'starsolo', + '--multimapper-mode', 'unique', + '--granularity', 'gene_id', + '--feature-set', 'repeats', + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert len(rows) == 1 + assert rows[0]['aligner'] == 'starsolo' + assert rows[0]['granularity'] == 'gene_id' + + +def test_evaluate_main_with_locus_map(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + lm = tmp_path / 'locus_map.tsv' + _write_gt(gt) + _write_counts(counts) + lm.write_text('AluSz6_dup1\tAluSz6\tAlu\tSINE\nL1PA2_dup1\tL1PA2\tL1\tLINE\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'alevin', + '--locus-map', str(lm), + '--output-prefix', prefix, + ]) + ev.main() + assert os.path.exists(prefix + '_global_metrics.tsv') + + +def test_evaluate_main_with_benchmark(tmp_path, monkeypatch): + import sys + gt = tmp_path / 'gt.tsv' + counts = tmp_path / 'counts.tsv' + bench = tmp_path / 'bench.txt' + 
_write_gt(gt) + _write_counts(counts) + bench.write_text('s\tcpu_time\tmax_rss\tio_in\tio_out\n5.0\t4.5\t300.0\t50.0\t80.0\n') + prefix = str(tmp_path / 'out') + monkeypatch.setattr(sys, 'argv', [ + 'evaluate.py', + '--ground-truth', str(gt), + '--observed-counts', str(counts), + '--aligner', 'kallisto', + '--benchmark', str(bench), + '--output-prefix', prefix, + ]) + ev.main() + import csv as _csv + with open(prefix + '_global_metrics.tsv') as fh: + rows = list(_csv.DictReader(fh, delimiter='\t')) + assert rows[0]['wall_time_s'] == '5.0' diff --git a/test/unit/test_simulate_reads.py b/test/unit/test_simulate_reads.py index 0e81b1a..ec32c2e 100644 --- a/test/unit/test_simulate_reads.py +++ b/test/unit/test_simulate_reads.py @@ -297,3 +297,136 @@ def test_parse_gtf_repeats_by_chrom_allowed_chroms(tmp_path): intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), allowed_chroms={'chr1'}) assert 'chr1' in intervals assert 'chr2' not in intervals + + +# --------------------------------------------------------------------------- +# parse_gtf_repeats_by_chrom – additional branch coverage +# --------------------------------------------------------------------------- + +def test_parse_gtf_repeats_by_chrom_skips_comment_lines(tmp_path): + gtf_content = ( + '# this is a header comment\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 'chr1' in intervals + assert len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_skips_short_lines(tmp_path): + gtf_content = ( + 'too\tfew\tfields\n' + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "AluSz6"; transcript_id "AluSz6_dup1"; ' + 'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf)) + assert 
len(intervals['chr1']) == 1 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_truncates(tmp_path): + lines = '' + for i in range(5): + start = 101 + i * 300 + end = start + 300 + lines += ( + f'chr1\trmsk\texon\t{start}\t{end}\t.\t+\t.\t' + f'gene_id "R{i}"; transcript_id "R{i}_dup1"; ' + f'family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(lines) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=2) + assert len(intervals['chr1']) == 2 + + +def test_parse_gtf_repeats_by_chrom_max_per_chrom_no_truncation(tmp_path): + # max_per_chrom larger than count -> no sampling + gtf_content = ( + 'chr1\trmsk\texon\t101\t500\t.\t+\t.\t' + 'gene_id "A"; transcript_id "A_dup1"; family_id "Alu"; class_id "SINE";\n' + ) + gtf = tmp_path / 'repeats.gtf' + gtf.write_text(gtf_content) + intervals = sr.parse_gtf_repeats_by_chrom(str(gtf), max_per_chrom=10) + assert len(intervals['chr1']) == 1 + + +# --------------------------------------------------------------------------- +# stream_fasta_by_chrom +# --------------------------------------------------------------------------- + +def test_stream_fasta_by_chrom_plain(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGTACGT\n>chr2\nTTTTGGGG\n>chr3\nAAAAAAAA\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1', 'chr3'})) + assert result['chr1'] == 'ACGTACGT' + assert result['chr3'] == 'AAAAAAAA' + assert 'chr2' not in result + + +def test_stream_fasta_by_chrom_multiline_seq(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nACGT\nACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def test_stream_fasta_by_chrom_gzip(tmp_path): + fa_gz = tmp_path / 'test.fa.gz' + import gzip as _gz + with _gz.open(str(fa_gz), 'wt') as f: + f.write('>chr1\nACGTACGT\n') + result = dict(sr.stream_fasta_by_chrom(str(fa_gz), {'chr1'})) + assert result['chr1'] == 'ACGTACGT' + + +def 
test_stream_fasta_by_chrom_skips_unwanted(tmp_path): + fa = tmp_path / 'test.fa' + fa.write_text('>chr1\nAAAA\n>chr99\nCCCC\n') + result = dict(sr.stream_fasta_by_chrom(str(fa), {'chr1'})) + assert 'chr99' not in result + + +# --------------------------------------------------------------------------- +# make_qual and safe_id +# --------------------------------------------------------------------------- + +def test_make_qual_default_char(): + assert sr.make_qual(5) == 'FFFFF' + assert len(sr.make_qual(90)) == 90 + + +def test_make_qual_custom_char(): + assert sr.make_qual(3, 'I') == 'III' + + +def test_safe_id_replaces_spaces_and_slashes(): + assert sr.safe_id('LINE/SINE foo') == 'LINE_SINE_foo' + + +def test_safe_id_no_change(): + assert sr.safe_id('AluSz6_dup1') == 'AluSz6_dup1' + + +# --------------------------------------------------------------------------- +# build_chrom_locus_coords +# --------------------------------------------------------------------------- + +def test_build_chrom_locus_coords_basic(): + plan = { + 'c1': {'locus_A': (3, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+')}, + 'c2': {'locus_A': (2, 'gA', 'fA', 'cA', 'chr1', 100, 200, '+'), + 'locus_B': (5, 'gB', 'fB', 'cB', 'chr2', 500, 700, '-')}, + } + result = sr.build_chrom_locus_coords(plan) + assert 'chr1' in result + assert result['chr1']['locus_A'] == (100, 200, '+', 'gA') + assert 'chr2' in result + assert result['chr2']['locus_B'] == (500, 700, '-', 'gB') diff --git a/workflow/scripts/evaluation_report.Rmd b/workflow/scripts/evaluation_report.Rmd index 08c9f43..dd97006 100644 --- a/workflow/scripts/evaluation_report.Rmd +++ b/workflow/scripts/evaluation_report.Rmd @@ -13,7 +13,7 @@ params: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 12, fig.height = 12) library(ggplot2) library(dplyr) @@ -315,7 +315,7 @@ if (!is.null(per_family) && nrow(per_family) > 0 
&& "class_id" %in% names(per_fa ## compute resources -```{r resources, fig.height=14, fig.width=18} +```{r resources, fig.height=10, fig.width=14} res_cols <- intersect(c("wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb"), names(global)) if (length(res_cols) > 0 && any(!is.na(global[res_cols]))) { n_al5 <- length(unique(global$aligner_mode)) From af9c7308fd9277316411f4d435bd1259584568cb Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:19:28 +0100 Subject: [PATCH 05/12] Track extra tests, remove testing twice --- .github/workflows/tests.yml | 6 +- test/unit/test_count_starsolo_locus.py | 200 +++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 test/unit/test_count_starsolo_locus.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 656c49a..ae5a84e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,8 +2,6 @@ name: Tests on: workflow_dispatch: - push: - branches: [master, dev] pull_request: branches: [master, dev] @@ -50,7 +48,7 @@ jobs: -c bioconda -c conda-forge python=3.11 - "snakemake>=8" + snakemake>=8 init-shell: bash - name: Dry-run main Snakefile with SmartSeq2 config @@ -106,7 +104,7 @@ jobs: -c bioconda -c conda-forge python=3.11 - "snakemake>=8" + snakemake>=8 init-shell: bash - name: Run negative control workflow diff --git a/test/unit/test_count_starsolo_locus.py b/test/unit/test_count_starsolo_locus.py new file mode 100644 index 0000000..0589c1c --- /dev/null +++ b/test/unit/test_count_starsolo_locus.py @@ -0,0 +1,200 @@ +""" +Unit tests for count_starsolo_locus.py pure functions and mocked subprocess paths. 
+""" +import os +import sys +import textwrap +from unittest.mock import MagicMock, patch + +import pytest + +import count_starsolo_locus as csl + + +# --------------------------------------------------------------------------- +# parse_locus_id +# --------------------------------------------------------------------------- + +def test_parse_locus_id_plus_strand(): + result = csl.parse_locus_id("AluSz_dup1::chr10:10000-10435(+)") + assert result == ("chr10", 10000, 10435, "+") + + +def test_parse_locus_id_minus_strand(): + result = csl.parse_locus_id("L1M4_dup3::10:65000-65900(-)") + assert result == ("10", 65000, 65900, "-") + + +def test_parse_locus_id_malformed_returns_none(): + assert csl.parse_locus_id("no_double_colon") is None + assert csl.parse_locus_id("bad::coords") is None + + +# --------------------------------------------------------------------------- +# load_intervals +# --------------------------------------------------------------------------- + +def test_load_intervals_basic(tmp_path): + fa = tmp_path / "test.fa" + fa.write_text( + ">AluSz_dup1::10:10000-10435(+)\nACGT\n" + ">L1M4_dup3::10:65000-65900(-)\nACGT\n" + ) + chrom_starts, chrom_intervals = csl.load_intervals(str(fa)) + assert "10" in chrom_intervals + assert len(chrom_intervals["10"]) == 2 + # sorted by start + assert chrom_intervals["10"][0][0] == 10000 + assert chrom_intervals["10"][1][0] == 65000 + assert chrom_starts["10"] == [10000, 65000] + + +def test_load_intervals_skips_non_header_lines(tmp_path): + fa = tmp_path / "test.fa" + fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGTACGT\n") + _, intervals = csl.load_intervals(str(fa)) + assert len(intervals["10"]) == 1 + assert intervals["10"][0][2] == "AluSz_dup1" + + +def test_load_intervals_deduplicates(tmp_path): + fa = tmp_path / "test.fa" + fa.write_text( + ">dup1::10:100-200(+)\nACGT\n" + ">dup1::10:100-200(+)\nACGT\n" + ) + _, intervals = csl.load_intervals(str(fa)) + assert len(intervals["10"]) == 1 + + +# 
--------------------------------------------------------------------------- +# find_locus +# --------------------------------------------------------------------------- + +def make_intervals(): + chrom_intervals = {"10": [(10000, 10200, "AluSz_dup1"), (20000, 20500, "L1_dup2")]} + chrom_starts = {"10": [10000, 20000]} + return chrom_starts, chrom_intervals + + +def test_find_locus_hit_first(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10100, cs, ci) == "AluSz_dup1" + + +def test_find_locus_hit_second(): + cs, ci = make_intervals() + assert csl.find_locus("10", 20300, cs, ci) == "L1_dup2" + + +def test_find_locus_boundary_start(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10000, cs, ci) == "AluSz_dup1" + + +def test_find_locus_boundary_end_exclusive(): + cs, ci = make_intervals() + assert csl.find_locus("10", 10200, cs, ci) is None + + +def test_find_locus_before_all(): + cs, ci = make_intervals() + assert csl.find_locus("10", 5000, cs, ci) is None + + +def test_find_locus_between_intervals(): + cs, ci = make_intervals() + assert csl.find_locus("10", 15000, cs, ci) is None + + +def test_find_locus_unknown_chrom(): + cs, ci = make_intervals() + assert csl.find_locus("chrX", 10100, cs, ci) is None + + +# --------------------------------------------------------------------------- +# process_smartseq2 with mocked subprocess +# --------------------------------------------------------------------------- + +def make_fasta(tmp_path): + fa = tmp_path / "rep.fa" + fa.write_text(">AluSz_dup1::10:10000-10200(+)\nACGT\n>L1_dup2::10:20000-20500(+)\nACGT\n") + return str(fa) + + +def _mock_proc(lines): + proc = MagicMock() + proc.stdout = iter([l.encode() for l in lines]) + return proc + + +def test_process_smartseq2_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + 
"cell_001_r2_AluSz_dup1\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + "cell_002_r3_L1_dup2\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert "cell_001" in cbs + assert "cell_002" in cbs + assert counts["cell_001"]["AluSz_dup1"] == 2 + assert counts["cell_002"]["L1_dup2"] == 1 + + +def test_process_smartseq2_unique_filter(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + # NH:i:3 = multimapper, should be dropped in unique mode + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "unique") + assert len(cbs) == 0 + + +def test_process_smartseq2_multi_mode_keeps_multimappers(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "cell_001_r1_AluSz_dup1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:3\n", + ] + with patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)): + counts, cbs, loci = csl.process_smartseq2("/fake.bam", chrom_starts, chrom_intervals, "multi") + assert "cell_001" in cbs + + +# --------------------------------------------------------------------------- +# process_chromium with mocked subprocess +# --------------------------------------------------------------------------- + +def test_process_chromium_assigns_reads(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:GGATGACGAAGG\n", + "r2\t0\t10\t10080\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:ACGAACGAAGTGAGCT\tUB:Z:TTTTTTTTTAAA\n", + 
"r3\t0\t10\t20100\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\tCB:Z:CTAAGCCACCCTGAAG\tUB:Z:AAAAAAAAAAAA\n", + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake_sorted.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert "ACGAACGAAGTGAGCT" in cbs + assert "CTAAGCCACCCTGAAG" in cbs + # UMI deduplication: 2 distinct UMIs for cell1/AluSz_dup1 + assert counts["ACGAACGAAGTGAGCT"]["AluSz_dup1"] == 2 + assert counts["CTAAGCCACCCTGAAG"]["L1_dup2"] == 1 + + +def test_process_chromium_drops_no_cb(tmp_path): + chrom_starts, chrom_intervals = csl.load_intervals(make_fasta(tmp_path)) + sam = [ + "r1\t0\t10\t10050\t255\t90M\t*\t0\t0\tACGT\tFFFF\tNH:i:1\n", # no CB tag + ] + with patch("count_starsolo_locus.sort_bam_by_cb", return_value="/tmp/fake.bam"), \ + patch("count_starsolo_locus.subprocess.Popen", return_value=_mock_proc(sam)), \ + patch("os.unlink"): + counts, cbs, loci = csl.process_chromium( + "/fake.bam", chrom_starts, chrom_intervals, "unique", 1) + assert len(cbs) == 0 From ca33b200a8b91e7adf8ec82674c05c00b5b01fa1 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:23:21 +0100 Subject: [PATCH 06/12] Fix starsolo sam flags regression on smartseq2, add tests --- .github/workflows/tests.yml | 8 ++++---- workflow/modules/starsolo.snmk | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ae5a84e..f4042db 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,7 +58,7 @@ jobs: snakemake \ --configfile configs/simulation_smartseq2.yaml \ --dry-run \ - --quiet + --quiet --cores 2 - name: Dry-run main Snakefile with Chromium config shell: bash -el {0} @@ -67,7 +67,7 @@ jobs: snakemake \ --configfile configs/simulation_chromium.yaml \ --dry-run \ - --quiet + 
--quiet --cores 2 - name: Dry-run test Snakefile with negative control config shell: bash -el {0} @@ -77,7 +77,7 @@ jobs: -s ../test/workflow/Snakefile_test \ --configfile ../test/workflow/configs/test_negative_control.yaml \ --dry-run \ - --quiet + --quiet --cores 2 - name: Run workflow dry-run pytest tests shell: bash -el {0} @@ -115,5 +115,5 @@ jobs: -s ../test/workflow/Snakefile_test \ --configfile ../test/workflow/configs/test_negative_control.yaml \ --use-conda \ - --cores 4 \ + --cores 2 \ --conda-frontend mamba diff --git a/workflow/modules/starsolo.snmk b/workflow/modules/starsolo.snmk index da16717..50416d7 100644 --- a/workflow/modules/starsolo.snmk +++ b/workflow/modules/starsolo.snmk @@ -97,7 +97,7 @@ rule starsolo_smartseq2: --soloCellReadStats None \ --soloUMIdedup NoDedup \ --soloMultiMappers {params.solo_multimappers} \ - --outSAMattributes NH HI AS NM CB UB \ + --outSAMattributes NH HI AS NM \ --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM {params.max_ram_bytes} \ {params.extra_args} 2> {log} From 818f952180405f4ca86aad1994c096579fde2b26 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:30:29 +0100 Subject: [PATCH 07/12] Attempt to remove hardcoded home path --- test/workflow/configs/test_negative_control.yaml | 10 +++++----- workflow/Snakefile | 3 +++ workflow/configs/colon_cancer_cell_lines.yaml | 10 +++++----- workflow/configs/real_data_chromium.yaml | 16 ++++++++-------- workflow/configs/simulation.yaml | 16 ++++++++-------- workflow/configs/simulation_chromium.yaml | 10 +++++----- workflow/configs/simulation_smartseq2.yaml | 10 +++++----- 7 files changed, 39 insertions(+), 36 deletions(-) diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml index c35bf3e..754bba2 100644 --- a/test/workflow/configs/test_negative_control.yaml +++ b/test/workflow/configs/test_negative_control.yaml @@ -5,8 +5,8 @@ # - only a small number of cells (5) to keep runtime low # 
- testing.negative_control_max_recall sets the recall threshold -base: "/home/imallona/repeats/results/test_negative_control" -indices_base: "/home/imallona/repeats/results/shared" +base: "~/repeats/results/test_negative_control" +indices_base: "~/repeats/results/shared" reference: assembly: hg38 @@ -14,9 +14,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/Snakefile b/workflow/Snakefile index ab05a91..d904f0b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,6 +16,9 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' if 'indices_base' not in config: config['indices_base'] = config['base'] +config['base'] = op.expanduser(config['base']) +config['indices_base'] = op.expanduser(config['indices_base']) + include: "modules/download_references.snmk" include: "modules/data_acquisition.snmk" include: "modules/reference.snmk" diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/colon_cancer_cell_lines.yaml index df4f533..16cf23a 100644 --- a/workflow/configs/colon_cancer_cell_lines.yaml +++ b/workflow/configs/colon_cancer_cell_lines.yaml @@ -1,16 +1,16 @@ sample: "colon_cancer_cell_lines" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: 
"/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,4 +21,4 @@ extraStarSoloArgs: "" ## solo #################################################################### soloType: "CB_UMI_Simple" -readFilesManifest: "/home/imallona/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file +readFilesManifest: "~/src/repeats/workflow/configs/colon_cancer_cell_lines_manifest.tsv" \ No newline at end of file diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/real_data_chromium.yaml index 422e8a1..77f0de9 100644 --- a/workflow/configs/real_data_chromium.yaml +++ b/workflow/configs/real_data_chromium.yaml @@ -1,9 +1,9 @@ -base: "/home/imallona/repeats/real_chromium" +base: "~/repeats/real_chromium" reference: - genome_fasta: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" - repeats_gtf: "/home/imallona/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" + genome_fasta: "~/repeats/refs/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" + repeats_gtf: "~/repeats/refs/GRCh38_Ensembl_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/refs/Homo_sapiens.GRCh38.112.chr.gtf.gz" filter_genic: true mode: real_data @@ -13,11 +13,11 @@ real_data: chemistry: 10xv3 samples: hct116_wt: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" - R2: 
"/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974767_2.fastq.gz" hct116_dko: - R1: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" - R2: "/home/imallona/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" + R1: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_1.fastq.gz" + R2: "~/repeats/data/colon_cancer_cell_lines/SRR10974769_2.fastq.gz" aligners: - starsolo diff --git a/workflow/configs/simulation.yaml b/workflow/configs/simulation.yaml index 24747bd..20db5f8 100644 --- a/workflow/configs/simulation.yaml +++ b/workflow/configs/simulation.yaml @@ -1,16 +1,16 @@ sample: "test_sample_name" ## base_path -base: "/home/imallona/repeats/" +base: "~/repeats/" aligners: ['starsolo'] # https://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_Ensembl_rmsk_TE.gtf.gz -repeats_gtf: "/home/imallona/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" +repeats_gtf: "~/repeats/GRCh38_Ensembl_rmsk_TE.gtf.gz" # https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.chr.gtf.gz -genes_gtf: "/home/imallona/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" +genes_gtf: "~/repeats/Homo_sapiens.GRCh38.112.chr.gtf.gz" # http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -genome_fasta_gz: "/home/imallona/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" +genome_fasta_gz: "~/repeats/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" max_mem_mb: 20000 ## these are megabytes @@ -21,8 +21,8 @@ extraStarSoloArgs: "" ## solo ####################################################################3 soloType: "SmartSeq" ## generated by a simulation but anyway -#smartseq_cdna_fastq: "/home/imallona/repeats/simulations/cdna.fastq.gz" -readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" +#smartseq_cdna_fastq: 
"~/repeats/simulations/cdna.fastq.gz" +readFilesManifest: "~/repeats/simulations/manifest.tsv" # run_tech: "chromium" @@ -42,8 +42,8 @@ readFilesManifest: "/home/imallona/repeats/simulations/manifest.tsv" # featurecounts_parsing: "~/src/repeats_sc/04_snakemake/plot_featurecounts_profile.R" # software: # Rscript: "/usr/local/R/R-4.0.5/bin/Rscript" -# bowtie_build: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie-build" -# bowtie: "/home/imallona/soft/bowtie/bowtie-1.2.3/bowtie" +# bowtie_build: "~/soft/bowtie/bowtie-1.2.3/bowtie-build" +# bowtie: "~/soft/bowtie/bowtie-1.2.3/bowtie" # pigz: "/usr/bin/pigz" # star: "~/soft/star/STAR-2.7.3a/source/STAR" # featurecounts: "~/soft/subread/subread-2.0.0-source/bin/featureCounts" diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index ea12b78..7c3bfc6 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_chromium" -indices_base: "/home/imallona/repeats/results/shared" +base: "~/repeats/results/simulation_chromium" +indices_base: "~/repeats/results/shared" reference: assembly: hg38 @@ -8,9 +8,9 @@ reference: # Split repeats into genic/intergenic subsets using gene body overlap (bedtools) filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "~/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "~/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2.yaml 
b/workflow/configs/simulation_smartseq2.yaml index b9a7698..9c65f9d 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -1,5 +1,5 @@ -base: "/home/imallona/repeats/results/simulation_smartseq2" -indices_base: "/home/imallona/repeats/results/shared" +base: "~/repeats/results/simulation_smartseq2" +indices_base: "~/repeats/results/shared" reference: assembly: hg38 @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "/home/imallona/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" mode: simulation From 6752e900abba16bd5a89b0f571118bae89626fa9 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:36:03 +0100 Subject: [PATCH 08/12] Switch to relative paths --- test/workflow/configs/test_negative_control.yaml | 10 +++++----- workflow/Snakefile | 3 +-- workflow/configs/simulation_chromium.yaml | 10 +++++----- workflow/configs/simulation_smartseq2.yaml | 10 +++++----- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml index 754bba2..e0c6498 100644 --- a/test/workflow/configs/test_negative_control.yaml +++ b/test/workflow/configs/test_negative_control.yaml @@ -5,8 +5,8 @@ # - only a small number of cells (5) to keep runtime low # - testing.negative_control_max_recall sets the recall threshold -base: "~/repeats/results/test_negative_control" -indices_base: 
"~/repeats/results/shared" +base: "../results/test_negative_control" +indices_base: "../results/shared" reference: assembly: hg38 @@ -14,9 +14,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/Snakefile b/workflow/Snakefile index d904f0b..a605307 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,8 +16,7 @@ genome_tag = '_'.join(sorted(_chroms)) if _chroms else 'all' if 'indices_base' not in config: config['indices_base'] = config['base'] -config['base'] = op.expanduser(config['base']) -config['indices_base'] = op.expanduser(config['indices_base']) + include: "modules/download_references.snmk" include: "modules/data_acquisition.snmk" diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index 7c3bfc6..3a49b4f 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -1,5 +1,5 @@ -base: "~/repeats/results/simulation_chromium" -indices_base: "~/repeats/results/shared" +base: "../results/simulation_chromium" +indices_base: "../results/shared" reference: assembly: hg38 @@ -8,9 +8,9 @@ reference: # Split repeats into genic/intergenic subsets using gene body overlap (bedtools) filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "~/repeats/results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "~/repeats/results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - 
genes_gtf: "~/repeats/results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index 9c65f9d..8d54f10 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -1,5 +1,5 @@ -base: "~/repeats/results/simulation_smartseq2" -indices_base: "~/repeats/results/shared" +base: "../results/simulation_smartseq2" +indices_base: "../results/shared" reference: assembly: hg38 @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "~/repeats/results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "~/repeats/results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "~/repeats/results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" mode: simulation From 4f2f53f8f10bd6df4ab51a2b1113df622326ed84 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 13:38:45 +0100 Subject: [PATCH 09/12] Negative controls path --- test/workflow/configs/test_negative_control.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml index e0c6498..204865f 100644 --- a/test/workflow/configs/test_negative_control.yaml +++ b/test/workflow/configs/test_negative_control.yaml @@ -14,9 +14,9 @@ reference: chromosomes: ["chr10"] filter_genic: true 
rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/test_negative_control/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/test_negative_control/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/test_negative_control/refs/GRCh38.112.genes.gtf.gz" mode: simulation From 7125ed7c7edc1b85daa0178c6a33583eea83dc0b Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 15:00:16 +0100 Subject: [PATCH 10/12] Add sim_mutation_rate to simulations --- .github/workflows/tests.yml | 3 + Makefile | 123 +++++++ .../{ => old}/colon_cancer_cell_lines.yaml | 0 .../configs/{ => old}/real_data_chromium.yaml | 0 workflow/configs/{ => old}/simulation.yaml | 0 workflow/configs/simulation_chromium.yaml | 1 + .../simulation_chromium_noise_0pct.yaml | 59 ++++ .../simulation_chromium_noise_10pct.yaml | 59 ++++ .../simulation_chromium_noise_1pct.yaml | 59 ++++ .../simulation_chromium_noise_5pct.yaml | 59 ++++ workflow/configs/simulation_smartseq2.yaml | 1 + .../simulation_smartseq2_noise_0pct.yaml | 57 ++++ .../simulation_smartseq2_noise_10pct.yaml | 57 ++++ .../simulation_smartseq2_noise_1pct.yaml | 57 ++++ .../simulation_smartseq2_noise_5pct.yaml | 57 ++++ workflow/modules/evaluation.snmk | 6 +- workflow/modules/simulations.snmk | 8 +- workflow/scripts/evaluate.py | 5 + workflow/scripts/noise_sweep_report.Rmd | 303 ++++++++++++++++++ workflow/scripts/simulate_reads.py | 18 +- 20 files changed, 921 insertions(+), 11 deletions(-) create mode 100644 Makefile rename workflow/configs/{ => old}/colon_cancer_cell_lines.yaml (100%) rename workflow/configs/{ => old}/real_data_chromium.yaml (100%) rename workflow/configs/{ => old}/simulation.yaml (100%) create mode 100644 workflow/configs/simulation_chromium_noise_0pct.yaml create 
mode 100644 workflow/configs/simulation_chromium_noise_10pct.yaml create mode 100644 workflow/configs/simulation_chromium_noise_1pct.yaml create mode 100644 workflow/configs/simulation_chromium_noise_5pct.yaml create mode 100644 workflow/configs/simulation_smartseq2_noise_0pct.yaml create mode 100644 workflow/configs/simulation_smartseq2_noise_10pct.yaml create mode 100644 workflow/configs/simulation_smartseq2_noise_1pct.yaml create mode 100644 workflow/configs/simulation_smartseq2_noise_5pct.yaml create mode 100644 workflow/scripts/noise_sweep_report.Rmd diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f4042db..2bf6715 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,6 +49,9 @@ jobs: -c conda-forge python=3.11 snakemake>=8 + pytest + pytest-cov + scipy init-shell: bash - name: Dry-run main Snakefile with SmartSeq2 config diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5af3948 --- /dev/null +++ b/Makefile @@ -0,0 +1,123 @@ +SHELL := /bin/bash +CORES ?= 10 +WF := workflow +RESULTS := results + +CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake && +SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) +RSCRIPT := cd $(WF) && $(CONDA_RUN) Rscript + +# noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param) +NOISE_EDIRS_SS2 := ../$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation +NOISE_EDIRS_CHR := ../$(RESULTS)/simulation_chromium_noise_0pct/evaluation,../$(RESULTS)/simulation_chromium_noise_1pct/evaluation,../$(RESULTS)/simulation_chromium_noise_5pct/evaluation,../$(RESULTS)/simulation_chromium_noise_10pct/evaluation + +# output HTML files (relative to project root) +NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html 
+NOISE_REPORT_CHR := $(RESULTS)/noise_sweep_chromium.html + +.PHONY: all \ + simulation_smartseq2 simulation_chromium \ + noise_smartseq2 noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct \ + noise_chromium noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct \ + report_noise_smartseq2 report_noise_chromium reports_noise \ + help + +# ----------------------------------------------------------------------- +# base simulations +# ----------------------------------------------------------------------- + +simulation_smartseq2: + $(SM) --configfile configs/simulation_smartseq2.yaml + +simulation_chromium: + $(SM) --configfile configs/simulation_chromium.yaml + +# ----------------------------------------------------------------------- +# noise sweep — SmartSeq2 +# note: genome refs are reused from the base simulation_smartseq2 run; +# alignment indices are shared via results/shared/ (never rebuilt) +# ----------------------------------------------------------------------- + +noise_smartseq2_0pct: + $(SM) --configfile configs/simulation_smartseq2_noise_0pct.yaml + +noise_smartseq2_1pct: + $(SM) --configfile configs/simulation_smartseq2_noise_1pct.yaml + +noise_smartseq2_5pct: + $(SM) --configfile configs/simulation_smartseq2_noise_5pct.yaml + +noise_smartseq2_10pct: + $(SM) --configfile configs/simulation_smartseq2_noise_10pct.yaml + +noise_smartseq2: noise_smartseq2_0pct noise_smartseq2_1pct noise_smartseq2_5pct noise_smartseq2_10pct + +# ----------------------------------------------------------------------- +# noise sweep — Chromium +# ----------------------------------------------------------------------- + +noise_chromium_0pct: + $(SM) --configfile configs/simulation_chromium_noise_0pct.yaml + +noise_chromium_1pct: + $(SM) --configfile configs/simulation_chromium_noise_1pct.yaml + +noise_chromium_5pct: + $(SM) --configfile configs/simulation_chromium_noise_5pct.yaml + +noise_chromium_10pct: + $(SM) 
--configfile configs/simulation_chromium_noise_10pct.yaml + +noise_chromium: noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct noise_chromium_10pct + +# ----------------------------------------------------------------------- +# noise sweep reports +# file targets: only (re)rendered when the HTML does not yet exist; +# use make -B report_noise_smartseq2 to force a re-render +# ----------------------------------------------------------------------- + +$(NOISE_REPORT_SS2): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \ + output_file = '../$(NOISE_REPORT_SS2)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))" + +$(NOISE_REPORT_CHR): + mkdir -p $(RESULTS) + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \ + output_file = '../$(NOISE_REPORT_CHR)', \ + params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))" + +report_noise_smartseq2: $(NOISE_REPORT_SS2) +report_noise_chromium: $(NOISE_REPORT_CHR) +reports_noise: report_noise_smartseq2 report_noise_chromium + +# ----------------------------------------------------------------------- +# all +# ----------------------------------------------------------------------- + +all: simulation_smartseq2 simulation_chromium noise_smartseq2 noise_chromium reports_noise + +# ----------------------------------------------------------------------- +# help +# ----------------------------------------------------------------------- + +help: + @echo "Usage: make [target] [CORES=N] (default CORES=$(CORES))" + @echo "" + @echo "Base simulation runs:" + @echo " simulation_smartseq2 SmartSeq2 base run (mutation_rate 0.1%)" + @echo " simulation_chromium Chromium base run (mutation_rate 0.1%)" + @echo "" + @echo "Noise sweep runs (base simulation must have been run first):" + @echo " noise_smartseq2 all 4 noise levels for SmartSeq2" + @echo " noise_smartseq2_{0,1,5,10}pct individual SmartSeq2 noise level" + @echo " noise_chromium all 4 noise levels for Chromium" + @echo " 
noise_chromium_{0,1,5,10}pct individual Chromium noise level" + @echo "" + @echo "Reports (file targets; use -B to force re-render):" + @echo " report_noise_smartseq2 $(NOISE_REPORT_SS2)" + @echo " report_noise_chromium $(NOISE_REPORT_CHR)" + @echo " reports_noise both noise sweep reports" + @echo "" + @echo " all run everything in sequence" diff --git a/workflow/configs/colon_cancer_cell_lines.yaml b/workflow/configs/old/colon_cancer_cell_lines.yaml similarity index 100% rename from workflow/configs/colon_cancer_cell_lines.yaml rename to workflow/configs/old/colon_cancer_cell_lines.yaml diff --git a/workflow/configs/real_data_chromium.yaml b/workflow/configs/old/real_data_chromium.yaml similarity index 100% rename from workflow/configs/real_data_chromium.yaml rename to workflow/configs/old/real_data_chromium.yaml diff --git a/workflow/configs/simulation.yaml b/workflow/configs/old/simulation.yaml similarity index 100% rename from workflow/configs/simulation.yaml rename to workflow/configs/old/simulation.yaml diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index 3a49b4f..04ae340 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -22,6 +22,7 @@ simulation: cb_length: 16 umi_length: 12 seed: 42 + mutation_rate: 0.001 feature_sets: - repeats diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml new file mode 100644 index 0000000..1aea43a --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_0pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + 
genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 50 + n_expressed_per_cell: 80 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml new file mode 100644 index 0000000..b8a3b0a --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_10pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 50 + n_expressed_per_cell: 80 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.10 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + 
extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml new file mode 100644 index 0000000..2d52c8a --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_1pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 50 + n_expressed_per_cell: 80 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml new file mode 100644 index 0000000..228f030 --- /dev/null +++ b/workflow/configs/simulation_chromium_noise_5pct.yaml @@ -0,0 +1,59 @@ +base: "../results/simulation_chromium_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + 
filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: chromium + n_cells: 50 + n_expressed_per_cell: 80 + read_length: 90 + cb_length: 16 + umi_length: 12 + seed: 42 + mutation_rate: 0.05 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +granularities: + - locus + - gene_id + - family_id + - class_id + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique", "multi"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 50 + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index 8d54f10..54beaee 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -19,6 +19,7 @@ simulation: n_expressed_per_cell: 100 read_length: 90 seed: 42 + mutation_rate: 0.001 # Feature sets to quantify against: # genes - Ensembl gene annotation diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml new file mode 100644 index 0000000..ba95edb --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_0pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: 
"../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 20 + n_expressed_per_cell: 100 + read_length: 90 + seed: 42 + mutation_rate: 0.0 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml new file mode 100644 index 0000000..b7d9d5c --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_10pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 20 + n_expressed_per_cell: 100 + read_length: 90 + seed: 42 + mutation_rate: 0.10 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + 
multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml new file mode 100644 index 0000000..f38d904 --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_1pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: "112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 20 + n_expressed_per_cell: 100 + read_length: 90 + seed: 42 + mutation_rate: 0.01 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml new file mode 100644 index 0000000..ef69956 --- /dev/null +++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml @@ -0,0 +1,57 @@ +base: "../results/simulation_smartseq2_noise_5pct" +indices_base: "../results/shared" + +reference: + assembly: hg38 + ensembl_release: 
"112" + chromosomes: ["chr10"] + filter_genic: true + rmsk_source: ucsc_flatfile + genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + +mode: simulation + +simulation: + technology: smartseq2 + n_cells: 20 + n_expressed_per_cell: 100 + read_length: 90 + seed: 42 + mutation_rate: 0.05 + +feature_sets: + - repeats + - genes + - genic_repeats + - intergenic_repeats + +aligners: + - starsolo + - kallisto + - alevin + - bowtie2 + +aligner_params: + starsolo: + multimapper_modes: ["unique"] + extra_args: "" + kallisto: + extra_args: "" + alevin: + extra_args: "--minAssignedFrags 1" + bowtie2: + multimapper_modes: ["unique", "multi"] + extra_args: "" + featurecounts_cell_chunk_size: 20 + +granularities: + - locus + - gene_id + - family_id + - class_id + +resources: + max_threads: 10 + max_mem_mb: 20000 diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk index a83c559..c0d78b4 100644 --- a/workflow/modules/evaluation.snmk +++ b/workflow/modules/evaluation.snmk @@ -21,6 +21,7 @@ if sim_technology == 'smartseq2' and 'multi' in starsolo_modes: bowtie2_modes = config.get('aligner_params', {}).get('bowtie2', {}).get( 'multimapper_modes', ['unique']) granularities = config.get('granularities', ['family_id']) +sim_mutation_rate = config.get('simulation', {}).get('mutation_rate', 0.0) evaluation_script = op.join(workflow.basedir, 'scripts', 'evaluate.py') counts_dir = op.join(config['base'], 'counts') eval_dir = op.join(config['base'], 'evaluation') @@ -81,6 +82,7 @@ rule evaluate_combination: granularity = 'locus|gene_id|family_id|class_id', multimapper_mode = 'unique|multi' params: + mutation_rate = sim_mutation_rate, benchmark_file = eval_benchmark, output_prefix = op.join(eval_dir, '{aligner}_{feature_set}_{granularity}_{multimapper_mode}') @@ -98,7 +100,8 @@ rule 
evaluate_combination: --feature-set {wildcards.feature_set} \ --locus-map {input.locus_map} \ --benchmark {params.benchmark_file} \ - --output-prefix {params.output_prefix} 2> {log} + --output-prefix {params.output_prefix} \ + --mutation-rate {params.mutation_rate} 2> {log} """ rule aggregate_global_metrics: """Concatenate all per-aligner global metric TSVs into one summary file.""" @@ -176,6 +179,7 @@ rule render_report: output: report = op.join(eval_dir, 'evaluation_report.html') params: + mutation_rate = sim_mutation_rate, eval_dir = eval_dir, rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd') log: diff --git a/workflow/modules/simulations.snmk b/workflow/modules/simulations.snmk index 443e291..1c231c1 100644 --- a/workflow/modules/simulations.snmk +++ b/workflow/modules/simulations.snmk @@ -38,6 +38,7 @@ rule simulate_smartseq2: n_expressed = sim_cfg.get('n_expressed_per_cell', 100), read_length = sim_cfg.get('read_length', 90), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -57,7 +58,8 @@ rule simulate_smartseq2: --read-length {params.read_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + {params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ @@ -80,6 +82,7 @@ rule simulate_chromium: cb_length = sim_cfg.get('cb_length', 16), umi_length = sim_cfg.get('umi_length', 12), seed = sim_seed, + mutation_rate = sim_cfg.get('mutation_rate', 0.001), chrom_args = '--chromosomes ' + ' '.join(sim_chromosomes) if sim_chromosomes else '', max_per_chrom_arg = ( f'--max-repeats-per-chrom {sim_cfg["max_repeats_per_chrom"]}' @@ -101,5 +104,6 @@ rule simulate_chromium: --umi-length {params.umi_length} \ --seed {params.seed} \ {params.chrom_args} \ - {params.max_per_chrom_arg} 2> {log} + 
{params.max_per_chrom_arg} \ + --mutation-rate {params.mutation_rate} 2> {log} """ diff --git a/workflow/scripts/evaluate.py b/workflow/scripts/evaluate.py index 3cc074b..8f868b6 100644 --- a/workflow/scripts/evaluate.py +++ b/workflow/scripts/evaluate.py @@ -228,6 +228,8 @@ def main(): help='4-col TSV (transcript_id, gene_id, family_id, class_id). ' 'Expands the feature universe for specificity and restricts ' 'ground truth to this feature_set\'s loci.') + ap.add_argument('--mutation-rate', type=float, default=0.0, + help='Mutation rate used in the simulation (for output labelling)') ap.add_argument('--output-prefix', required=True) args = ap.parse_args() @@ -277,6 +279,7 @@ def main(): global_metrics['multimapper_mode'] = args.multimapper_mode global_metrics['feature_set'] = args.feature_set global_metrics['granularity'] = args.granularity + global_metrics['mutation_rate'] = args.mutation_rate bench = load_benchmark(args.benchmark) global_metrics.update(bench) @@ -289,6 +292,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate write_tsv(per_cell_rows, args.output_prefix + '_per_cell_metrics.tsv', fallback_fields=['cell_id', 'pearson_r', 'spearman_r', 'n_truth_expressed', 'n_observed_expressed', @@ -310,6 +314,7 @@ def main(): row['multimapper_mode'] = args.multimapper_mode row['feature_set'] = args.feature_set row['granularity'] = args.granularity + row['mutation_rate'] = args.mutation_rate per_family_rows.append(row) write_tsv(per_family_rows, args.output_prefix + '_per_family_metrics.tsv', fallback_fields=['class_id', 'aligner', 'multimapper_mode', 'feature_set', diff --git a/workflow/scripts/noise_sweep_report.Rmd b/workflow/scripts/noise_sweep_report.Rmd new file mode 100644 index 0000000..c1a9602 --- /dev/null +++ b/workflow/scripts/noise_sweep_report.Rmd @@ -0,0 +1,303 @@ +--- +title: "repeat element quantification - noise 
sweep report" +author: "Izaskun Mallona" +date: "`r Sys.Date()`" +output: + html_document: + theme: flatly + toc: true + toc_float: true + code_folding: hide +params: + eval_dirs: "." +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, + fig.width = 14, fig.height = 10) +library(ggplot2) +library(dplyr) +library(tidyr) +library(readr) +library(scales) +library(patchwork) + +make_palette <- function(n) { + if (n <= 8) return(RColorBrewer::brewer.pal(max(3, n), "Dark2")[seq_len(n)]) + colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(n) +} + +eval_dirs <- strsplit(params$eval_dirs, ",")[[1]] +eval_dirs <- trimws(eval_dirs) +``` + +```{r load-data} +load_metrics_from_dirs <- function(dirs, filename) { + purrr::map_dfr(dirs, function(d) { + p <- file.path(d, filename) + if (!file.exists(p)) return(NULL) + read_tsv(p, show_col_types = FALSE) + }) +} + +load_pattern_from_dirs <- function(dirs, pattern) { + purrr::map_dfr(dirs, function(d) { + files <- list.files(d, pattern = pattern, full.names = TRUE) + if (length(files) == 0) return(NULL) + purrr::map_dfr(files, ~ read_tsv(.x, show_col_types = FALSE)) + }) +} + +global <- load_metrics_from_dirs(eval_dirs, "summary_global_metrics.tsv") + +num_cols <- c("pearson_r", "spearman_r", "log1p_rmse", + "precision", "recall", "f1", "jaccard", "specificity", + "n_cells", "n_features", "mutation_rate", + "wall_time_s", "cpu_time_s", "max_rss_mb", "io_in_mb", "io_out_mb") +for (col in num_cols) { + if (col %in% names(global)) + global[[col]] <- suppressWarnings(as.numeric(global[[col]])) +} + +if (!"granularity" %in% names(global)) global$granularity <- "gene_id" +if (!"feature_set" %in% names(global)) global$feature_set <- "repeats" +if (!"multimapper_mode" %in% names(global)) global$multimapper_mode <- "unique" +if (!"mutation_rate" %in% names(global)) global$mutation_rate <- NA_real_ + +global <- global %>% + mutate( + aligner_mode = paste0(aligner, " (", 
multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%") + ) + +per_cell <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_cell_metrics\\.tsv$"), + error = function(e) NULL) +per_family <- tryCatch(load_pattern_from_dirs(eval_dirs, "_per_family_metrics\\.tsv$"), + error = function(e) NULL) + +for (col in c("pearson_r", "spearman_r", "mutation_rate")) { + if (!is.null(per_cell) && col %in% names(per_cell)) + per_cell[[col]] <- suppressWarnings(as.numeric(per_cell[[col]])) + if (!is.null(per_family) && col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) +} + +if (!is.null(per_family) && nrow(per_family) > 0) { + for (col in c("f1", "precision", "recall", "jaccard", "specificity")) { + if (col %in% names(per_family)) + per_family[[col]] <- suppressWarnings(as.numeric(per_family[[col]])) + } + if (!"multimapper_mode" %in% names(per_family)) per_family$multimapper_mode <- "unique" + per_family <- per_family %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")")) +} +``` + +--- + +This report combines evaluation results across multiple noise levels (simulated nucleotide +substitution rates). Each `eval_dirs` entry corresponds to one pipeline run with a distinct +`mutation_rate` in the simulation config. The `mutation_rate` column in the metrics TSVs +identifies each run. + +Noise levels found in the data: `r sort(unique(global$mutation_rate))` + +--- + +## metric degradation by noise level + +How each aligner's accuracy changes as the per-base mutation rate increases. +x-axis = mutation rate, y-axis = metric value, colour = aligner (+ multimapper mode). 
+ +```{r degradation-main, fig.height=20, fig.width=14} +acc_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity") + +deg <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, all_of(acc_metrics)) %>% + pivot_longer(all_of(acc_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = acc_metrics)) + +n_al <- length(unique(deg$aligner_mode)) +pal <- make_palette(n_al) + +ggplot(deg, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2.5, alpha = 0.85) + + geom_line(linewidth = 0.6) + + facet_grid(metric ~ feature_set + gran_label, scales = "free_y") + + scale_colour_manual(values = pal, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + labs(x = "mutation rate", y = NULL, + title = "metric degradation with increasing noise", + subtitle = "rows = metric; columns = feature_set x granularity; x = mutation rate") + + theme_bw(base_size = 9) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 7), + legend.position = "bottom", + legend.text = element_text(size = 8)) +``` + +## pearson r and F1 — noise overview + +Summary view focusing on the two most interpretable metrics. 
+ +```{r overview-two-metrics, fig.height=10, fig.width=14} +ov <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, pearson_r, f1) %>% + pivot_longer(c(pearson_r, f1), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) + +n_al2 <- length(unique(ov$aligner_mode)) +pal2 <- make_palette(n_al2) + +ggplot(ov, aes(x = mutation_rate, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 3, alpha = 0.85) + + geom_line(linewidth = 0.8) + + facet_grid(metric ~ feature_set, scales = "free_y") + + scale_colour_manual(values = pal2, name = "aligner (mode)") + + scale_x_continuous(labels = percent_format(accuracy = 0.1)) + + ylim(0, 1) + + labs(x = "mutation rate", y = NULL, + title = "Pearson r and F1 across noise levels", + subtitle = "rows = metric, columns = feature_set; lines connect same aligner across mutation rates") + + theme_bw(base_size = 11) + + theme(axis.text.x = element_text(angle = 25, hjust = 1), + legend.position = "bottom") +``` + +## global accuracy metrics per noise level + +Full metric grid from the standard report, faceted by mutation rate. 
+ +```{r global-faceted, fig.height=22, fig.width=16} +full_metrics <- c("pearson_r", "spearman_r", "f1", "precision", "recall", + "jaccard", "specificity", "log1p_rmse") + +full_long <- global %>% + filter(!is.na(mutation_rate)) %>% + select(aligner_mode, feature_set, gran_label, mutation_rate, noise_label, + all_of(full_metrics)) %>% + pivot_longer(all_of(full_metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) %>% + mutate(metric = factor(metric, levels = full_metrics)) + +n_al3 <- length(unique(full_long$aligner_mode)) +pal3 <- make_palette(n_al3) + +ggplot(full_long, aes(x = gran_label, y = value, + colour = aligner_mode, group = aligner_mode)) + + geom_point(size = 2, alpha = 0.8) + + geom_line(linetype = "dashed", linewidth = 0.5) + + facet_grid(metric ~ noise_label + feature_set, scales = "free_y") + + scale_colour_manual(values = pal3, name = "aligner (mode)") + + labs(x = "granularity", y = NULL, + title = "global accuracy metrics", + subtitle = "rows = metric; columns = mutation rate x feature_set; x = granularity") + + theme_bw(base_size = 8) + + theme(aspect.ratio = 1, + axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + strip.text = element_text(size = 6), + legend.position = "bottom", + legend.text = element_text(size = 7)) +``` + +## per-cell correlations by noise level + +```{r per-cell-noise, fig.height=10, fig.width=14} +if (!is.null(per_cell) && nrow(per_cell) > 0 && "mutation_rate" %in% names(per_cell)) { + pc <- per_cell %>% + mutate(across(c(pearson_r, spearman_r), ~ suppressWarnings(as.numeric(.)))) %>% + filter(!is.na(aligner), !is.na(mutation_rate)) + + if (!"multimapper_mode" %in% names(pc)) pc$multimapper_mode <- "unique" + if (!"feature_set" %in% names(pc)) pc$feature_set <- "repeats" + + pc <- pc %>% + mutate(aligner_mode = paste0(aligner, " (", multimapper_mode, ")"), + gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = 
paste0(mutation_rate * 100, "%")) + + pc_long <- pc %>% + pivot_longer(c(pearson_r, spearman_r), names_to = "metric", values_to = "r") %>% + filter(!is.na(r)) + + n_al4 <- length(unique(pc_long$aligner_mode)) + pal4 <- make_palette(n_al4) + + ggplot(pc_long, aes(x = aligner_mode, y = r, fill = aligner_mode)) + + geom_violin(alpha = 0.7, draw_quantiles = c(0.25, 0.5, 0.75)) + + geom_jitter(width = 0.1, size = 0.5, alpha = 0.4) + + facet_grid(metric + gran_label ~ noise_label + feature_set) + + scale_fill_manual(values = pal4) + + labs(x = NULL, y = "correlation", + title = "per-cell correlations by noise level", + subtitle = "rows = metric x granularity; columns = mutation rate x feature_set") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), + legend.position = "none", + strip.text = element_text(size = 6)) +} else { + cat("No per-cell metrics with mutation_rate column found.\n") +} +``` + +## per-repeat-class F1 by noise level + +```{r per-family-noise, fig.height=14, fig.width=14} +if (!is.null(per_family) && nrow(per_family) > 0 && + "class_id" %in% names(per_family) && "mutation_rate" %in% names(per_family)) { + + pf <- per_family %>% + filter(!is.na(class_id), class_id != "unknown", !is.na(mutation_rate)) %>% + mutate(gran_label = factor(granularity, + levels = c("locus", "gene_id", "family_id", "class_id")), + noise_label = paste0(mutation_rate * 100, "%")) + + n_al5 <- length(unique(pf$aligner_mode)) + pal5 <- make_palette(n_al5) + + p_f1 <- ggplot(pf, aes(x = class_id, y = f1, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "F1", + title = "F1 per repeat class by noise level", + subtitle = "rows = granularity; columns = mutation rate x feature_set") 
+ + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "bottom", + legend.text = element_text(size = 7), + strip.text = element_text(size = 6)) + + p_r <- ggplot(pf, aes(x = class_id, y = pearson_r, + colour = aligner_mode, group = aligner_mode)) + + geom_line(linewidth = 0.4, linetype = "dashed") + + geom_point(size = 1.5, alpha = 0.85) + + facet_grid(gran_label ~ noise_label + feature_set) + + scale_y_sqrt() + + scale_colour_manual(values = pal5, name = "Aligner (mode)") + + labs(x = "repeat class", y = "pearson r") + + theme_bw(base_size = 8) + + theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 7), + legend.position = "none", + strip.text = element_text(size = 6)) + + p_f1 / p_r +} else { + cat("No per-family metrics with class_id and mutation_rate found.\n") +} +``` + +--- +*Generated by [noise_sweep_report.Rmd](../workflow/scripts/noise_sweep_report.Rmd)* diff --git a/workflow/scripts/simulate_reads.py b/workflow/scripts/simulate_reads.py index d2b8d55..b322655 100644 --- a/workflow/scripts/simulate_reads.py +++ b/workflow/scripts/simulate_reads.py @@ -111,14 +111,14 @@ def reverse_complement(seq): return seq.translate(table)[::-1] -def sample_subseq(seq, read_length, rng): +def sample_subseq(seq, read_length, rng, mutation_rate=0.001): if len(seq) <= read_length: return (seq + 'N' * read_length)[:read_length] offset = rng.randint(0, len(seq) - read_length) read = seq[offset:offset + read_length] bases = list(read) for i in range(len(bases)): - if rng.random() < 0.001: + if rng.random() < mutation_rate: bases[i] = rng.choice('ACGT') return ''.join(bases) @@ -178,7 +178,7 @@ def build_chrom_locus_coords(cell_plan): return chrom_loci -def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): +def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) 
chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -200,7 +200,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): fq = cell_handles[cell_id] read_idx = cell_read_counts[cell_id] for i in range(count): - read = sample_subseq(repeat_seq, read_length, rng) + read = sample_subseq(repeat_seq, read_length, rng, mutation_rate) fq.write(f'@{cell_id}_r{read_idx + i}_{safe_id(locus_id)}\n' f'{read}\n+\n{make_qual(len(read))}\n') cell_read_counts[cell_id] += count @@ -215,7 +215,7 @@ def simulate_smartseq2(fasta_path, cell_plan, read_length, outdir, rng): return cell_fastq_paths, ground_truth -def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng): +def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_length, outdir, rng, mutation_rate=0.001): os.makedirs(outdir, exist_ok=True) locus_to_cells = build_locus_to_cells(cell_plan) chrom_locus_coords = build_chrom_locus_coords(cell_plan) @@ -263,7 +263,7 @@ def simulate_chromium(fasta_path, cell_plan, read_length, barcode_length, umi_le used_umis.add(umi) read_id = f'r{total_reads}_{barcode[:8]}_{umi}_{safe_id(locus_id)}' r1_seq = barcode + umi - r2_seq = sample_subseq(repeat_seq, read_length, rng) + r2_seq = sample_subseq(repeat_seq, read_length, rng, mutation_rate) r1f.write(f'@{read_id}\n{r1_seq}\n+\n{make_qual(len(r1_seq))}\n') r2f.write(f'@{read_id}\n{r2_seq}\n+\n{make_qual(len(r2_seq))}\n') total_reads += 1 @@ -323,6 +323,8 @@ def main(): ap.add_argument('--max-repeats-per-chrom', type=int, default=None, help='Cap intervals per chromosome to limit memory use on full genomes') ap.add_argument('--seed', type=int, default=42) + ap.add_argument('--mutation-rate', type=float, default=0.001, + help='Per-base substitution probability for simulated reads (default 0.001)') args = ap.parse_args() rng = random.Random(args.seed) @@ -353,7 +355,7 @@ def main(): if args.mode == 'smartseq2': print(f'Simulating SmartSeq2 reads (streaming 
FASTA by chrom)', file=sys.stderr) cell_fastq_paths, ground_truth = simulate_smartseq2( - args.fasta, cell_plan, args.read_length, args.outdir, rng) + args.fasta, cell_plan, args.read_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_id') manifest_path = os.path.join(args.outdir, 'manifest.tsv') write_smartseq2_manifest(cell_fastq_paths, manifest_path) @@ -363,7 +365,7 @@ def main(): print(f'Simulating Chromium reads (streaming FASTA by chrom)', file=sys.stderr) (r1, r2), ground_truth = simulate_chromium( args.fasta, cell_plan, args.read_length, - args.cb_length, args.umi_length, args.outdir, rng) + args.cb_length, args.umi_length, args.outdir, rng, mutation_rate=args.mutation_rate) write_ground_truth(ground_truth, ground_truth_path, cell_column='cell_barcode') print(f'R1: {r1}', file=sys.stderr) print(f'R2: {r2}', file=sys.stderr) From 6c6edcc0e615625f634663bdaa76c14ca22b46c3 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 16:16:07 +0100 Subject: [PATCH 11/12] Cache genome/indices across noise runs; make all reports by default --- Makefile | 15 ++++++++------- workflow/configs/simulation_chromium.yaml | 6 +++--- .../configs/simulation_chromium_noise_0pct.yaml | 6 +++--- .../configs/simulation_chromium_noise_10pct.yaml | 6 +++--- .../configs/simulation_chromium_noise_1pct.yaml | 6 +++--- .../configs/simulation_chromium_noise_5pct.yaml | 6 +++--- workflow/configs/simulation_smartseq2.yaml | 6 +++--- .../configs/simulation_smartseq2_noise_0pct.yaml | 6 +++--- .../configs/simulation_smartseq2_noise_10pct.yaml | 6 +++--- .../configs/simulation_smartseq2_noise_1pct.yaml | 6 +++--- .../configs/simulation_smartseq2_noise_5pct.yaml | 6 +++--- workflow/modules/download_references.snmk | 2 +- workflow/modules/evaluation.snmk | 6 ++++-- workflow/modules/reference.snmk | 8 ++++---- 14 files changed, 47 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index 
5af3948..cc9a316 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,16 @@ SHELL := /bin/bash +.DEFAULT_GOAL := all CORES ?= 10 WF := workflow RESULTS := results CONDA_RUN := source ~/miniconda3/etc/profile.d/conda.sh && conda activate snakemake && -SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) +SM := cd $(WF) && $(CONDA_RUN) snakemake --use-conda --cores $(CORES) --rerun-triggers mtime RSCRIPT := cd $(WF) && $(CONDA_RUN) Rscript # noise sweep eval dirs (relative to workflow/, comma-separated for the Rmd param) -NOISE_EDIRS_SS2 := ../$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,../$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation -NOISE_EDIRS_CHR := ../$(RESULTS)/simulation_chromium_noise_0pct/evaluation,../$(RESULTS)/simulation_chromium_noise_1pct/evaluation,../$(RESULTS)/simulation_chromium_noise_5pct/evaluation,../$(RESULTS)/simulation_chromium_noise_10pct/evaluation +NOISE_EDIRS_SS2 := $(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_smartseq2_noise_10pct/evaluation +NOISE_EDIRS_CHR := $(CURDIR)/$(RESULTS)/simulation_chromium_noise_0pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_1pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_5pct/evaluation,$(CURDIR)/$(RESULTS)/simulation_chromium_noise_10pct/evaluation # output HTML files (relative to project root) NOISE_REPORT_SS2 := $(RESULTS)/noise_sweep_smartseq2.html @@ -78,14 +79,14 @@ noise_chromium: noise_chromium_0pct noise_chromium_1pct noise_chromium_5pct nois $(NOISE_REPORT_SS2): mkdir -p $(RESULTS) - $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \ - output_file = '../$(NOISE_REPORT_SS2)', \ + $(RSCRIPT) -e 
"rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_SS2)', \ params = list(eval_dirs = '$(NOISE_EDIRS_SS2)'))" $(NOISE_REPORT_CHR): mkdir -p $(RESULTS) - $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', \ - output_file = '../$(NOISE_REPORT_CHR)', \ + $(RSCRIPT) -e "rmarkdown::render('scripts/noise_sweep_report.Rmd', knit_root_dir = getwd(), \ + output_file = '$(CURDIR)/$(NOISE_REPORT_CHR)', \ params = list(eval_dirs = '$(NOISE_EDIRS_CHR)'))" report_noise_smartseq2: $(NOISE_REPORT_SS2) diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index 04ae340..2c36df7 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -8,9 +8,9 @@ reference: # Split repeats into genic/intergenic subsets using gene body overlap (bedtools) filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml index 1aea43a..7fd2909 100644 --- a/workflow/configs/simulation_chromium_noise_0pct.yaml +++ b/workflow/configs/simulation_chromium_noise_0pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: 
"../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml index b8a3b0a..2f359ec 100644 --- a/workflow/configs/simulation_chromium_noise_10pct.yaml +++ b/workflow/configs/simulation_chromium_noise_10pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml index 2d52c8a..da11b20 100644 --- a/workflow/configs/simulation_chromium_noise_1pct.yaml +++ b/workflow/configs/simulation_chromium_noise_1pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml 
b/workflow/configs/simulation_chromium_noise_5pct.yaml index 228f030..3f63a64 100644 --- a/workflow/configs/simulation_chromium_noise_5pct.yaml +++ b/workflow/configs/simulation_chromium_noise_5pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_chromium/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_chromium/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_chromium/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index 54beaee..4d3b4d7 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml index ba95edb..2480849 100644 --- a/workflow/configs/simulation_smartseq2_noise_0pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: 
"../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml index b7d9d5c..ee5983b 100644 --- a/workflow/configs/simulation_smartseq2_noise_10pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml index f38d904..fee65d1 100644 --- a/workflow/configs/simulation_smartseq2_noise_1pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation 
diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml index ef69956..8b9ff40 100644 --- a/workflow/configs/simulation_smartseq2_noise_5pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml @@ -7,9 +7,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/simulation_smartseq2/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/simulation_smartseq2/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/simulation_smartseq2/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/modules/download_references.snmk b/workflow/modules/download_references.snmk index 58b6533..0fe3a19 100644 --- a/workflow/modules/download_references.snmk +++ b/workflow/modules/download_references.snmk @@ -69,7 +69,7 @@ genome_fasta_url = urls_cfg.get('genome_fasta', default_genome_fasta_url) genes_gtf_url = urls_cfg.get('genes_gtf', default_genes_gtf_url) rmsk_flatfile_url = urls_cfg.get('rmsk_flatfile', default_rmsk_flatfile_url) -refs_dir = op.join(config['base'], 'refs') +refs_dir = op.join(config.get('indices_base', config['base']), 'refs') build_rmsk_script = op.join(workflow.basedir, 'scripts', 'build_rmsk_gtf.py') diff --git a/workflow/modules/evaluation.snmk b/workflow/modules/evaluation.snmk index c0d78b4..9ae34bf 100644 --- a/workflow/modules/evaluation.snmk +++ b/workflow/modules/evaluation.snmk @@ -180,7 +180,8 @@ rule render_report: report = op.join(eval_dir, 'evaluation_report.html') params: mutation_rate = sim_mutation_rate, - eval_dir = eval_dir, + eval_dir = op.abspath(eval_dir), + abs_report = op.abspath(op.join(eval_dir, 'evaluation_report.html')), rmd = op.join(workflow.basedir, 'scripts', 'evaluation_report.Rmd') 
log: op.join(config['base'], 'logs', 'render_report.log') @@ -188,7 +189,8 @@ rule render_report: """ Rscript -e "rmarkdown::render( '{params.rmd}', - output_file = '{output.report}', + output_file = '{params.abs_report}', + knit_root_dir = getwd(), params = list(eval_dir = '{params.eval_dir}') )" 2> {log} """ diff --git a/workflow/modules/reference.snmk b/workflow/modules/reference.snmk index cd00f3c..0dae94a 100644 --- a/workflow/modules/reference.snmk +++ b/workflow/modules/reference.snmk @@ -66,7 +66,7 @@ def t2g_path(wildcards): rule decompress_genome: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genome_fasta'] - output: temp(op.join(config['base'], 'tmp', 'genome.fa')) + output: op.join(config['indices_base'], 'tmp', 'genome.fa') params: chroms = _chrom_list threads: workflow.cores @@ -84,7 +84,7 @@ rule decompress_genome: rule decompress_repeats_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['repeats_gtf'] - output: temp(op.join(config['base'], 'tmp', 'repeats.gtf')) + output: op.join(config['indices_base'], 'tmp', 'repeats.gtf') threads: workflow.cores shell: "pigz --decompress --keep --stdout --processes {threads} {input} | sed 's/^chr//' > {output}" @@ -92,7 +92,7 @@ rule decompress_repeats_gtf: rule decompress_genes_gtf: conda: op.join(workflow.basedir, 'envs', 'star.yaml') input: config['reference']['genes_gtf'] - output: temp(op.join(config['base'], 'tmp', 'genes.gtf')) + output: op.join(config['indices_base'], 'tmp', 'genes.gtf') params: chroms = _chrom_list threads: workflow.cores @@ -196,7 +196,7 @@ rule extract_feature_set_fasta: gtf = gtf_for_repeat_feature_set output: fasta = op.join(config['indices_base'], 'indices', genome_tag, '{feature_set}.fa'), - bed = temp(op.join(config['base'], 'tmp', '{feature_set}.bed')) + bed = temp(op.join(config['indices_base'], 'tmp', '{feature_set}.bed')) wildcard_constraints: feature_set = 'repeats|genic_repeats|intergenic_repeats' log: 
From 642c388d7dafc667b1db8e9b4aba0e880d329322 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 23 Mar 2026 17:07:26 +0100 Subject: [PATCH 12/12] Largely increase the amount of simulated cells --- .github/workflows/tests.yml | 3 + README.md | 56 +++++++++++++++---- .../configs/test_negative_control.yaml | 6 +- workflow/configs/simulation_chromium.yaml | 4 +- .../simulation_chromium_noise_0pct.yaml | 4 +- .../simulation_chromium_noise_10pct.yaml | 4 +- .../simulation_chromium_noise_1pct.yaml | 4 +- .../simulation_chromium_noise_5pct.yaml | 4 +- workflow/configs/simulation_smartseq2.yaml | 4 +- .../simulation_smartseq2_noise_0pct.yaml | 4 +- .../simulation_smartseq2_noise_10pct.yaml | 4 +- .../simulation_smartseq2_noise_1pct.yaml | 4 +- .../simulation_smartseq2_noise_5pct.yaml | 4 +- 13 files changed, 71 insertions(+), 34 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2bf6715..847d3e3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -61,6 +61,7 @@ jobs: snakemake \ --configfile configs/simulation_smartseq2.yaml \ --dry-run \ + --rerun-triggers mtime \ --quiet --cores 2 - name: Dry-run main Snakefile with Chromium config @@ -70,6 +71,7 @@ jobs: snakemake \ --configfile configs/simulation_chromium.yaml \ --dry-run \ + --rerun-triggers mtime \ --quiet --cores 2 - name: Dry-run test Snakefile with negative control config @@ -80,6 +82,7 @@ jobs: -s ../test/workflow/Snakefile_test \ --configfile ../test/workflow/configs/test_negative_control.yaml \ --dry-run \ + --rerun-triggers mtime \ --quiet --cores 2 - name: Run workflow dry-run pytest tests diff --git a/README.md b/README.md index 6bc5ef0..09ef24f 100644 --- a/README.md +++ b/README.md @@ -46,30 +46,64 @@ Exact package pins (platform-locked explicit exports) are in `workflow/envs/expl ## Running +### Via Makefile (recommended) + +A `Makefile` at the project root orchestrates all configs and renders reports. 
+ +``` +# Run everything (all simulations + noise sweep + reports) +make + +# Individual targets +make simulation_smartseq2 # SmartSeq2 base simulation +make simulation_chromium # Chromium base simulation +make noise_smartseq2 # SmartSeq2 noise sweep (0%, 1%, 5%, 10% mutation rate) +make noise_chromium # Chromium noise sweep +make report_noise_smartseq2 # Render SmartSeq2 noise sweep HTML report +make report_noise_chromium # Render Chromium noise sweep HTML report +make help # List all targets +``` + +Tune parallelism with `make CORES=20`. The Makefile activates the `snakemake` conda +environment automatically. + +### Manually + ``` source ~/miniconda3/etc/profile.d/conda.sh conda activate snakemake cd workflow -snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 -snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 +snakemake --configfile configs/simulation_smartseq2.yaml --use-conda --cores 10 --rerun-triggers mtime +snakemake --configfile configs/simulation_chromium.yaml --use-conda --cores 10 --rerun-triggers mtime ``` -The report is written to `{base}/evaluation/evaluation_report.html` as defined in -the config file. +The per-run evaluation report is written to `{base}/evaluation/evaluation_report.html`. 
## Configuration -Two reference configs are provided under `workflow/configs/`: +Simulation configs are under `workflow/configs/`: + +| File | Technology | Cells | Expressed loci/cell | Mutation rate | +|---|---|---|---|---| +| `simulation_smartseq2.yaml` | SmartSeq2 | 500 | 1000 | 0.1% | +| `simulation_chromium.yaml` | 10x Chromium | 500 | 1000 | 0.1% | +| `simulation_smartseq2_noise_0pct.yaml` | SmartSeq2 | 500 | 1000 | 0% | +| `simulation_smartseq2_noise_1pct.yaml` | SmartSeq2 | 500 | 1000 | 1% | +| `simulation_smartseq2_noise_5pct.yaml` | SmartSeq2 | 500 | 1000 | 5% | +| `simulation_smartseq2_noise_10pct.yaml` | SmartSeq2 | 500 | 1000 | 10% | +| `simulation_chromium_noise_0pct.yaml` | 10x Chromium | 500 | 1000 | 0% | +| `simulation_chromium_noise_1pct.yaml` | 10x Chromium | 500 | 1000 | 1% | +| `simulation_chromium_noise_5pct.yaml` | 10x Chromium | 500 | 1000 | 5% | +| `simulation_chromium_noise_10pct.yaml` | 10x Chromium | 500 | 1000 | 10% | -| File | Technology | Cells | Chr subset | -|---|---|---|---| -| `simulation_smartseq2.yaml` | SmartSeq2 | 20 | chr10 | -| `simulation_chromium.yaml` | 10x Chromium | 50 | chr10 | +Unused real-data configs have been moved to `workflow/configs/old/`. Key parameters: -- `base`: output directory for all results. -- `indices_base`: directory for shared aligner indices (can be shared across runs). +- `base`: run-specific output directory. +- `indices_base`: shared directory for aligner indices and decompressed references. + All noise-sweep configs share the same `indices_base` so indices are built once. +- `simulation.mutation_rate`: per-base substitution rate applied to simulated reads. - `feature_sets`: which repeat subsets to quantify (`repeats`, `genic_repeats`, `intergenic_repeats`). - `granularities`: aggregation levels (`gene_id`, `family_id`, `class_id`). - `aligner_params.{aligner}.multimapper_modes`: `unique` (best hit only) or `multi` (EM/all-alignments). 
diff --git a/test/workflow/configs/test_negative_control.yaml b/test/workflow/configs/test_negative_control.yaml index 204865f..32f068f 100644 --- a/test/workflow/configs/test_negative_control.yaml +++ b/test/workflow/configs/test_negative_control.yaml @@ -14,9 +14,9 @@ reference: chromosomes: ["chr10"] filter_genic: true rmsk_source: ucsc_flatfile - genome_fasta: "../results/test_negative_control/refs/GRCh38.dna.primary_assembly.fa.gz" - repeats_gtf: "../results/test_negative_control/refs/hg38_rmsk_TE.gtf.gz" - genes_gtf: "../results/test_negative_control/refs/GRCh38.112.genes.gtf.gz" + genome_fasta: "../results/shared/refs/GRCh38.dna.primary_assembly.fa.gz" + repeats_gtf: "../results/shared/refs/hg38_rmsk_TE.gtf.gz" + genes_gtf: "../results/shared/refs/GRCh38.112.genes.gtf.gz" mode: simulation diff --git a/workflow/configs/simulation_chromium.yaml b/workflow/configs/simulation_chromium.yaml index 2c36df7..5a08495 100644 --- a/workflow/configs/simulation_chromium.yaml +++ b/workflow/configs/simulation_chromium.yaml @@ -16,8 +16,8 @@ mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 diff --git a/workflow/configs/simulation_chromium_noise_0pct.yaml b/workflow/configs/simulation_chromium_noise_0pct.yaml index 7fd2909..d098ee1 100644 --- a/workflow/configs/simulation_chromium_noise_0pct.yaml +++ b/workflow/configs/simulation_chromium_noise_0pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 diff --git a/workflow/configs/simulation_chromium_noise_10pct.yaml b/workflow/configs/simulation_chromium_noise_10pct.yaml index 2f359ec..07b1052 100644 --- a/workflow/configs/simulation_chromium_noise_10pct.yaml +++ b/workflow/configs/simulation_chromium_noise_10pct.yaml @@ -15,8 +15,8 @@ mode: 
simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 diff --git a/workflow/configs/simulation_chromium_noise_1pct.yaml b/workflow/configs/simulation_chromium_noise_1pct.yaml index da11b20..7837c70 100644 --- a/workflow/configs/simulation_chromium_noise_1pct.yaml +++ b/workflow/configs/simulation_chromium_noise_1pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 diff --git a/workflow/configs/simulation_chromium_noise_5pct.yaml b/workflow/configs/simulation_chromium_noise_5pct.yaml index 3f63a64..fadf484 100644 --- a/workflow/configs/simulation_chromium_noise_5pct.yaml +++ b/workflow/configs/simulation_chromium_noise_5pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: chromium - n_cells: 50 - n_expressed_per_cell: 80 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 cb_length: 16 umi_length: 12 diff --git a/workflow/configs/simulation_smartseq2.yaml b/workflow/configs/simulation_smartseq2.yaml index 4d3b4d7..541b150 100644 --- a/workflow/configs/simulation_smartseq2.yaml +++ b/workflow/configs/simulation_smartseq2.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 mutation_rate: 0.001 diff --git a/workflow/configs/simulation_smartseq2_noise_0pct.yaml b/workflow/configs/simulation_smartseq2_noise_0pct.yaml index 2480849..fcf2473 100644 --- a/workflow/configs/simulation_smartseq2_noise_0pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_0pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 
mutation_rate: 0.0 diff --git a/workflow/configs/simulation_smartseq2_noise_10pct.yaml b/workflow/configs/simulation_smartseq2_noise_10pct.yaml index ee5983b..ee49fda 100644 --- a/workflow/configs/simulation_smartseq2_noise_10pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_10pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 mutation_rate: 0.10 diff --git a/workflow/configs/simulation_smartseq2_noise_1pct.yaml b/workflow/configs/simulation_smartseq2_noise_1pct.yaml index fee65d1..e19d0cc 100644 --- a/workflow/configs/simulation_smartseq2_noise_1pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_1pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 mutation_rate: 0.01 diff --git a/workflow/configs/simulation_smartseq2_noise_5pct.yaml b/workflow/configs/simulation_smartseq2_noise_5pct.yaml index 8b9ff40..2c7756e 100644 --- a/workflow/configs/simulation_smartseq2_noise_5pct.yaml +++ b/workflow/configs/simulation_smartseq2_noise_5pct.yaml @@ -15,8 +15,8 @@ mode: simulation simulation: technology: smartseq2 - n_cells: 20 - n_expressed_per_cell: 100 + n_cells: 500 + n_expressed_per_cell: 1000 read_length: 90 seed: 42 mutation_rate: 0.05