From 743bf8e8a038c13f361aad3be4ead4e543298436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?= Date: Tue, 24 Feb 2026 11:26:49 -0800 Subject: [PATCH 1/3] fix(format-kegg-db): three bugs in format_kegg_database.py 1. Move scikit-bio imports inside functions (lazy import) - `from skbio import ...` was at module level, causing ImportError in Docker containers that do not have scikit-bio installed (e.g. python_pandas_hmmer_mmseqs2_pruned). The import is only needed when a gene_ko_link file is actually processed, so it is now placed inside the two functions that use it: process_kegg() and generate_modified_kegg_fasta(). 2. Fix MMseqs2 output database name (kegg.mmsdb, not kegg.<date>.mmsdb) - The database was written as kegg.<date>.mmsdb but modules/local/annotate/mmseqs_search.nf expects the file to be named exactly kegg.mmsdb (it constructs the path as ${db_name}.mmsdb where db_name is the parent directory name "kegg"). The date suffix caused a "No such file or directory" error at annotation time. 3. Fix --skip_gene_ko_link argparse definition - Using `type=bool` does NOT work as a flag: argparse passes the string "False" / "True" to bool(), and bool("False") == True. Replaced with `action="store_true"` so the flag behaves as intended. 
--- bin/format_kegg_database.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/format_kegg_database.py b/bin/format_kegg_database.py index 5287c994..24e906a9 100755 --- a/bin/format_kegg_database.py +++ b/bin/format_kegg_database.py @@ -5,7 +5,6 @@ from glob import glob import logging import subprocess -from skbio import write as write_sequence, read as read_sequence from collections import defaultdict import gzip import argparse @@ -64,6 +63,7 @@ def process_kegg( download_date = get_iso_date() if gene_ko_link_loc is not None and Path(gene_ko_link_loc).exists(): # add KOs to end of header where KO is not already there + from skbio import write as write_sequence kegg_mod_loc = path.join(output_dir, "kegg.mod.fa") write_sequence( generate_modified_kegg_fasta(kegg_loc, gene_ko_link_loc), @@ -73,7 +73,7 @@ def process_kegg( else: kegg_mod_loc = kegg_loc # make mmseqsdb from modified kegg fasta - kegg_mmseqs_db = path.join(output_dir, "kegg.%s.mmsdb" % download_date) + kegg_mmseqs_db = path.join(output_dir, "kegg.mmsdb") create_mmseqs( kegg_mod_loc, kegg_mmseqs_db, @@ -114,6 +114,7 @@ def generate_modified_kegg_fasta(kegg_fasta, gene_ko_link_loc=None): Takes kegg fasta file and gene ko link file, adds kos not already in headers to headers Whish I knew about this, oh well I may split this out. """ + from skbio import write as write_sequence, read as read_sequence genes_ko_dict = defaultdict(list) if gene_ko_link_loc is not None: if gene_ko_link_loc.endswith(".gz"): @@ -150,9 +151,8 @@ def main(): ) parser.add_argument( "--skip_gene_ko_link", - type=bool, + action="store_true", help="Skip gene KO link processing. 
If not passed in, `--gene_ko_link_loc` is required", - default=False, ) parser.add_argument( "--output_dir", type=str, help="Path to the output directory", default="kegg" From 3afbce83192f05c8c9e2d16744828fc04bd2642f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?= Date: Tue, 24 Feb 2026 11:26:58 -0800 Subject: [PATCH 2/3] fix(format-kegg-db): two bugs in format_kegg_db.nf 1. Replace container that lacks mmseqs2 - FORMAT_KEGG_DB used python_scikit-bio_scipy which does not include mmseqs2. The process calls mmseqs createdb / createindex, so it fails immediately with "No such file or directory: mmseqs". Replaced with python_pandas_hmmer_mmseqs2_pruned, which already carries mmseqs2 and is used by other annotation processes in the pipeline. 2. Fix bash condition for skip_gene_ko_link - The Nextflow value passed to the process is the string "0" or "1" (see dram.nf). In bash, `if [ "0" ]` evaluates to TRUE because any non-empty string is truthy. FORMAT_KEGG_DB therefore always ran the --skip_gene_ko_link branch, ignoring the gene_ko_link file. 
Fixed with an explicit string comparison: if [ "${skip_gene_ko_link}" = "true" ] (see companion fix in workflows/dram.nf) --- modules/local/database/format_kegg_db.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/database/format_kegg_db.nf b/modules/local/database/format_kegg_db.nf index 0e4c0971..efff0280 100644 --- a/modules/local/database/format_kegg_db.nf +++ b/modules/local/database/format_kegg_db.nf @@ -4,7 +4,7 @@ process FORMAT_KEGG_DB { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_scikit-bio_scipy:0f89a100e990daf2" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" tag { ch_kegg_pep } @@ -19,7 +19,7 @@ process FORMAT_KEGG_DB { script: """ - if [ ${skip_gene_ko_link} ]; then + if [ "${skip_gene_ko_link}" = "true" ]; then echo "No Gene KO Link file provided. Running KEGG DB formatting without" format_kegg_database.py --kegg_loc ${ch_kegg_pep} --download_date ${kegg_download_date} --threads ${params.threads} --output_dir kegg --skip_gene_ko_link else From f4b23944e345b0f0834923c981b4a971736073a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?= Date: Tue, 24 Feb 2026 11:27:06 -0800 Subject: [PATCH 3/3] fix(dram): pass boolean string to FORMAT_KEGG_DB skip_gene_ko_link The companion fix for the bash condition in format_kegg_db.nf requires that skip_gene_ko_link be the string "true" or "false" rather than the integer 1 or 0. In bash: `if [ "0" ]` -> true (non-empty string) `if [ "false" ]` -> true (still non-empty - also wrong) The correct pattern used in format_kegg_db.nf is: `if [ "${skip_gene_ko_link}" = "true" ]` which requires this value to be exactly the string "true" or "false". Changed `params.skip_gene_ko_link ? 1 : 0` to `params.skip_gene_ko_link ? "true" : "false"`. 
--- workflows/dram.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/dram.nf b/workflows/dram.nf index 53481ba2..62c3784a 100644 --- a/workflows/dram.nf +++ b/workflows/dram.nf @@ -212,7 +212,7 @@ workflow DRAM { gene_ko_link_f = params.gene_ko_link_loc && file(params.gene_ko_link_loc).exists() ? file(params.gene_ko_link_loc) : default_sheet kegg_download_date = params.kegg_download_date ? params.kegg_download_date : "''" - skip_gene_ko_link = params.skip_gene_ko_link ? 1 : 0 + skip_gene_ko_link = params.skip_gene_ko_link ? "true" : "false" FORMAT_KEGG_DB( kegg_pep_f, gene_ko_link_f, kegg_download_date, skip_gene_ko_link ) } else if (params.merge_annotations){