From 743bf8e8a038c13f361aad3be4ead4e543298436 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?=
 <ruben@cicese.edu.mx>
Date: Tue, 24 Feb 2026 11:26:49 -0800
Subject: [PATCH 1/3] fix(format-kegg-db): three bugs in
 format_kegg_database.py

1. Move scikit-bio imports inside functions (lazy import)
   - `from skbio import ...` was at module level, causing ImportError in
     Docker containers that do not have scikit-bio installed (e.g.
     python_pandas_hmmer_mmseqs2_pruned). The import is only needed when
     a gene_ko_link file is actually processed, so it is now placed inside
     the two functions that use it: process_kegg() and
     generate_modified_kegg_fasta().

2. Fix MMseqs2 output database name (kegg.mmsdb, not kegg.<date>.mmsdb)
   - The database was written as kegg.<download_date>.mmsdb but
     modules/local/annotate/mmseqs_search.nf expects the file to be named
     exactly kegg.mmsdb (it constructs the path as ${db_name}.mmsdb where
     db_name is the parent directory name "kegg"). The date suffix caused a
     "No such file or directory" error at annotation time.

3. Fix --skip_gene_ko_link argparse definition
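   - Using `type=bool` does NOT work as a flag: the option then requires a
     value, so a bare `--skip_gene_ko_link` (as invoked from
     format_kegg_db.nf) fails with "expected one argument". When a value
     is given, argparse passes the string "False" / "True" to bool(), and
     bool("False") == True. Replaced with `action="store_true"` so the
     flag takes no value, defaults to False, and behaves as intended.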
---
 bin/format_kegg_database.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/bin/format_kegg_database.py b/bin/format_kegg_database.py
index 5287c994..24e906a9 100755
--- a/bin/format_kegg_database.py
+++ b/bin/format_kegg_database.py
@@ -5,7 +5,6 @@
 from glob import glob
 import logging
 import subprocess
-from skbio import write as write_sequence, read as read_sequence
 from collections import defaultdict
 import gzip
 import argparse
@@ -64,6 +63,7 @@ def process_kegg(
         download_date = get_iso_date()
     if gene_ko_link_loc is not None and Path(gene_ko_link_loc).exists():
         # add KOs to end of header where KO is not already there
+        from skbio import write as write_sequence
         kegg_mod_loc = path.join(output_dir, "kegg.mod.fa")
         write_sequence(
             generate_modified_kegg_fasta(kegg_loc, gene_ko_link_loc),
@@ -73,7 +73,7 @@ def process_kegg(
     else:
         kegg_mod_loc = kegg_loc
     # make mmseqsdb from modified kegg fasta
-    kegg_mmseqs_db = path.join(output_dir, "kegg.%s.mmsdb" % download_date)
+    kegg_mmseqs_db = path.join(output_dir, "kegg.mmsdb")
     create_mmseqs(
         kegg_mod_loc,
         kegg_mmseqs_db,
@@ -114,6 +114,7 @@ def generate_modified_kegg_fasta(kegg_fasta, gene_ko_link_loc=None):
     Takes kegg fasta file and gene ko link file, adds kos not already in headers to headers
     Whish I knew about this, oh well I may split this out.
     """
+    from skbio import write as write_sequence, read as read_sequence
     genes_ko_dict = defaultdict(list)
     if gene_ko_link_loc is not None:
         if gene_ko_link_loc.endswith(".gz"):
@@ -150,9 +151,8 @@ def main():
     )
     parser.add_argument(
         "--skip_gene_ko_link",
-        type=bool,
+        action="store_true",
         help="Skip gene KO link processing. If not passed in, `--gene_ko_link_loc` is required",
-        default=False,
     )
     parser.add_argument(
         "--output_dir", type=str, help="Path to the output directory", default="kegg"

From 3afbce83192f05c8c9e2d16744828fc04bd2642f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?=
 <ruben@cicese.edu.mx>
Date: Tue, 24 Feb 2026 11:26:58 -0800
Subject: [PATCH 2/3] fix(format-kegg-db): two bugs in format_kegg_db.nf

1. Replace container that lacks mmseqs2
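   - FORMAT_KEGG_DB used python_scikit-bio_scipy which does not include
     mmseqs2. The process calls mmseqs createdb / createindex, so it fails
     immediately with "No such file or directory: mmseqs". Replaced with
     python_pandas_hmmer_mmseqs2_pruned, which already carries mmseqs2 and
     is used by other annotation processes in the pipeline.
     NOTE(review): per patch 1/3 this container does not have scikit-bio
     installed, so the gene_ko_link branch (which lazily imports skbio)
     will presumably still raise ImportError here; only the
     --skip_gene_ko_link path is expected to work until a container with
     both mmseqs2 and scikit-bio is used. TODO confirm.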

2. Fix bash condition for skip_gene_ko_link
   - The Nextflow value passed to the process was the integer 0 or 1
     (see dram.nf), which is interpolated into the script as the text
     `0` or `1`. In bash, `if [ 0 ]` evaluates to TRUE because `[` with a
     single argument only tests that the string is non-empty.
     FORMAT_KEGG_DB therefore always ran the
     --skip_gene_ko_link branch, ignoring the gene_ko_link file.
     Fixed with an explicit string comparison:
       if [ "${skip_gene_ko_link}" = "true" ]
     (see companion fix in workflows/dram.nf)
---
 modules/local/database/format_kegg_db.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/local/database/format_kegg_db.nf b/modules/local/database/format_kegg_db.nf
index 0e4c0971..efff0280 100644
--- a/modules/local/database/format_kegg_db.nf
+++ b/modules/local/database/format_kegg_db.nf
@@ -4,7 +4,7 @@ process FORMAT_KEGG_DB {
     errorStrategy 'finish'
 
     conda "${moduleDir}/environment.yml"
-    container "community.wave.seqera.io/library/python_scikit-bio_scipy:0f89a100e990daf2"
+    container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
 
     tag { ch_kegg_pep }
 
@@ -19,7 +19,7 @@ process FORMAT_KEGG_DB {
 
     script:
     """
-    if [ ${skip_gene_ko_link} ]; then
+    if [ "${skip_gene_ko_link}" = "true" ]; then
         echo "No Gene KO Link file provided. Running KEGG DB formatting without"
         format_kegg_database.py --kegg_loc ${ch_kegg_pep} --download_date ${kegg_download_date} --threads ${params.threads} --output_dir kegg --skip_gene_ko_link
     else

From f4b23944e345b0f0834923c981b4a971736073a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rub=C3=A9n=20Casta=C3=B1eda-Mart=C3=ADnez?=
 <ruben@cicese.edu.mx>
Date: Tue, 24 Feb 2026 11:27:06 -0800
Subject: [PATCH 3/3] fix(dram): pass boolean string to FORMAT_KEGG_DB
 skip_gene_ko_link

The companion fix for the bash condition in format_kegg_db.nf requires
that skip_gene_ko_link be the string "true" or "false" rather than the
integer 1 or 0.

In bash:
  `if [ "0" ]`   -> true  (non-empty string)
  `if [ "false" ]` -> true  (still non-empty - also wrong)

The correct pattern used in format_kegg_db.nf is:
  `if [ "${skip_gene_ko_link}" = "true" ]`

which requires this value to be exactly the string "true" or "false".
Changed `params.skip_gene_ko_link ? 1 : 0` to
`params.skip_gene_ko_link ? "true" : "false"`.
---
 workflows/dram.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/dram.nf b/workflows/dram.nf
index 53481ba2..62c3784a 100644
--- a/workflows/dram.nf
+++ b/workflows/dram.nf
@@ -212,7 +212,7 @@ workflow DRAM {
 
         gene_ko_link_f = params.gene_ko_link_loc && file(params.gene_ko_link_loc).exists() ? file(params.gene_ko_link_loc) : default_sheet
         kegg_download_date = params.kegg_download_date ? params.kegg_download_date : "''"
-        skip_gene_ko_link = params.skip_gene_ko_link ? 1 : 0
+        skip_gene_ko_link = params.skip_gene_ko_link ? "true" : "false"
         FORMAT_KEGG_DB( kegg_pep_f, gene_ko_link_f, kegg_download_date, skip_gene_ko_link )
 
     } else if (params.merge_annotations){