diff --git a/bin/format_kegg_database.py b/bin/format_kegg_database.py index 5287c994..24e906a9 100755 --- a/bin/format_kegg_database.py +++ b/bin/format_kegg_database.py @@ -5,7 +5,6 @@ from glob import glob import logging import subprocess -from skbio import write as write_sequence, read as read_sequence from collections import defaultdict import gzip import argparse @@ -64,6 +63,7 @@ def process_kegg( download_date = get_iso_date() if gene_ko_link_loc is not None and Path(gene_ko_link_loc).exists(): # add KOs to end of header where KO is not already there + from skbio import write as write_sequence kegg_mod_loc = path.join(output_dir, "kegg.mod.fa") write_sequence( generate_modified_kegg_fasta(kegg_loc, gene_ko_link_loc), @@ -73,7 +73,7 @@ def process_kegg( else: kegg_mod_loc = kegg_loc # make mmseqsdb from modified kegg fasta - kegg_mmseqs_db = path.join(output_dir, "kegg.%s.mmsdb" % download_date) + kegg_mmseqs_db = path.join(output_dir, "kegg.mmsdb") create_mmseqs( kegg_mod_loc, kegg_mmseqs_db, @@ -114,6 +114,7 @@ def generate_modified_kegg_fasta(kegg_fasta, gene_ko_link_loc=None): Takes kegg fasta file and gene ko link file, adds kos not already in headers to headers Whish I knew about this, oh well I may split this out. """ + from skbio import write as write_sequence, read as read_sequence genes_ko_dict = defaultdict(list) if gene_ko_link_loc is not None: if gene_ko_link_loc.endswith(".gz"): @@ -150,9 +151,8 @@ def main(): ) parser.add_argument( "--skip_gene_ko_link", - type=bool, + action="store_true", help="Skip gene KO link processing. If not passed in, `--gene_ko_link_loc` is required", - default=False, ) parser.add_argument( "--output_dir", type=str, help="Path to the output directory", default="kegg" diff --git a/modules/local/database/format_kegg_db.nf b/modules/local/database/format_kegg_db.nf index 0e4c0971..efff0280 100644 --- a/modules/local/database/format_kegg_db.nf +++ b/modules/local/database/format_kegg_db.nf @@ -4,7 +4,7 @@ process FORMAT_KEGG_DB { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_scikit-bio_scipy:0f89a100e990daf2" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" tag { ch_kegg_pep } @@ -19,7 +19,7 @@ process FORMAT_KEGG_DB { script: """ - if [ ${skip_gene_ko_link} ]; then + if [ "${skip_gene_ko_link}" = "true" ]; then echo "No Gene KO Link file provided. Running KEGG DB formatting without" format_kegg_database.py --kegg_loc ${ch_kegg_pep} --download_date ${kegg_download_date} --threads ${params.threads} --output_dir kegg --skip_gene_ko_link else diff --git a/workflows/dram.nf b/workflows/dram.nf index 53481ba2..62c3784a 100644 --- a/workflows/dram.nf +++ b/workflows/dram.nf @@ -212,7 +212,7 @@ workflow DRAM { gene_ko_link_f = params.gene_ko_link_loc && file(params.gene_ko_link_loc).exists() ? file(params.gene_ko_link_loc) : default_sheet kegg_download_date = params.kegg_download_date ? params.kegg_download_date : "''" - skip_gene_ko_link = params.skip_gene_ko_link ? 1 : 0 + skip_gene_ko_link = params.skip_gene_ko_link ? "true" : "false" FORMAT_KEGG_DB( kegg_pep_f, gene_ko_link_f, kegg_download_date, skip_gene_ko_link ) } else if (params.merge_annotations){