From efa3be4de528373b216469165c3ae90c3825ab11 Mon Sep 17 00:00:00 2001
From: Daniel Lundin <erik.rikard.daniel@gmail.com>
Date: Sat, 1 Feb 2025 08:04:29 +0100
Subject: [PATCH 1/8] Set resourceLimits for all tests

---
 conf/test_eggnog.config       | 8 ++++++++
 conf/test_eukulele.config     | 8 ++++++++
 conf/test_filter.config       | 8 ++++++++
 conf/test_full.config         | 8 ++++++++
 conf/test_kofamscan.config    | 8 ++++++++
 conf/test_prokka.config       | 8 ++++++++
 conf/test_spades.config       | 8 ++++++++
 conf/test_transdecoder.config | 8 ++++++++
 8 files changed, 64 insertions(+)

diff --git a/conf/test_eggnog.config b/conf/test_eggnog.config
index 6858b82b..2605e951 100644
--- a/conf/test_eggnog.config
+++ b/conf/test_eggnog.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test eggnog profile'
     config_profile_description = 'Minimal test dataset to check pipeline with eggnog function added'
diff --git a/conf/test_eukulele.config b/conf/test_eukulele.config
index a89b941e..11abc578 100644
--- a/conf/test_eukulele.config
+++ b/conf/test_eukulele.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test profile for eukulele taxonomic annotation'
     config_profile_description = 'Minimal test dataset to check pipeline function'
diff --git a/conf/test_filter.config b/conf/test_filter.config
index 88109a56..908ba60d 100644
--- a/conf/test_filter.config
+++ b/conf/test_filter.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function, including removal of contaminating sequences (e.g. rRNA)'
diff --git a/conf/test_full.config b/conf/test_full.config
index 2dfa5092..af082c5b 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Full test profile'
     config_profile_description = 'Full test dataset to check pipeline function'
diff --git a/conf/test_kofamscan.config b/conf/test_kofamscan.config
index f9b6c579..6869e3c0 100644
--- a/conf/test_kofamscan.config
+++ b/conf/test_kofamscan.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test kofamscan profile'
     config_profile_description = 'Minimal test dataset to check pipeline with kofamscan function added'
diff --git a/conf/test_prokka.config b/conf/test_prokka.config
index 4753b975..24243cb8 100644
--- a/conf/test_prokka.config
+++ b/conf/test_prokka.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test profile for prokka orf caller'
     config_profile_description = 'Minimal test dataset to check pipeline function'
diff --git a/conf/test_spades.config b/conf/test_spades.config
index 5dcff137..ada71160 100644
--- a/conf/test_spades.config
+++ b/conf/test_spades.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test spades assembler profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
diff --git a/conf/test_transdecoder.config b/conf/test_transdecoder.config
index 3dd02a05..069d73af 100644
--- a/conf/test_transdecoder.config
+++ b/conf/test_transdecoder.config
@@ -10,6 +10,14 @@
 ----------------------------------------------------------------------------------------
 */
 
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
 params {
     config_profile_name        = 'Test profile for transdecoder  orf caller'
     config_profile_description = 'Minimal test dataset to check pipeline function'

From 6583394c3689410255d8ebbb4e8fde6501939e16 Mon Sep 17 00:00:00 2001
From: Daniel Lundin <erik.rikard.daniel@gmail.com>
Date: Sat, 1 Feb 2025 09:43:58 +0100
Subject: [PATCH 2/8] Fix failing collect_stats for empty samples

---
 modules/local/collect_stats.nf | 103 +++++++++++++++------------------
 1 file changed, 47 insertions(+), 56 deletions(-)

diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf
index 59ab2ffa..5cfe5c88 100644
--- a/modules/local/collect_stats.nf
+++ b/modules/local/collect_stats.nf
@@ -27,101 +27,92 @@ process COLLECT_STATS {
             d = map(
                 sample,
                 function(s) {
-                    fread(cmd = sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)) %>%
-                        as_tibble()
+                    read_tsv(
+                        pipe(sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)),
+                        col_names = c('n_trimmed'),
+                        col_types = 'i'
+                    ) %>%
+                        mutate(n_trimmed = n_trimmed * 2)
                 }
             )
         ) %>%
-        unnest(d) %>%
-        rename(n_trimmed = V1) %>%
-        mutate(n_trimmed = n_trimmed*2) %>%
+        unnest(d)
         """
-    } else {
-        read_trimlogs = "%>%"
     }
 
     if (mergetab) {
         read_mergetab = """
 
-        mergetab <- list.files(pattern = "*_merged_table.tsv.gz" ) %>%
-            map_df(~read_tsv(.,  show_col_types  = FALSE)) %>%
-            mutate(sample = as.character(sample))
+        mergetab <- read_tsv("${mergetab}, show_col_types = FALSE)
 
         """
     } else {
         read_mergetab = """
-        mergetab <- data.frame(sample = character(), stringsAsFactors = FALSE)
+        mergetab <- tibble(sample = character())
         """
     }
 
     """
     #!/usr/bin/env Rscript
 
-    library(data.table)
-    library(dtplyr)
     library(dplyr)
     library(readr)
     library(purrr)
     library(tidyr)
     library(stringr)
 
-    TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count')
+    #TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count')
 
-    # Collect stats for each sample, create a table in long format that can be appended to
-    t <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs}
-        # add samtools idxstats output
-        mutate(
-            i = map(
-                sample,
-                function(s) {
-                    fread(cmd = sprintf("grep -v '^*' %s*idxstats", s), sep = '\\t', col.names = c('chr', 'length', 'idxs_n_mapped', 'idxs_n_unmapped')) %>%
-                        lazy_dt() %>%
-                        summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped)) %>%
-                        as_tibble()
-                }
-            )
-        ) %>%
-        unnest(i) %>%
-        pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') %>%
-        union(
-            # Total observation after featureCounts
-            tibble(file = Sys.glob('*.counts.tsv.gz')) %>%
-                mutate(d = map(file, function(f) fread(cmd = sprintf("gunzip -c %s", f), sep = '\\t'))) %>%
-                as_tibble() %>%
-                unnest(d) %>%
-                mutate(sample = as.character(sample)) %>%
-                group_by(sample) %>% summarise(n_feature_count = sum(count), .groups = 'drop') %>%
-                pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v')
-        )
-
-    # Add in stats from BBDuk, if present
+    start    <- tibble(sample = c("${samples.join('", "')}"))
+
+    trimming <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs}
+
+    idxs <- read_tsv(
+        pipe("grep -Hv '^*' *.idxstats"),
+        col_names = c('c', 'length', 'idxs_n_mapped', 'idxs_n_unmapped'),
+        col_types = 'ciii'
+    ) %>%
+        separate(c, c('sample', 'chr'), sep = ':') %>%
+        transmute(sample = str_remove(sample, '.idxstats'), idxs_n_mapped, idxs_n_unmapped) %>%
+        group_by(sample) %>% summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped))
+
+    counts <- read_tsv("${fcs}", col_types = 'cciicicid') %>%
+        group_by(sample) %>% summarise(n_feature_count = sum(count))
+
+
+    bbduk <- tibble(sample = character(), n_non_contaminated = integer())
     for ( f in Sys.glob('*.bbduk.log') ) {
         s = str_remove(f, '.bbduk.log')
-        t <- t %>% union(
-            fread(cmd = sprintf("grep 'Result:' %s | sed 's/Result:[ \\t]*//; s/ reads.*//'", f), col.names = c('v')) %>%
-                as_tibble() %>%
-                mutate(sample = s, m = 'n_non_contaminated')
-        )
+        bbduk <- bbduk %>%
+            union(
+                read_tsv(
+                    pipe(sprintf("grep 'Result:' %s | sed 's/Result:[ \t]*//; s/ reads.*//' | sed 's/:/\t/'", f)),
+                    col_names = c('n_non_contaminated'),
+                    col_types = 'i'
+                ) %>%
+                    mutate(sample = s)
+            )
     }
+    if ( nrow(bbduk) == 0 ) bbduk <- bbduk %>% select(sample)
 
     # Add in stats from taxonomy and function
     ${read_mergetab}
 
-    # Write the table in wide format
-    t %>%
-        mutate(m = parse_factor(m, levels = TYPE_ORDER, ordered = TRUE)) %>%
-        arrange(sample, m) %>%
-        pivot_wider(names_from = m, values_from = v) %>%
-        left_join(mergetab, by = 'sample') %>%
-        write_tsv('${prefix}.overall_stats.tsv.gz')
+    # Write output
+    start %>%
+        left_join(trimming, by = join_by(sample)) %>%
+        left_join(bbduk, by = join_by(sample)) %>%
+        left_join(idxs, by = join_by(sample)) %>%
+        left_join(counts, by = join_by(sample)) %>%
+        left_join(mergetab, by = join_by(sample)) %>%
+        arrange(sample) %>%
+        write_tsv("${meta.id}.overall_stats.tsv.gz")
 
     writeLines(
         c(
             "\\"${task.process}\\":",
             paste0("    R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),
             paste0("    dplyr: ", packageVersion('dplyr')),
-            paste0("    dtplyr: ", packageVersion('dtplyr')),
-            paste0("    data.table: ", packageVersion('data.table')),
             paste0("    readr: ", packageVersion('readr')),
             paste0("    purrr: ", packageVersion('purrr')),
             paste0("    tidyr: ", packageVersion('tidyr')),

From f41b82533b7f067e66b92842582b2e79ce576faf Mon Sep 17 00:00:00 2001
From: Daniel Lundin <matricaria.suaveolens@gmail.com>
Date: Sat, 1 Feb 2025 10:18:42 +0100
Subject: [PATCH 3/8] Missing quote

---
 modules/local/collect_stats.nf | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf
index 5cfe5c88..19dad403 100644
--- a/modules/local/collect_stats.nf
+++ b/modules/local/collect_stats.nf
@@ -42,9 +42,7 @@ process COLLECT_STATS {
 
     if (mergetab) {
         read_mergetab = """
-
-        mergetab <- read_tsv("${mergetab}, show_col_types = FALSE)
-
+        mergetab <- read_tsv("${mergetab}", show_col_types = FALSE)
         """
     } else {
         read_mergetab = """

From 4636add3845b95f8fc6353c69953cd9f669a85fe Mon Sep 17 00:00:00 2001
From: Daniel Lundin <matricaria.suaveolens@gmail.com>
Date: Sat, 1 Feb 2025 20:06:32 +0100
Subject: [PATCH 4/8] Restructure stats collection and add diamond taxonomy

---
 modules/local/merge_summary_tables.nf        |  3 +-
 modules/local/sumtaxonomy/environment.yml    |  7 +++
 modules/local/sumtaxonomy/main.nf            | 54 +++++++++++++++++++
 modules/local/sumtaxonomy/meta.yml           | 56 ++++++++++++++++++++
 modules/local/sumtaxonomy/tests/main.nf.test | 10 ++++
 subworkflows/local/eukulele.nf               |  8 +--
 workflows/metatdenovo.nf                     | 48 ++++++++---------
 7 files changed, 155 insertions(+), 31 deletions(-)
 create mode 100644 modules/local/sumtaxonomy/environment.yml
 create mode 100644 modules/local/sumtaxonomy/main.nf
 create mode 100644 modules/local/sumtaxonomy/meta.yml
 create mode 100644 modules/local/sumtaxonomy/tests/main.nf.test

diff --git a/modules/local/merge_summary_tables.nf b/modules/local/merge_summary_tables.nf
index ab7fafbd..8556a6fc 100644
--- a/modules/local/merge_summary_tables.nf
+++ b/modules/local/merge_summary_tables.nf
@@ -8,8 +8,7 @@ process MERGE_TABLES {
         'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }"
 
     input:
-
-    tuple val(meta), path(eggtab), path(taxtab), path(kofamscan)
+    tuple val(meta), path(tables)
 
     output:
     tuple val(meta), path("${meta.id}_merged_table.tsv.gz") , emit: merged_table
diff --git a/modules/local/sumtaxonomy/environment.yml b/modules/local/sumtaxonomy/environment.yml
new file mode 100644
index 00000000..a28fd724
--- /dev/null
+++ b/modules/local/sumtaxonomy/environment.yml
@@ -0,0 +1,10 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # One package spec per list entry: conda treats each entry as a single spec and does not split on spaces.
+  - conda-forge::r-data.table=1.14.8
+  - conda-forge::r-dtplyr=1.3.1
+  - conda-forge::r-tidyverse=2.0.0
diff --git a/modules/local/sumtaxonomy/main.nf b/modules/local/sumtaxonomy/main.nf
new file mode 100644
index 00000000..12313a8b
--- /dev/null
+++ b/modules/local/sumtaxonomy/main.nf
@@ -0,0 +1,54 @@
+process SUMTAXONOMY {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' :
+        'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }"
+
+    input:
+    tuple val(meta), val(db), path(taxonomy)
+    path feature_counts
+    val  taxname
+
+    output:
+    tuple val(meta), path("*_summary.tsv.gz") , emit: taxonomy_summary
+    path "versions.yml"                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    #!/usr/bin/env Rscript
+
+    library(tidyverse)
+
+    # Read the taxonomy and counts tables
+    taxonomy <- read_tsv("${taxonomy}", show_col_types = FALSE )
+
+    counts <- read_tsv("${feature_counts}", show_col_types = FALSE) %>%
+        mutate(sample = as.character(sample))
+
+    # Join the two and sum the counts, per sample, of ORFs with assigned taxonomy
+    counts %>%
+        inner_join(taxonomy, by = 'orf') %>%
+        group_by(sample) %>%
+        summarise(value = sum(count), .groups = 'drop') %>%
+        mutate(database = "${db ?: 'userdb'}", field = "${taxname}") %>%
+        relocate(value, .after = last_col()) %>%
+        write_tsv('${prefix}_summary.tsv.gz')
+
+    writeLines(
+        c(
+            "\\"${task.process}\\":",
+            paste0("    R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),
+            paste0("    tidyverse: ", packageVersion('tidyverse'))
+        ),
+        "versions.yml"
+    )
+    """
+}
diff --git a/modules/local/sumtaxonomy/meta.yml b/modules/local/sumtaxonomy/meta.yml
new file mode 100644
index 00000000..a0688978
--- /dev/null
+++ b/modules/local/sumtaxonomy/meta.yml
@@ -0,0 +1,56 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "sumtaxonomy"
+description: Small module to summarise taxonomy tables
+keywords:
+  - taxonomy
+  - summarise
+tools:
+  - "sumtaxonomy":
+      description: "R script that summarises taxonomy tables"
+
+input:
+  # Only when we have meta
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+    - db:
+        type: string
+        description: Name of database, e.g. gtdb
+    - taxonomy:
+        type: file
+        description: Taxonomy in tsv format
+        pattern: "*.tsv(.gz)?"
+
+  - - feature_counts:
+        type: file
+        description: File with gene counts
+
+  - - taxname:
+        type: string
+        description: Name of taxonomy, e.g. eukulele
+          
+
+output:
+  - taxonomy_summary:
+    - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+    - "*_summary.tsv.gz":
+        type: file
+        pattern: "*_summary.tsv.gz"
+          
+  - versions:
+    - "versions.yml":
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+
+authors:
+  - "@erikrikarddaniel"
+maintainers:
+  - "@erikrikarddaniel"
diff --git a/modules/local/sumtaxonomy/tests/main.nf.test b/modules/local/sumtaxonomy/tests/main.nf.test
new file mode 100644
index 00000000..5e23fec0
--- /dev/null
+++ b/modules/local/sumtaxonomy/tests/main.nf.test
@@ -0,0 +1,10 @@
+nextflow_process {
+
+    name "Test Process SUMTAXONOMY"
+    script "../main.nf"
+    process "SUMTAXONOMY"
+
+    tag "modules"
+    tag "modules_"
+    tag "sumtaxonomy"
+}
diff --git a/subworkflows/local/eukulele.nf b/subworkflows/local/eukulele.nf
index e9217200..54ed5fe1 100644
--- a/subworkflows/local/eukulele.nf
+++ b/subworkflows/local/eukulele.nf
@@ -2,9 +2,9 @@
 // Run EUKULELE on protein fasta from orf_caller output
 //
 
-include { EUKULELE_SEARCH       } from '../../modules/local/eukulele/search'
-include { FORMAT_EUKULELE_TAX   } from '../../modules/local/format_eukulele_tax'
-include { SUM_EUKULELE_TAXONOMY } from '../../modules/local/sum_eukulele_taxonomy'
+include { EUKULELE_SEARCH                      } from '../../modules/local/eukulele/search'
+include { FORMAT_EUKULELE_TAX                  } from '../../modules/local/format_eukulele_tax'
+include { SUMTAXONOMY as SUM_EUKULELE_TAXONOMY } from '../../modules/local/sumtaxonomy'
 
 workflow SUB_EUKULELE {
 
@@ -26,7 +26,7 @@ workflow SUB_EUKULELE {
         .map { meta, taxonomy, protein, dbname, database -> [ meta, dbname, taxonomy ] }
         .set { ch_sum_taxonomy }
 
-    SUM_EUKULELE_TAXONOMY ( ch_sum_taxonomy, feature_counts )
+    SUM_EUKULELE_TAXONOMY ( ch_sum_taxonomy, feature_counts, 'eukulele' )
     ch_versions = ch_versions.mix ( SUM_EUKULELE_TAXONOMY.out.versions )
 
     emit:
diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf
index 3ed45a7b..a310d159 100644
--- a/workflows/metatdenovo.nf
+++ b/workflows/metatdenovo.nf
@@ -61,12 +61,14 @@ include { MEGAHIT_INTERLEAVED                } from '../modules/local/megahit/in
 include { MERGE_TABLES                       } from '../modules/local/merge_summary_tables'
 include { FORMAT_DIAMOND_TAX_RANKLIST        } from '../modules/local/format_diamond_tax_ranklist'
 include { FORMAT_DIAMOND_TAX_TAXDUMP         } from '../modules/local/format_diamond_tax_taxdump'
+include { SUMTAXONOMY as SUM_DIAMONDTAX      } from '../modules/local/sumtaxonomy'
 include { TRANSDECODER                       } from '../modules/local/transdecoder'
 include { TRANSRATE                          } from '../modules/local/transrate'
 include { UNPIGZ as UNPIGZ_CONTIGS           } from '../modules/local/unpigz'
 include { UNPIGZ as UNPIGZ_GFF               } from '../modules/local/unpigz'
 include { WRITESPADESYAML                    } from '../modules/local/writespadesyaml'
 
+
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
@@ -435,17 +437,14 @@ workflow METATDENOVO {
         .combine(ch_fcs_for_stats)
         .set { ch_collect_stats }
 
+    ch_merge_tables = Channel.empty()
     //
     // SUBWORKFLOW: run eggnog_mapper on the ORF-called amino acid sequences
     //
     if ( ! params.skip_eggnog ) {
         EGGNOG(ch_protein, ch_fcs_for_summary)
         ch_versions = ch_versions.mix(EGGNOG.out.versions)
-        ch_merge_tables = EGGNOG.out.sumtable
-    } else {
-        ch_protein
-            .map { meta, protein -> [ meta, [] ] }
-            .set { ch_merge_tables }
+        ch_merge_tables = ch_merge_tables.mix ( EGGNOG.out.sumtable.map { meta, tsv -> tsv } )
     }
 
 
@@ -458,14 +457,7 @@ workflow METATDENOVO {
             .set { ch_kofamscan }
         KOFAMSCAN( ch_kofamscan, ch_fcs_for_summary)
         ch_versions = ch_versions.mix(KOFAMSCAN.out.versions)
-        ch_kofamscan_summary = KOFAMSCAN.out.kofamscan_summary.collect{ meta, tsv -> tsv }
-        ch_merge_tables
-            .combine( ch_kofamscan_summary )
-            .set { ch_merge_tables }
-    } else {
-        ch_merge_tables
-            .map { meta, tsv -> [ meta, tsv, [] ] }
-            .set { ch_merge_tables }
+        ch_merge_tables = ch_merge_tables.mix ( KOFAMSCAN.out.kofamscan_summary.map { meta, tsv -> tsv } )
     }
 
     // set up contig channel to use in CAT and TransRate
@@ -500,15 +492,8 @@ workflow METATDENOVO {
             .combine( ch_eukulele_db )
             .set { ch_eukulele }
         SUB_EUKULELE( ch_eukulele, ch_fcs_for_summary )
-        ch_taxonomy_summary = SUB_EUKULELE.out.taxonomy_summary.collect{ meta, tsv -> tsv }
         ch_versions = ch_versions.mix(SUB_EUKULELE.out.versions)
-        ch_merge_tables
-            .combine( ch_taxonomy_summary )
-            .set { ch_merge_tables }
-    } else {
-        ch_merge_tables
-            .map { meta, tsv1, tsv2 -> [ meta, tsv1, tsv2, [] ] }
-            .set { ch_merge_tables }
+        ch_merge_tables = ch_merge_tables.mix ( SUB_EUKULELE.out.taxonomy_summary.map { meta, tsv -> tsv } )
     }
 
     //
@@ -545,7 +530,7 @@ workflow METATDENOVO {
         PIGZ_DIAMOND_LINEAGE.out.archive
             .map { it -> [ [ id: it[0].db ], it[0], it[1] ] }
             .join(ch_diamond_dbs)
-            .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond" ], it[2], it[6] ] }
+            .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond", db: it[1].db ], it[2], it[6] ] }
     )
     ch_versions     = ch_versions.mix(FORMAT_DIAMOND_TAX_RANKLIST.out.versions)
 
@@ -553,17 +538,29 @@ workflow METATDENOVO {
         PIGZ_DIAMOND_LINEAGE.out.archive
             .map { it -> [ [ id: it[0].db ], it[0], it[1] ] }
             .join(ch_diamond_dbs.filter { it -> it[5] })
-            .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond" ], it[2], it[4], it[5], it[6] ] }
+            .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond", db: it[1].db ], it[2], it[4], it[5], it[6] ] }
     )
     ch_versions     = ch_versions.mix(FORMAT_DIAMOND_TAX_TAXDUMP.out.versions)
 
-    // tuple val(meta), path(taxfile), val(ranks)
+    SUM_DIAMONDTAX(
+        FORMAT_DIAMOND_TAX_RANKLIST.out.taxonomy
+            .map { it -> [ it[0], it[0].db, it[1] ] },
+        ch_fcs_for_summary,
+        'diamondtax'
+    )
+    ch_versions     = ch_versions.mix(SUM_DIAMONDTAX.out.versions)
+
+    ch_merge_tables = ch_merge_tables.mix ( SUM_DIAMONDTAX.out.taxonomy_summary.map { meta, tsv -> tsv } )
 
     //
     // MODULE: Collect statistics from mapping analysis
     //
     if( !params.skip_eggnog  || !params.skip_eukulele || !params.skip_kofamscan) {
-        MERGE_TABLES ( ch_merge_tables )
+        MERGE_TABLES ( 
+            ch_merge_tables
+                .collect() 
+                .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] }
+        )
         ch_collect_stats
             .combine(MERGE_TABLES.out.merged_table.collect{ meta, tblout -> tblout }.map { [ it ] })
             .set { ch_collect_stats }
@@ -573,6 +570,7 @@ workflow METATDENOVO {
             .map { meta, samples, report, tsv, idxstats, counts -> [ meta, samples, report, tsv, idxstats, counts, [] ] }
             .set { ch_collect_stats }
     }
+    //ch_collect_stats.view { it -> "ch_collect_stats: ${it}" }
 
     COLLECT_STATS(ch_collect_stats)
     ch_versions     = ch_versions.mix(COLLECT_STATS.out.versions)

From 099328b978bb06ec8aedb47609fb6047b6aeba18 Mon Sep 17 00:00:00 2001
From: Daniel Lundin <matricaria.suaveolens@gmail.com>
Date: Sat, 1 Feb 2025 23:04:49 +0100
Subject: [PATCH 5/8] Continue cleaning up overall stats

---
 conf/modules.config                   |  6 ++---
 modules/local/collect_stats.nf        |  4 +--
 modules/local/eggnog/sum.nf           |  2 +-
 modules/local/merge_summary_tables.nf |  3 ++-
 modules/local/sum_kofamscan.nf        |  2 +-
 workflows/metatdenovo.nf              | 37 +++++++++++++++------------
 6 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index b644a3c0..33de076b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -272,13 +272,13 @@ process {
                 path: { "${params.outdir}/summary_tables/" },
                 pattern: "kofamscan.tsv.gz",
                 mode: params.publish_dir_mode,
-                saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" }
+                saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" }
             ],
             [
                 path: { "${params.outdir}/kofamscan/" },
                 pattern: "kofamscan_output.tsv.gz",
                 mode: params.publish_dir_mode,
-                saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" }
+                saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" }
             ]
         ]
     }
@@ -337,7 +337,7 @@ process {
             path: { "${params.outdir}/summary_tables" },
             mode: params.publish_dir_mode,
             pattern: '*.tsv.gz',
-            saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" }
+            saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" }
         ]
     }
 
diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf
index 19dad403..de061696 100644
--- a/modules/local/collect_stats.nf
+++ b/modules/local/collect_stats.nf
@@ -40,7 +40,7 @@ process COLLECT_STATS {
         """
     }
 
-    if (mergetab) {
+    if ( mergetab ) {
         read_mergetab = """
         mergetab <- read_tsv("${mergetab}", show_col_types = FALSE)
         """
@@ -59,8 +59,6 @@ process COLLECT_STATS {
     library(tidyr)
     library(stringr)
 
-    #TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count')
-
     start    <- tibble(sample = c("${samples.join('", "')}"))
 
     trimming <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs}
diff --git a/modules/local/eggnog/sum.nf b/modules/local/eggnog/sum.nf
index 7eb1bfa5..54620b1c 100644
--- a/modules/local/eggnog/sum.nf
+++ b/modules/local/eggnog/sum.nf
@@ -45,7 +45,7 @@ process EGGNOG_SUM {
         group_by(sample) %>%
         drop_na() %>%
         summarise( value = sum(count), .groups = 'drop') %>%
-        add_column(database = "eggnog", field = "eggnog_n_counts") %>%
+        add_column(database = "eggnog", field = "n") %>%
         relocate(value, .after = last_col()) %>%
         write_tsv('${meta.id}.eggnog_summary.tsv.gz')
 
diff --git a/modules/local/merge_summary_tables.nf b/modules/local/merge_summary_tables.nf
index 8556a6fc..d03f1916 100644
--- a/modules/local/merge_summary_tables.nf
+++ b/modules/local/merge_summary_tables.nf
@@ -33,7 +33,8 @@ process MERGE_TABLES {
     Sys.glob('*.tsv.gz') %>%
         read_tsv() %>%
         mutate(sample = as.character(sample)) %>%
-        pivot_wider(names_from = c(database,field), values_from = value) %>%
+        arrange(field, database) %>%
+        pivot_wider(names_from = c(field,database), values_from = value) %>%
         write_tsv('${prefix}_merged_table.tsv.gz')
 
     writeLines(
diff --git a/modules/local/sum_kofamscan.nf b/modules/local/sum_kofamscan.nf
index 490ff543..5aaea8a1 100644
--- a/modules/local/sum_kofamscan.nf
+++ b/modules/local/sum_kofamscan.nf
@@ -45,7 +45,7 @@ process SUM_KOFAMSCAN {
         inner_join(kofams, by = 'orf') %>%
         group_by(sample) %>%
         summarise(value = sum(count), .groups = 'drop') %>%
-        add_column(database = "kofamscan", field = "kofamscan_n_counts") %>%
+        add_column(database = "kofamscan", field = "n") %>%
         relocate(value, .after = last_col()) %>%
         write_tsv('${meta.id}.kofamscan_summary.tsv.gz')
 
diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf
index a310d159..2792c176 100644
--- a/workflows/metatdenovo.nf
+++ b/workflows/metatdenovo.nf
@@ -297,8 +297,8 @@ workflow METATDENOVO {
         )
 
         SPADES.out.transcripts
-            .ifEmpty{ [] }
-            .combine(SPADES.out.contigs.ifEmpty{ [] } )
+            .ifEmpty { [] }
+            .combine(SPADES.out.contigs.ifEmpty { [] } )
             .set { ch_assembly }
         ch_versions = ch_versions.mix(SPADES.out.versions)
 
@@ -555,22 +555,25 @@ workflow METATDENOVO {
     //
     // MODULE: Collect statistics from mapping analysis
     //
-    if( !params.skip_eggnog  || !params.skip_eukulele || !params.skip_kofamscan) {
-        MERGE_TABLES ( 
-            ch_merge_tables
-                .collect() 
-                .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] }
+    MERGE_TABLES ( 
+        ch_merge_tables
+            .collect() 
+            .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] }
+    )
+    MERGE_TABLES.out.merged_table
+        //.view { "merged0: ${it}" }
+        //.collect { meta, tblout -> tblout }
+        //.view { "merged1: ${it}" }
+        //.map { meta, tblout -> [ tblout ] }
+        //.view { "merged2: ${it}" }
+    ch_collect_stats = ch_collect_stats
+        .combine(
+            Channel.empty()
+                .mix ( MERGE_TABLES.out.merged_table.map { meta, tblout -> [ tblout ] } )
+                .ifEmpty { [ [] ] }
+                //.map { [ it ] }
         )
-        ch_collect_stats
-            .combine(MERGE_TABLES.out.merged_table.collect{ meta, tblout -> tblout }.map { [ it ] })
-            .set { ch_collect_stats }
-        ch_versions       = ch_versions.mix(MERGE_TABLES.out.versions)
-    } else {
-        ch_collect_stats
-            .map { meta, samples, report, tsv, idxstats, counts -> [ meta, samples, report, tsv, idxstats, counts, [] ] }
-            .set { ch_collect_stats }
-    }
-    //ch_collect_stats.view { it -> "ch_collect_stats: ${it}" }
+    ch_versions     = ch_versions.mix(MERGE_TABLES.out.versions)
 
     COLLECT_STATS(ch_collect_stats)
     ch_versions     = ch_versions.mix(COLLECT_STATS.out.versions)

From 92c4a9f3f8c874090ae29bdb2864b0ef7c832859 Mon Sep 17 00:00:00 2001
From: Daniel Lundin <erik.rikard.daniel@gmail.com>
Date: Sat, 1 Feb 2025 23:23:03 +0100
Subject: [PATCH 6/8] Changelog

---
 CHANGELOG.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13046e62..3fa1c7db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co
 
 ### `Added`
 
-- [#320](<[https://github.com/nf-core/metatdenovo/pull/320](https://github.com/nf-core/metatdenovo/pull/320)>) improvments to Diamond taxonomy plus documentation
+- [#320](<[https://github.com/nf-core/metatdenovo/pull/320](https://github.com/nf-core/metatdenovo/pull/320)>) added taxonomy directly with Diamond, part 2
 - [#312](<[https://github.com/nf-core/metatdenovo/pull/312](https://github.com/nf-core/metatdenovo/pull/312)>) added taxonomy directly with Diamond, see `--diamond_dbs`
 - [#286](<[https://github.com/nf-core/metatdenovo/pull/286](https://github.com/nf-core/metatdenovo/pull/286)>) added an option to save the fasta file output from formatspades.nf module
 - [#285](<[https://github.com/nf-core/metatdenovo/pull/285](https://github.com/nf-core/metatdenovo/pull/285)>) added nf-test for default settings.
@@ -18,6 +18,7 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co
 
 ### `Changed`
 
+- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Clean up overall stats table
 - [#323](<[https://github.com/nf-core/metatdenovo/pull/323](https://github.com/nf-core/metatdenovo/pull/323)>) - Modified param names for input of assembly and ORFs; added name params for output file naming
 - [#323](<[https://github.com/nf-core/metatdenovo/pull/323](https://github.com/nf-core/metatdenovo/pull/323)>) - Removed default for `assembler` and `orf_caller` parameters
 - [#311](<[https://github.com/nf-core/metatdenovo/pull/311](https://github.com/nf-core/metatdenovo/pull/311)>) - Update modules and subworkflows
@@ -29,8 +30,10 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co
 
 ### `Fixed`
 
-- [#305](<[https://github.com/nf-core/ampliseq/pull/681](https://github.com/nf-core/metatdenovo/pull/305)>) - Make EUKulele counts output optional as it's not always created
-- [#269](<[https://github.com/nf-core/ampliseq/pull/681](https://github.com/nf-core/metatdenovo/pull/269)>) - Make Transdecoder work better with `-resume`
+- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Fix resources for test cases
+- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Fix output file names for Eukulele and Kofamscan
+- [#305](<[https://github.com/nf-core/metatdenovo/pull/305](https://github.com/nf-core/metatdenovo/pull/305)>) - Make EUKulele counts output optional as it's not always created
+- [#269](<[https://github.com/nf-core/metatdenovo/pull/269](https://github.com/nf-core/metatdenovo/pull/269)>) - Make Transdecoder work better with `-resume`
 
 ### `Dependencies`
 

From f48a48fa87734d936cd57ae59bf2bbd4d2b4d523 Mon Sep 17 00:00:00 2001
From: Daniel Lundin <erik.rikard.daniel@gmail.com>
Date: Sat, 1 Feb 2025 23:30:22 +0100
Subject: [PATCH 7/8] Whitespace

---
 workflows/metatdenovo.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf
index 2792c176..c3d7eab8 100644
--- a/workflows/metatdenovo.nf
+++ b/workflows/metatdenovo.nf
@@ -555,9 +555,9 @@ workflow METATDENOVO {
     //
     // MODULE: Collect statistics from mapping analysis
     //
-    MERGE_TABLES ( 
+    MERGE_TABLES (
         ch_merge_tables
-            .collect() 
+            .collect()
             .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] }
     )
     MERGE_TABLES.out.merged_table

From a64ae1d412cc4bb5f845ba491ae6eeb867fc50cd Mon Sep 17 00:00:00 2001
From: Daniel Lundin <erik.rikard.daniel@gmail.com>
Date: Sat, 1 Feb 2025 23:35:02 +0100
Subject: [PATCH 8/8] Prettier

---
 modules/local/sumtaxonomy/meta.yml | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/modules/local/sumtaxonomy/meta.yml b/modules/local/sumtaxonomy/meta.yml
index a0688978..f07700c6 100644
--- a/modules/local/sumtaxonomy/meta.yml
+++ b/modules/local/sumtaxonomy/meta.yml
@@ -31,24 +31,23 @@ input:
   - - taxname:
         type: string
         description: Name of taxonomy, e.g. eukulele
-          
 
 output:
   - taxonomy_summary:
-    - meta:
-        type: map
-        description: |
-          Groovy Map containing sample information
-          e.g. `[ id:'sample1', single_end:false ]`
-    - "*_summary.tsv.gz":
-        type: file
-        pattern: "*._summary.tsv.gz"
-          
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1', single_end:false ]`
+      - "*_summary.tsv.gz":
+          type: file
+          pattern: "*_summary.tsv.gz"
+
   - versions:
-    - "versions.yml":
-        type: file
-        description: File containing software versions
-        pattern: "versions.yml"
+      - "versions.yml":
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
 
 authors:
   - "@erikrikarddaniel"