Improve overall stats, fix test_filter issue #326

Merged: 8 commits, Feb 3, 2025
CHANGELOG.md (6 additions, 3 deletions)

@@ -9,7 +9,7 @@

### `Added`

- [#320](https://github.com/nf-core/metatdenovo/pull/320) improvements to Diamond taxonomy plus documentation
- [#320](https://github.com/nf-core/metatdenovo/pull/320) added taxonomy directly with Diamond, part 2
- [#312](https://github.com/nf-core/metatdenovo/pull/312) added taxonomy directly with Diamond, see `--diamond_dbs`
- [#286](https://github.com/nf-core/metatdenovo/pull/286) added an option to save the fasta file output from the formatspades.nf module
- [#285](https://github.com/nf-core/metatdenovo/pull/285) added nf-test for default settings.
@@ -18,6 +18,7 @@

### `Changed`

- [#326](https://github.com/nf-core/metatdenovo/pull/326) - Clean up overall stats table
- [#323](https://github.com/nf-core/metatdenovo/pull/323) - Modified param names for input of assembly and ORFs; added name params for output file naming
- [#323](https://github.com/nf-core/metatdenovo/pull/323) - Removed default for `assembler` and `orf_caller` parameters
- [#311](https://github.com/nf-core/metatdenovo/pull/311) - Update modules and subworkflows
@@ -29,8 +30,10 @@

### `Fixed`

- [#305](https://github.com/nf-core/ampliseq/pull/681) - Make EUKulele counts output optional as it's not always created
- [#269](https://github.com/nf-core/ampliseq/pull/681) - Make Transdecoder work better with `-resume`
- [#326](https://github.com/nf-core/metatdenovo/pull/326) - Fix resources for test cases
- [#326](https://github.com/nf-core/metatdenovo/pull/326) - Fix output file names for Eukulele and Kofamscan
- [#305](https://github.com/nf-core/metatdenovo/pull/305) - Make EUKulele counts output optional as it's not always created
- [#269](https://github.com/nf-core/metatdenovo/pull/269) - Make Transdecoder work better with `-resume`

### `Dependencies`

conf/modules.config (3 additions, 3 deletions)

@@ -272,13 +272,13 @@ process {
path: { "${params.outdir}/summary_tables/" },
pattern: "kofamscan.tsv.gz",
mode: params.publish_dir_mode,
saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" }
saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" }
],
[
path: { "${params.outdir}/kofamscan/" },
pattern: "kofamscan_output.tsv.gz",
mode: params.publish_dir_mode,
saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" }
saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" }
]
]
}
@@ -337,7 +337,7 @@ process {
path: { "${params.outdir}/summary_tables" },
mode: params.publish_dir_mode,
pattern: '*.tsv.gz',
saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" }
saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" }
]
}
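The renamed `saveAs` closures above drop the ternary checks on `params.assembly`/`params.gff` in favour of the Groovy Elvis operator, falling back to the user-supplied name parameters introduced in #323. A minimal sketch of how the new expression resolves; the parameter values here are hypothetical, not the pipeline's defaults:

```groovy
// Sketch: the Elvis operator (?:) returns its left operand unless it is
// null or empty, so a configured assembler wins over the user-assembly name.
def params = [assembler: null, user_assembly_name: 'myassembly',
              orf_caller: 'prodigal', user_orfs_name: null]
def filename = 'kofamscan.tsv.gz'
def name = "${params.assembler ?: params.user_assembly_name}." +
           "${params.orf_caller ?: params.user_orfs_name}.${filename}"
assert name == 'myassembly.prodigal.kofamscan.tsv.gz'
```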

conf/test_eggnog.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}
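This `resourceLimits` block, repeated in each test profile in this PR, is the "Fix resources for test cases" entry from the changelog: since Nextflow 24.04, `resourceLimits` caps whatever a process requests, so test runs fit on small CI machines instead of failing with unsatisfiable requests. A sketch of the effect, with a hypothetical process selector and request values:

```groovy
process {
    resourceLimits = [ cpus: 4, memory: '15.GB', time: '1.h' ]

    // Hypothetical process whose request exceeds the limits above:
    // Nextflow clamps the effective values to 4 CPUs / 15.GB / 1.h
    // rather than submitting a job the runner cannot schedule.
    withName: 'MEGAHIT' {
        cpus   = 12
        memory = '72.GB'
        time   = '8.h'
    }
}
```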

params {
config_profile_name = 'Test eggnog profile'
config_profile_description = 'Minimal test dataset to check pipeline with eggnog function added'
conf/test_eukulele.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test profile for eukulele taxonomic annotation'
config_profile_description = 'Minimal test dataset to check pipeline function'
conf/test_filter.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function, including removal of contaminating sequences (e.g. rRNA)'
conf/test_full.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Full test profile'
config_profile_description = 'Full test dataset to check pipeline function'
conf/test_kofamscan.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test kofamscan profile'
config_profile_description = 'Minimal test dataset to check pipeline with kofamscan function added'
conf/test_prokka.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test profile for prokka orf caller'
config_profile_description = 'Minimal test dataset to check pipeline function'
conf/test_spades.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test spades assembler profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
conf/test_transdecoder.config (8 additions)

@@ -10,6 +10,14 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test profile for transdecoder orf caller'
config_profile_description = 'Minimal test dataset to check pipeline function'
modules/local/collect_stats.nf (46 additions, 59 deletions)

@@ -27,101 +27,88 @@ process COLLECT_STATS {
d = map(
sample,
function(s) {
fread(cmd = sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)) %>%
as_tibble()
read_tsv(
pipe(sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)),
col_names = c('n_trimmed'),
col_types = 'i'
) %>%
mutate(n_trimmed = n_trimmed * 2)
}
)
) %>%
unnest(d) %>%
rename(n_trimmed = V1) %>%
mutate(n_trimmed = n_trimmed*2) %>%
unnest(d)
"""
} else {
read_trimlogs = "%>%"
}

if (mergetab) {
if ( mergetab ) {
read_mergetab = """

mergetab <- list.files(pattern = "*_merged_table.tsv.gz" ) %>%
map_df(~read_tsv(., show_col_types = FALSE)) %>%
mutate(sample = as.character(sample))

mergetab <- read_tsv("${mergetab}", show_col_types = FALSE)
"""
} else {
read_mergetab = """
mergetab <- data.frame(sample = character(), stringsAsFactors = FALSE)
mergetab <- tibble(sample = character())
"""
}

"""
#!/usr/bin/env Rscript

library(data.table)
library(dtplyr)
library(dplyr)
library(readr)
library(purrr)
library(tidyr)
library(stringr)

TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count')
start <- tibble(sample = c("${samples.join('", "')}"))

# Collect stats for each sample, create a table in long format that can be appended to
t <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs}
# add samtools idxstats output
mutate(
i = map(
sample,
function(s) {
fread(cmd = sprintf("grep -v '^*' %s*idxstats", s), sep = '\\t', col.names = c('chr', 'length', 'idxs_n_mapped', 'idxs_n_unmapped')) %>%
lazy_dt() %>%
summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped)) %>%
as_tibble()
}
)
) %>%
unnest(i) %>%
pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') %>%
union(
# Total observation after featureCounts
tibble(file = Sys.glob('*.counts.tsv.gz')) %>%
mutate(d = map(file, function(f) fread(cmd = sprintf("gunzip -c %s", f), sep = '\\t'))) %>%
as_tibble() %>%
unnest(d) %>%
mutate(sample = as.character(sample)) %>%
group_by(sample) %>% summarise(n_feature_count = sum(count), .groups = 'drop') %>%
pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v')
)

# Add in stats from BBDuk, if present
trimming <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs}

idxs <- read_tsv(
pipe("grep -Hv '^*' *.idxstats"),
col_names = c('c', 'length', 'idxs_n_mapped', 'idxs_n_unmapped'),
col_types = 'ciii'
) %>%
separate(c, c('sample', 'chr'), sep = ':') %>%
transmute(sample = str_remove(sample, '.idxstats'), idxs_n_mapped, idxs_n_unmapped) %>%
group_by(sample) %>% summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped))

counts <- read_tsv("${fcs}", col_types = 'cciicicid') %>%
group_by(sample) %>% summarise(n_feature_count = sum(count))


bbduk <- tibble(sample = character(), n_non_contaminated = integer())
for ( f in Sys.glob('*.bbduk.log') ) {
s = str_remove(f, '.bbduk.log')
t <- t %>% union(
fread(cmd = sprintf("grep 'Result:' %s | sed 's/Result:[ \\t]*//; s/ reads.*//'", f), col.names = c('v')) %>%
as_tibble() %>%
mutate(sample = s, m = 'n_non_contaminated')
)
bbduk <- bbduk %>%
union(
read_tsv(
pipe(sprintf("grep 'Result:' %s | sed 's/Result:[ \t]*//; s/ reads.*//' | sed 's/:/\t/'", f)),
col_names = c('n_non_contaminated'),
col_types = 'i'
) %>%
mutate(sample = s)
)
}
if ( nrow(bbduk) == 0 ) bbduk <- bbduk %>% select(sample)

# Add in stats from taxonomy and function
${read_mergetab}

# Write the table in wide format
t %>%
mutate(m = parse_factor(m, levels = TYPE_ORDER, ordered = TRUE)) %>%
arrange(sample, m) %>%
pivot_wider(names_from = m, values_from = v) %>%
left_join(mergetab, by = 'sample') %>%
write_tsv('${prefix}.overall_stats.tsv.gz')
# Write output
start %>%
left_join(trimming, by = join_by(sample)) %>%
left_join(bbduk, by = join_by(sample)) %>%
left_join(idxs, by = join_by(sample)) %>%
left_join(counts, by = join_by(sample)) %>%
left_join(mergetab, by = join_by(sample)) %>%
arrange(sample) %>%
write_tsv("${meta.id}.overall_stats.tsv.gz")

writeLines(
c(
"\\"${task.process}\\":",
paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),
paste0(" dplyr: ", packageVersion('dplyr')),
paste0(" dtplyr: ", packageVersion('dtplyr')),
paste0(" data.table: ", packageVersion('data.table')),
paste0(" readr: ", packageVersion('readr')),
paste0(" purrr: ", packageVersion('purrr')),
paste0(" tidyr: ", packageVersion('tidyr')),
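The net effect of the rewrite above: COLLECT_STATS now reads one merged summary table (`${mergetab}`) and one featureCounts table (`${fcs}`) passed in explicitly instead of globbing per-sample files, and it builds the overall stats by `left_join`ing each source onto a seed tibble of sample names (`start`), so a sample missing from an optional input keeps its row with NA rather than vanishing from the old `union`/`pivot_wider` assembly. A hedged sketch of how the module might be wired at the workflow level; the channel names and input cardinality here are assumptions, not the pipeline's actual code:

```groovy
// Hypothetical wiring; the real input declarations live in collect_stats.nf.
COLLECT_STATS(
    ch_collect_stats,      // tuple: meta + list of sample names
    ch_trim_logs,          // Trim Galore reports (optional; empty if skipped)
    ch_bbduk_logs,         // BBDuk logs (optional)
    ch_idxstats,           // samtools idxstats, one file per sample
    ch_feature_counts,     // single counts table, becomes ${fcs}
    ch_merged_table        // single *_merged_table.tsv.gz, becomes ${mergetab}
)
```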
modules/local/eggnog/sum.nf (1 addition, 1 deletion)

@@ -45,7 +45,7 @@ process EGGNOG_SUM {
group_by(sample) %>%
drop_na() %>%
summarise( value = sum(count), .groups = 'drop') %>%
add_column(database = "eggnog", field = "eggnog_n_counts") %>%
add_column(database = "eggnog", field = "n") %>%
relocate(value, .after = last_col()) %>%
write_tsv('${meta.id}.eggnog_summary.tsv.gz')

modules/local/merge_summary_tables.nf (3 additions, 3 deletions)

@@ -8,8 +8,7 @@ process MERGE_TABLES {
'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }"

input:

tuple val(meta), path(eggtab), path(taxtab), path(kofamscan)
tuple val(meta), path(tables)

output:
tuple val(meta), path("${meta.id}_merged_table.tsv.gz") , emit: merged_table
@@ -34,7 +33,8 @@
Sys.glob('*.tsv.gz') %>%
read_tsv() %>%
mutate(sample = as.character(sample)) %>%
pivot_wider(names_from = c(database,field), values_from = value) %>%
arrange(field, database) %>%
pivot_wider(names_from = c(field,database), values_from = value) %>%
write_tsv('${prefix}_merged_table.tsv.gz')

writeLines(
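MERGE_TABLES now takes a single `path(tables)` staging however many summary tables exist (eggnog, taxonomy, kofamscan) rather than three fixed inputs, and the added `arrange(field, database)` before `pivot_wider` gives the merged columns a stable order. With the `field = "n"` renames in the eggnog and kofamscan summary modules, the widened columns come out as, e.g., `n_eggnog` and `n_kofamscan`. A sketch of how the upstream channels could be combined; the channel names are assumptions:

```groovy
// Hypothetical: gather all per-sample summary tables under one meta
// so MERGE_TABLES receives them as a single list of paths.
ch_eggnog_summary
    .mix(ch_taxonomy_summary, ch_kofamscan_summary)
    .groupTuple()                 // [ meta, [ table1, table2, ... ] ]
    .set { ch_summary_tables }

MERGE_TABLES(ch_summary_tables)
```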
modules/local/sum_kofamscan.nf (1 addition, 1 deletion)

@@ -45,7 +45,7 @@ process SUM_KOFAMSCAN {
inner_join(kofams, by = 'orf') %>%
group_by(sample) %>%
summarise(value = sum(count), .groups = 'drop') %>%
add_column(database = "kofamscan", field = "kofamscan_n_counts") %>%
add_column(database = "kofamscan", field = "n") %>%
relocate(value, .after = last_col()) %>%
write_tsv('${meta.id}.kofamscan_summary.tsv.gz')

modules/local/sumtaxonomy/environment.yml (7 additions, new file)
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8"