From efa3be4de528373b216469165c3ae90c3825ab11 Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 08:04:29 +0100 Subject: [PATCH 1/8] Set resourceLimits for all tests --- conf/test_eggnog.config | 8 ++++++++ conf/test_eukulele.config | 8 ++++++++ conf/test_filter.config | 8 ++++++++ conf/test_full.config | 8 ++++++++ conf/test_kofamscan.config | 8 ++++++++ conf/test_prokka.config | 8 ++++++++ conf/test_spades.config | 8 ++++++++ conf/test_transdecoder.config | 8 ++++++++ 8 files changed, 64 insertions(+) diff --git a/conf/test_eggnog.config b/conf/test_eggnog.config index 6858b82b..2605e951 100644 --- a/conf/test_eggnog.config +++ b/conf/test_eggnog.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test eggnog profile' config_profile_description = 'Minimal test dataset to check pipeline with eggnog function added' diff --git a/conf/test_eukulele.config b/conf/test_eukulele.config index a89b941e..11abc578 100644 --- a/conf/test_eukulele.config +++ b/conf/test_eukulele.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile for eukulele taxonomic annotation' config_profile_description = 'Minimal test dataset to check pipeline function' diff --git a/conf/test_filter.config b/conf/test_filter.config index 88109a56..908ba60d 100644 --- a/conf/test_filter.config +++ b/conf/test_filter.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function, including removal of contaminating sequences (e.g. rRNA)' diff --git a/conf/test_full.config b/conf/test_full.config index 2dfa5092..af082c5b 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' diff --git a/conf/test_kofamscan.config b/conf/test_kofamscan.config index f9b6c579..6869e3c0 100644 --- a/conf/test_kofamscan.config +++ b/conf/test_kofamscan.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test kofamscan profile' config_profile_description = 'Minimal test dataset to check pipeline with kofamscan function added' diff --git a/conf/test_prokka.config b/conf/test_prokka.config index 4753b975..24243cb8 100644 --- a/conf/test_prokka.config +++ b/conf/test_prokka.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile for prokka orf caller' config_profile_description = 'Minimal test dataset to check pipeline function' diff --git a/conf/test_spades.config b/conf/test_spades.config index 5dcff137..ada71160 100644 --- a/conf/test_spades.config +++ b/conf/test_spades.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test spades assembler profile' config_profile_description = 'Minimal test dataset to check pipeline function' diff --git a/conf/test_transdecoder.config b/conf/test_transdecoder.config index 3dd02a05..069d73af 100644 --- a/conf/test_transdecoder.config +++ b/conf/test_transdecoder.config @@ -10,6 +10,14 @@ ---------------------------------------------------------------------------------------- */ +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile for transdecoder orf caller' config_profile_description = 'Minimal test dataset to check pipeline function' From 6583394c3689410255d8ebbb4e8fde6501939e16 Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 09:43:58 +0100 Subject: [PATCH 2/8] Fix failing collect_stats for empty samples --- modules/local/collect_stats.nf | 103 +++++++++++++++------------------ 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf index 59ab2ffa..5cfe5c88 100644 --- a/modules/local/collect_stats.nf +++ b/modules/local/collect_stats.nf @@ -27,101 +27,92 @@ process COLLECT_STATS { d = map( sample, function(s) { - fread(cmd = sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)) %>% - as_tibble() + read_tsv( + pipe(sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)), + col_names = c('n_trimmed'), + col_types = 'i' + ) %>% + mutate(n_trimmed = n_trimmed * 2) } ) ) %>% - unnest(d) %>% - rename(n_trimmed = V1) %>% - mutate(n_trimmed = n_trimmed*2) %>% + unnest(d) """ - } else { - read_trimlogs = "%>%" } if (mergetab) { read_mergetab = """ - mergetab <- list.files(pattern = "*_merged_table.tsv.gz" ) %>% - map_df(~read_tsv(., show_col_types = FALSE)) %>% - mutate(sample = as.character(sample)) + mergetab <- read_tsv("${mergetab}, show_col_types = FALSE) """ } else { read_mergetab = """ - mergetab <- data.frame(sample = character(), stringsAsFactors = FALSE) + mergetab <- tibble(sample = character()) """ } """ #!/usr/bin/env Rscript - library(data.table) - library(dtplyr) library(dplyr) library(readr) library(purrr) library(tidyr) library(stringr) - TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count') + #TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count') - # Collect stats for each sample, create a table in long format that can be appended to - t <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs} - # add samtools idxstats output - mutate( - i = map( - sample, - function(s) { - fread(cmd = sprintf("grep -v '^*' %s*idxstats", s), sep = '\\t', col.names = c('chr', 'length', 'idxs_n_mapped', 'idxs_n_unmapped')) %>% - lazy_dt() %>% - summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped)) %>% - as_tibble() - } - ) - ) %>% - unnest(i) %>% - pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') %>% - union( - # Total observation after featureCounts - tibble(file = Sys.glob('*.counts.tsv.gz')) %>% - mutate(d = map(file, function(f) fread(cmd = sprintf("gunzip -c %s", f), sep = '\\t'))) %>% - as_tibble() %>% - unnest(d) %>% - mutate(sample = as.character(sample)) %>% - group_by(sample) %>% summarise(n_feature_count = sum(count), .groups = 'drop') %>% - pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') - ) - - # Add in stats from BBDuk, if present + start <- tibble(sample = c("${samples.join('", "')}")) + + trimming <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs} + + idxs <- read_tsv( + pipe("grep -Hv '^*' *.idxstats"), + col_names = c('c', 'length', 'idxs_n_mapped', 'idxs_n_unmapped'), + col_types = 'ciii' + ) %>% + separate(c, c('sample', 'chr'), sep = ':') %>% + transmute(sample = str_remove(sample, '.idxstats'), idxs_n_mapped, idxs_n_unmapped) %>% + group_by(sample) %>% summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped)) + + counts <- read_tsv("${fcs}", col_types = 'cciicicid') %>% + group_by(sample) %>% summarise(n_feature_count = sum(count)) + + + bbduk <- tibble(sample = character(), n_non_contaminated = integer()) for ( f in Sys.glob('*.bbduk.log') ) { s = str_remove(f, '.bbduk.log') - t <- t %>% union( - fread(cmd = sprintf("grep 'Result:' %s | sed 's/Result:[ \\t]*//; s/ reads.*//'", f), col.names = c('v')) %>% - as_tibble() %>% - mutate(sample = s, m = 'n_non_contaminated') - ) + bbduk <- bbduk %>% + union( + read_tsv( + pipe(sprintf("grep 'Result:' %s | sed 's/Result:[ \t]*//; s/ reads.*//' | sed 's/:/\t/'", f)), + col_names = c('n_non_contaminated'), + col_types = 'i' + ) %>% + mutate(sample = s) + ) } + if ( nrow(bbduk) == 0 ) bbduk <- bbduk %>% select(sample) # Add in stats from taxonomy and function ${read_mergetab} - # Write the table in wide format - t %>% - mutate(m = parse_factor(m, levels = TYPE_ORDER, ordered = TRUE)) %>% - arrange(sample, m) %>% - pivot_wider(names_from = m, values_from = v) %>% - left_join(mergetab, by = 'sample') %>% - write_tsv('${prefix}.overall_stats.tsv.gz') + # Write output + start %>% + left_join(trimming, by = join_by(sample)) %>% + left_join(bbduk, by = join_by(sample)) %>% + left_join(idxs, by = join_by(sample)) %>% + left_join(counts, by = join_by(sample)) %>% + left_join(mergetab, by = join_by(sample)) %>% + arrange(sample) %>% + write_tsv("${meta.id}.overall_stats.tsv.gz") writeLines( c( "\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), paste0(" dplyr: ", packageVersion('dplyr')), - paste0(" dtplyr: ", packageVersion('dtplyr')), - paste0(" data.table: ", packageVersion('data.table')), paste0(" readr: ", packageVersion('readr')), paste0(" purrr: ", packageVersion('purrr')), paste0(" tidyr: ", packageVersion('tidyr')), From f41b82533b7f067e66b92842582b2e79ce576faf Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 10:18:42 +0100 Subject: [PATCH 3/8] Missing quote --- modules/local/collect_stats.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf index 5cfe5c88..19dad403 100644 --- a/modules/local/collect_stats.nf +++ b/modules/local/collect_stats.nf @@ -42,9 +42,7 @@ process COLLECT_STATS { if (mergetab) { read_mergetab = """ - - mergetab <- read_tsv("${mergetab}, show_col_types = FALSE) - + mergetab <- read_tsv("${mergetab}", show_col_types = FALSE) """ } else { read_mergetab = """ From 4636add3845b95f8fc6353c69953cd9f669a85fe Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 20:06:32 +0100 Subject: [PATCH 4/8] Restructure stats collection and add diamond taxonomy --- modules/local/merge_summary_tables.nf | 3 +- modules/local/sumtaxonomy/environment.yml | 7 +++ modules/local/sumtaxonomy/main.nf | 54 +++++++++++++++++++ modules/local/sumtaxonomy/meta.yml | 56 ++++++++++++++++++++ modules/local/sumtaxonomy/tests/main.nf.test | 10 ++++ subworkflows/local/eukulele.nf | 8 +-- workflows/metatdenovo.nf | 48 ++++++++--------- 7 files changed, 155 insertions(+), 31 deletions(-) create mode 100644 modules/local/sumtaxonomy/environment.yml create mode 100644 modules/local/sumtaxonomy/main.nf create mode 100644 modules/local/sumtaxonomy/meta.yml create mode 100644 modules/local/sumtaxonomy/tests/main.nf.test diff --git a/modules/local/merge_summary_tables.nf b/modules/local/merge_summary_tables.nf index ab7fafbd..8556a6fc 100644 --- a/modules/local/merge_summary_tables.nf +++ b/modules/local/merge_summary_tables.nf @@ -8,8 +8,7 @@ process MERGE_TABLES { 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" input: - - tuple val(meta), path(eggtab), path(taxtab), path(kofamscan) + tuple val(meta), path(tables) output: tuple val(meta), path("${meta.id}_merged_table.tsv.gz") , emit: merged_table diff --git a/modules/local/sumtaxonomy/environment.yml b/modules/local/sumtaxonomy/environment.yml new file mode 100644 index 00000000..a28fd724 --- /dev/null +++ b/modules/local/sumtaxonomy/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" diff --git a/modules/local/sumtaxonomy/main.nf b/modules/local/sumtaxonomy/main.nf new file mode 100644 index 00000000..12313a8b --- /dev/null +++ b/modules/local/sumtaxonomy/main.nf @@ -0,0 +1,54 @@ +process SUMTAXONOMY { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), val(db), path(taxonomy) + path feature_counts + val taxname + + output: + tuple val(meta), path("*_summary.tsv.gz") , emit: taxonomy_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + + library(tidyverse) + + # Read the taxonomy and counts tables + taxonomy <- read_tsv("${taxonomy}", show_col_types = FALSE ) + + counts <- read_tsv("${feature_counts}", show_col_types = FALSE) %>% + mutate(sample = as.character(sample)) + + # Join the two and count the number of ORFs with assigned taxonomy + counts %>% + inner_join(taxonomy, by = 'orf') %>% + group_by(sample) %>% + summarise(value = sum(count), .groups = 'drop') %>% + mutate(database = "${db ?: 'userdb'}", field = "${taxname}") %>% + relocate(value, .after = last_col()) %>% + write_tsv('${prefix}_summary.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" tidyverse: ", packageVersion('tidyverse')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/sumtaxonomy/meta.yml b/modules/local/sumtaxonomy/meta.yml new file mode 100644 index 00000000..a0688978 --- /dev/null +++ b/modules/local/sumtaxonomy/meta.yml @@ -0,0 +1,56 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "sumtaxonomy" +description: Small module to summarise taxonomy tables +keywords: + - taxonomy + - summarise +tools: + - "sumtaxonomy": + description: "R script that summarises taxonomy tables" + +input: + # Only when we have meta + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - db: + type: string + description: Name of database, e.g. gtdb + - taxonomy: + type: file + description: Taxonomy in tsv format + pattern: "*.tsv(.gz)?" + + - - feature_counts: + type: file + description: File with gene counts + + - - taxname: + type: string + description: Name of taxonomy, e.g. eukulele + + +output: + - taxonomy_summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_summary.tsv.gz": + type: file + pattern: "*._summary.tsv.gz" + + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@erikrikarddaniel" +maintainers: + - "@erikrikarddaniel" diff --git a/modules/local/sumtaxonomy/tests/main.nf.test b/modules/local/sumtaxonomy/tests/main.nf.test new file mode 100644 index 00000000..5e23fec0 --- /dev/null +++ b/modules/local/sumtaxonomy/tests/main.nf.test @@ -0,0 +1,10 @@ +nextflow_process { + + name "Test Process SUMTAXONOMY" + script "../main.nf" + process "SUMTAXONOMY" + + tag "modules" + tag "modules_" + tag "sumdiamondtax" +} diff --git a/subworkflows/local/eukulele.nf b/subworkflows/local/eukulele.nf index e9217200..54ed5fe1 100644 --- a/subworkflows/local/eukulele.nf +++ b/subworkflows/local/eukulele.nf @@ -2,9 +2,9 @@ // Run EUKULELE on protein fasta from orf_caller output // -include { EUKULELE_SEARCH } from '../../modules/local/eukulele/search' -include { FORMAT_EUKULELE_TAX } from '../../modules/local/format_eukulele_tax' -include { SUM_EUKULELE_TAXONOMY } from '../../modules/local/sum_eukulele_taxonomy' +include { EUKULELE_SEARCH } from '../../modules/local/eukulele/search' +include { FORMAT_EUKULELE_TAX } from '../../modules/local/format_eukulele_tax' +include { SUMTAXONOMY as SUM_EUKULELE_TAXONOMY } from '../../modules/local/sumtaxonomy' workflow SUB_EUKULELE { @@ -26,7 +26,7 @@ workflow SUB_EUKULELE { .map { meta, taxonomy, protein, dbname, database -> [ meta, dbname, taxonomy ] } .set { ch_sum_taxonomy } - SUM_EUKULELE_TAXONOMY ( ch_sum_taxonomy, feature_counts ) + SUM_EUKULELE_TAXONOMY ( ch_sum_taxonomy, feature_counts, 'eukulele' ) ch_versions = ch_versions.mix ( SUM_EUKULELE_TAXONOMY.out.versions ) emit: diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf index 3ed45a7b..a310d159 100644 --- a/workflows/metatdenovo.nf +++ b/workflows/metatdenovo.nf @@ -61,12 +61,14 @@ include { MEGAHIT_INTERLEAVED } from '../modules/local/megahit/in include { MERGE_TABLES } from '../modules/local/merge_summary_tables' include { FORMAT_DIAMOND_TAX_RANKLIST } from '../modules/local/format_diamond_tax_ranklist' include { FORMAT_DIAMOND_TAX_TAXDUMP } from '../modules/local/format_diamond_tax_taxdump' +include { SUMTAXONOMY as SUM_DIAMONDTAX } from '../modules/local/sumtaxonomy' include { TRANSDECODER } from '../modules/local/transdecoder' include { TRANSRATE } from '../modules/local/transrate' include { UNPIGZ as UNPIGZ_CONTIGS } from '../modules/local/unpigz' include { UNPIGZ as UNPIGZ_GFF } from '../modules/local/unpigz' include { WRITESPADESYAML } from '../modules/local/writespadesyaml' + // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // @@ -435,17 +437,14 @@ workflow METATDENOVO { .combine(ch_fcs_for_stats) .set { ch_collect_stats } + ch_merge_tables = Channel.empty() // // SUBWORKFLOW: run eggnog_mapper on the ORF-called amino acid sequences // if ( ! params.skip_eggnog ) { EGGNOG(ch_protein, ch_fcs_for_summary) ch_versions = ch_versions.mix(EGGNOG.out.versions) - ch_merge_tables = EGGNOG.out.sumtable - } else { - ch_protein - .map { meta, protein -> [ meta, [] ] } - .set { ch_merge_tables } + ch_merge_tables = ch_merge_tables.mix ( EGGNOG.out.sumtable.map { meta, tsv -> tsv } ) } @@ -458,14 +457,7 @@ workflow METATDENOVO { .set { ch_kofamscan } KOFAMSCAN( ch_kofamscan, ch_fcs_for_summary) ch_versions = ch_versions.mix(KOFAMSCAN.out.versions) - ch_kofamscan_summary = KOFAMSCAN.out.kofamscan_summary.collect{ meta, tsv -> tsv } - ch_merge_tables - .combine( ch_kofamscan_summary ) - .set { ch_merge_tables } - } else { - ch_merge_tables - .map { meta, tsv -> [ meta, tsv, [] ] } - .set { ch_merge_tables } + ch_merge_tables = ch_merge_tables.mix ( KOFAMSCAN.out.kofamscan_summary.map { meta, tsv -> tsv } ) } // set up contig channel to use in CAT and TransRate @@ -500,15 +492,8 @@ workflow METATDENOVO { .combine( ch_eukulele_db ) .set { ch_eukulele } SUB_EUKULELE( ch_eukulele, ch_fcs_for_summary ) - ch_taxonomy_summary = SUB_EUKULELE.out.taxonomy_summary.collect{ meta, tsv -> tsv } ch_versions = ch_versions.mix(SUB_EUKULELE.out.versions) - ch_merge_tables - .combine( ch_taxonomy_summary ) - .set { ch_merge_tables } - } else { - ch_merge_tables - .map { meta, tsv1, tsv2 -> [ meta, tsv1, tsv2, [] ] } - .set { ch_merge_tables } + ch_merge_tables = ch_merge_tables.mix ( SUB_EUKULELE.out.taxonomy_summary.map { meta, tsv -> tsv } ) } // @@ -545,7 +530,7 @@ workflow METATDENOVO { PIGZ_DIAMOND_LINEAGE.out.archive .map { it -> [ [ id: it[0].db ], it[0], it[1] ] } .join(ch_diamond_dbs) - .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond" ], it[2], it[6] ] } + .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond", db: it[1].db ], it[2], it[6] ] } ) ch_versions = ch_versions.mix(FORMAT_DIAMOND_TAX_RANKLIST.out.versions) @@ -553,17 +538,29 @@ workflow METATDENOVO { PIGZ_DIAMOND_LINEAGE.out.archive .map { it -> [ [ id: it[0].db ], it[0], it[1] ] } .join(ch_diamond_dbs.filter { it -> it[5] }) - .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond" ], it[2], it[4], it[5], it[6] ] } + .map { it -> [ [ id: it[1].id - ".lineage" + ".diamond", db: it[1].db ], it[2], it[4], it[5], it[6] ] } ) ch_versions = ch_versions.mix(FORMAT_DIAMOND_TAX_TAXDUMP.out.versions) - // tuple val(meta), path(taxfile), val(ranks) + SUM_DIAMONDTAX( + FORMAT_DIAMOND_TAX_RANKLIST.out.taxonomy + .map { it -> [ it[0], it[0].db, it[1] ] }, + ch_fcs_for_summary, + 'diamondtax' + ) + ch_versions = ch_versions.mix(SUM_DIAMONDTAX.out.versions) + + ch_merge_tables = ch_merge_tables.mix ( SUM_DIAMONDTAX.out.taxonomy_summary.map { meta, tsv -> tsv } ) // // MODULE: Collect statistics from mapping analysis // if( !params.skip_eggnog || !params.skip_eukulele || !params.skip_kofamscan) { - MERGE_TABLES ( ch_merge_tables ) + MERGE_TABLES ( + ch_merge_tables + .collect() + .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] } + ) ch_collect_stats .combine(MERGE_TABLES.out.merged_table.collect{ meta, tblout -> tblout }.map { [ it ] }) .set { ch_collect_stats } @@ -573,6 +570,7 @@ workflow METATDENOVO { .map { meta, samples, report, tsv, idxstats, counts -> [ meta, samples, report, tsv, idxstats, counts, [] ] } .set { ch_collect_stats } } + //ch_collect_stats.view { it -> "ch_collect_stats: ${it}" } COLLECT_STATS(ch_collect_stats) ch_versions = ch_versions.mix(COLLECT_STATS.out.versions) From 099328b978bb06ec8aedb47609fb6047b6aeba18 Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 23:04:49 +0100 Subject: [PATCH 5/8] Continue cleaning up overall stats --- conf/modules.config | 6 ++--- modules/local/collect_stats.nf | 4 +-- modules/local/eggnog/sum.nf | 2 +- modules/local/merge_summary_tables.nf | 3 ++- modules/local/sum_kofamscan.nf | 2 +- workflows/metatdenovo.nf | 37 +++++++++++++++------------ 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b644a3c0..33de076b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -272,13 +272,13 @@ process { path: { "${params.outdir}/summary_tables/" }, pattern: "kofamscan.tsv.gz", mode: params.publish_dir_mode, - saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" } + saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" } ], [ path: { "${params.outdir}/kofamscan/" }, pattern: "kofamscan_output.tsv.gz", mode: params.publish_dir_mode, - saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" } + saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${filename}" } ] ] } @@ -337,7 +337,7 @@ process { path: { "${params.outdir}/summary_tables" }, mode: params.publish_dir_mode, pattern: '*.tsv.gz', - saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" } + saveAs: { filename -> "${params.assembler ?: params.user_assembly_name}.${params.orf_caller ?: params.user_orfs_name}.${params.eukulele_db ?: 'userdb'}.eukulele.taxonomy.tsv.gz" } ] } diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf index 19dad403..de061696 100644 --- a/modules/local/collect_stats.nf +++ b/modules/local/collect_stats.nf @@ -40,7 +40,7 @@ process COLLECT_STATS { """ } - if (mergetab) { + if ( mergetab ) { read_mergetab = """ mergetab <- read_tsv("${mergetab}", show_col_types = FALSE) """ @@ -59,8 +59,6 @@ process COLLECT_STATS { library(tidyr) library(stringr) - #TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count') - start <- tibble(sample = c("${samples.join('", "')}")) trimming <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs} diff --git a/modules/local/eggnog/sum.nf b/modules/local/eggnog/sum.nf index 7eb1bfa5..54620b1c 100644 --- a/modules/local/eggnog/sum.nf +++ b/modules/local/eggnog/sum.nf @@ -45,7 +45,7 @@ process EGGNOG_SUM { group_by(sample) %>% drop_na() %>% summarise( value = sum(count), .groups = 'drop') %>% - add_column(database = "eggnog", field = "eggnog_n_counts") %>% + add_column(database = "eggnog", field = "n") %>% relocate(value, .after = last_col()) %>% write_tsv('${meta.id}.eggnog_summary.tsv.gz') diff --git a/modules/local/merge_summary_tables.nf b/modules/local/merge_summary_tables.nf index 8556a6fc..d03f1916 100644 --- a/modules/local/merge_summary_tables.nf +++ b/modules/local/merge_summary_tables.nf @@ -33,7 +33,8 @@ process MERGE_TABLES { Sys.glob('*.tsv.gz') %>% read_tsv() %>% mutate(sample = as.character(sample)) %>% - pivot_wider(names_from = c(database,field), values_from = value) %>% + arrange(field, database) %>% + pivot_wider(names_from = c(field,database), values_from = value) %>% write_tsv('${prefix}_merged_table.tsv.gz') writeLines( diff --git a/modules/local/sum_kofamscan.nf b/modules/local/sum_kofamscan.nf index 490ff543..5aaea8a1 100644 --- a/modules/local/sum_kofamscan.nf +++ b/modules/local/sum_kofamscan.nf @@ -45,7 +45,7 @@ process SUM_KOFAMSCAN { inner_join(kofams, by = 'orf') %>% group_by(sample) %>% summarise(value = sum(count), .groups = 'drop') %>% - add_column(database = "kofamscan", field = "kofamscan_n_counts") %>% + add_column(database = "kofamscan", field = "n") %>% relocate(value, .after = last_col()) %>% write_tsv('${meta.id}.kofamscan_summary.tsv.gz') diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf index a310d159..2792c176 100644 --- a/workflows/metatdenovo.nf +++ b/workflows/metatdenovo.nf @@ -297,8 +297,8 @@ workflow METATDENOVO { ) SPADES.out.transcripts - .ifEmpty{ [] } - .combine(SPADES.out.contigs.ifEmpty{ [] } ) + .ifEmpty { [] } + .combine(SPADES.out.contigs.ifEmpty { [] } ) .set { ch_assembly } ch_versions = ch_versions.mix(SPADES.out.versions) @@ -555,22 +555,25 @@ workflow METATDENOVO { // // MODULE: Collect statistics from mapping analysis // - if( !params.skip_eggnog || !params.skip_eukulele || !params.skip_kofamscan) { - MERGE_TABLES ( - ch_merge_tables - .collect() - .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] } + MERGE_TABLES ( + ch_merge_tables + .collect() + .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] } + ) + MERGE_TABLES.out.merged_table + //.view { "merged0: ${it}" } + //.collect { meta, tblout -> tblout } + //.view { "merged1: ${it}" } + //.map { meta, tblout -> [ tblout ] } + //.view { "merged2: ${it}" } + ch_collect_stats = ch_collect_stats + .combine( + Channel.empty() + .mix ( MERGE_TABLES.out.merged_table.map { meta, tblout -> [ tblout ] } ) + .ifEmpty { [ [] ] } + //.map { [ it ] } ) - ch_collect_stats - .combine(MERGE_TABLES.out.merged_table.collect{ meta, tblout -> tblout }.map { [ it ] }) - .set { ch_collect_stats } - ch_versions = ch_versions.mix(MERGE_TABLES.out.versions) - } else { - ch_collect_stats - .map { meta, samples, report, tsv, idxstats, counts -> [ meta, samples, report, tsv, idxstats, counts, [] ] } - .set { ch_collect_stats } - } - //ch_collect_stats.view { it -> "ch_collect_stats: ${it}" } + ch_versions = ch_versions.mix(MERGE_TABLES.out.versions) COLLECT_STATS(ch_collect_stats) ch_versions = ch_versions.mix(COLLECT_STATS.out.versions) From 92c4a9f3f8c874090ae29bdb2864b0ef7c832859 Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 23:23:03 +0100 Subject: [PATCH 6/8] Changelog --- CHANGELOG.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13046e62..3fa1c7db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co ### `Added` -- [#320](<[https://github.com/nf-core/metatdenovo/pull/320](https://github.com/nf-core/metatdenovo/pull/320)>) improvments to Diamond taxonomy plus documentation +- [#320](<[https://github.com/nf-core/metatdenovo/pull/320](https://github.com/nf-core/metatdenovo/pull/320)>) added taxonomy directly with Diamond, part 2 - [#312](<[https://github.com/nf-core/metatdenovo/pull/312](https://github.com/nf-core/metatdenovo/pull/312)>) added taxonomy directly with Diamond, see `--diamond_dbs` - [#286](<[https://github.com/nf-core/metatdenovo/pull/286](https://github.com/nf-core/metatdenovo/pull/286)>) added an option to save the fasta file output from formatspades.nf module - [#285](<[https://github.com/nf-core/metatdenovo/pull/285](https://github.com/nf-core/metatdenovo/pull/285)>) added nf-test for default settings. @@ -18,6 +18,7 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co ### `Changed` +- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Clean up overall stats table - [#323](<[https://github.com/nf-core/metatdenovo/pull/323](https://github.com/nf-core/metatdenovo/pull/323)>) - Modified param names for input of assembly and ORFs; added name params for output file naming - [#323](<[https://github.com/nf-core/metatdenovo/pull/323](https://github.com/nf-core/metatdenovo/pull/323)>) - Removed default for `assembler` and `orf_caller` parameters - [#311](<[https://github.com/nf-core/metatdenovo/pull/311](https://github.com/nf-core/metatdenovo/pull/311)>) - Update modules and subworkflows @@ -29,8 +30,10 @@ Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co ### `Fixed` -- [#305](<[https://github.com/nf-core/ampliseq/pull/681](https://github.com/nf-core/metatdenovo/pull/305)>) - Make EUKulele counts output optional as it's not always created -- [#269](<[https://github.com/nf-core/ampliseq/pull/681](https://github.com/nf-core/metatdenovo/pull/269)>) - Make Transdecoder work better with `-resume` +- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Fix resources for test cases +- [#326](<[https://github.com/nf-core/metatdenovo/pull/326](https://github.com/nf-core/metatdenovo/pull/326)>) - Fix output file names for Eukulele and Kofamscan +- [#305](<[https://github.com/nf-core/metatdenovo/pull/305](https://github.com/nf-core/metatdenovo/pull/305)>) - Make EUKulele counts output optional as it's not always created +- [#269](<[https://github.com/nf-core/metatdenovo/pull/269](https://github.com/nf-core/metatdenovo/pull/269)>) - Make Transdecoder work better with `-resume` ### `Dependencies` From f48a48fa87734d936cd57ae59bf2bbd4d2b4d523 Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 23:30:22 +0100 Subject: [PATCH 7/8] Whitespace --- workflows/metatdenovo.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf index 2792c176..c3d7eab8 100644 --- a/workflows/metatdenovo.nf +++ b/workflows/metatdenovo.nf @@ -555,9 +555,9 @@ workflow METATDENOVO { // // MODULE: Collect statistics from mapping analysis // - MERGE_TABLES ( + MERGE_TABLES ( ch_merge_tables - .collect() + .collect() .map { it -> [ [ id: "${assembly_name}.${orfs_name}" ], it ] } ) MERGE_TABLES.out.merged_table From a64ae1d412cc4bb5f845ba491ae6eeb867fc50cd Mon Sep 17 00:00:00 2001 From: Daniel Lundin Date: Sat, 1 Feb 2025 23:35:02 +0100 Subject: [PATCH 8/8] Prettier --- modules/local/sumtaxonomy/meta.yml | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/modules/local/sumtaxonomy/meta.yml b/modules/local/sumtaxonomy/meta.yml index a0688978..f07700c6 100644 --- a/modules/local/sumtaxonomy/meta.yml +++ b/modules/local/sumtaxonomy/meta.yml @@ -31,24 +31,23 @@ input: - - taxname: type: string description: Name of taxonomy, e.g. eukulele - output: - taxonomy_summary: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1', single_end:false ]` - - "*_summary.tsv.gz": - type: file - pattern: "*._summary.tsv.gz" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_summary.tsv.gz": + type: file + pattern: "*._summary.tsv.gz" + - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@erikrikarddaniel"