1_NCBI_statistics.Rmd

---
title: "Nanopore reads spanning the entire genome of nuclear arthropod-specific large
  DNA viruses"
author: "Jörg Wennmann"
date: "2024-10-06"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Required libraries

The following libraries are required to run this script.

```{r message=FALSE}
library(xml2)
library(XML)
library(ggplot2)
library(patchwork)
library(RColorBrewer)
# Negated membership operator: TRUE for every element of `x` that does not
# occur in `table`.
`%ni%` <- function(x, table) !(x %in% table)
```

## Required functions

```{r}
# Convert Phred quality scores into the probability that a base call is
# correct, i.e. 1 - 10^(-Q / 10). Note that the result is the complement of
# the error probability. Vectorised over `phred_scores`.
phred_to_probability <- function(phred_scores) {
  error_rate <- 10^(-phred_scores / 10)
  1 - error_rate
}

# Convert the probability of a correct base call back into a Phred score:
# Q = -10 * log10(1 - p). Vectorised over `probability`.
probability_to_phred <- function(probability) {
  error_rate <- 1 - probability
  -10 * log10(error_rate)
}

# Mean Phred score of one quality string.
#
# The characters of `qualities` are decoded with an ASCII offset of 33, each
# score is turned into a probability with phred_to_probability(), the
# probabilities are averaged, and the average is converted back to a Phred
# score with probability_to_phred(). The averaging therefore happens on the
# probability scale rather than on the raw scores.
meanPhredScore <- function(qualities) {
  quality_string <- as.character(qualities)
  per_base_phred <- as.numeric(charToRaw(quality_string)) - 33
  mean_probability <- mean(phred_to_probability(per_base_phred))

  probability_to_phred(mean_probability)
}
```


## Consensus genomes published on NCBI Genbank

To get an overview of the number of published NALDV genomes (as of July 2024), all NCBI Genbank entries found under the individual names of the virus families (Baculoviridae, Nudiviridae, Nimaviridae, Hytrosaviridae) are available in the directory /data/xml. A filter has been set to only allow sequences in the range of complete genomes. The list is probably not complete, but it is a good approximation of all genomes that are publicly available at this time.

The XML files are first read into R and converted into a data frame. The virus name (isolate, strain), accession number, genome length and publication date are extracted.

### Function to read and process XML files

A function is needed to read the NCBI Genbank XML files; it is provided below. However, the XML files are very large and therefore not all of them are included in this GitHub repository: only one file (for the virus family Hytrosaviridae) is provided. Instead, the combined output for all XML files was saved as a CSV file and added to the repository. This CSV file is imported below so that the entire data set can be further processed.

```{r}
# Read an NCBI GenBank INSDSeq XML export and return one row per record.
#
# Arguments:
#   file_path        Path to the XML file.
#   family_name      Value written to the `Family` column of every row.
#   must_be_in_name  Optional character vector of regular expressions. Only
#                    records whose source matches at least one of them
#                    (case-sensitive) are kept. NULL or an empty vector
#                    disables the filter.
#   alpha_terms      Optional patterns (case-insensitive); matching records
#                    get the category "alpha, gamma and delta".
#   beta_terms       Optional patterns (case-insensitive); matching records
#                    get the category "beta". Applied after `alpha_terms`, so
#                    "beta" wins when both match.
#
# Returns a data.frame with the columns AccessionNumber, Length, CreateDate,
# Source (all character, as read from the XML), Category and Family.
process_virus_family <- function(file_path, family_name, must_be_in_name = NULL, alpha_terms = NULL, beta_terms = NULL) {

  # Read the XML file and collect all INSDSeq records
  records <- xml_find_all(read_xml(file_path), "//INSDSeq")

  # Extract one value per record. xml_find_first() returns exactly one entry
  # per record (NA when the field is missing), so the columns stay aligned
  # even if a record lacks one of the fields.
  field_of <- function(tag) {
    xml_text(xml_find_first(records, paste0(".//", tag)))
  }

  df <- data.frame(AccessionNumber = field_of("INSDSeq_locus"),
                   Length = field_of("INSDSeq_length"),
                   CreateDate = field_of("INSDSeq_create-date"),
                   Source = field_of("INSDSeq_source"),
                   stringsAsFactors = FALSE)

  # Keep only records whose source matches at least one required term.
  # Reduce() over per-term logical vectors works for any number of records
  # and terms, including a single record or a single term.
  if (length(must_be_in_name) > 0) {
    hits_per_term <- lapply(must_be_in_name, grepl, x = df$Source)
    keep <- Reduce(`|`, hits_per_term, rep(FALSE, nrow(df)))
    df <- df[keep, , drop = FALSE]
  }

  # Category stays NA unless alpha/beta terms are supplied and match.
  # rep() keeps the assignment valid for a data frame with zero rows.
  df$Category <- rep(NA, nrow(df))

  for (word in alpha_terms) {
    df$Category[grepl(word, df$Source, ignore.case = TRUE)] <- "alpha, gamma and delta"
  }

  for (word in beta_terms) {
    df$Category[grepl(word, df$Source, ignore.case = TRUE)] <- "beta"
  }

  # Add family tag
  df$Family <- rep(family_name, length.out = nrow(df))

  df
}
```

### Importing the entire genome data set from a CSV file

The code below builds a data frame from the XML files (available from NCBI GenBank), extracting the names of the virus genomes, their lengths and their publication dates. However, since the XML files are too large to include in this GitHub repository, the code first checks whether the XML files have already been processed and saved as a CSV file. Since this is the case, the existing CSV file that is attached to the repository is read and used.

```{r}
# Location of the cached table that holds the records of all four families
file_path <- "output/genome_genbank_table/xml_imported_virus_data.csv"

if (file.exists(file_path)) {

  # The cached CSV exists: read it and split it into one data frame per family
  df <- read.csv(file_path, stringsAsFactors = FALSE)
  dfBAC <- df[df$Family == "Baculoviridae", ]
  dfNimV <- df[df$Family == "Nimaviridae", ]
  dfNV <- df[df$Family == "Nudiviridae", ]
  dfHV <- df[df$Family == "Hytrosaviridae", ]

} else {

  # No cache yet: rebuild the table from the GenBank XML exports.

  # Baculoviridae: keep only entries whose source names a baculovirus and
  # assign each entry to a category based on the matched term.
  dfBAC <- process_virus_family(
    file_path = "data/xml/Baculoviridae_ncbi_genbank_entry.xml",
    family_name = "Baculoviridae",
    must_be_in_name = c("nucleopolyhedrovirus",
                        "granulovirus",
                        "SNPV",
                        "nuclopolyhedrovirus",
                        "Alphabaculovirus",
                        "Betabaculovirus",
                        "betabaculovirus",
                        "NPV",
                        "nucleopolyhedrosis",
                        "alphabaculovirus"),
    alpha_terms = c("nucleopolyhedrovirus",
                    "SNPV",
                    "nuclopolyhedrovirus",
                    "Alphabaculovirus",
                    "NPV",
                    "nucleopolyhedrosis",
                    "alphabaculovirus"),
    beta_terms = c("granulovirus",
                   "Betabaculovirus",
                   "betabaculovirus")
  )

  # Nudiviridae
  dfNV <- process_virus_family(
    file_path = "data/xml/Nudiviridae_ncbi_genbank_entry.xml",
    family_name = "Nudiviridae"
  )

  # Hytrosaviridae
  dfHV <- process_virus_family(
    file_path = "data/xml/Hytrosaviridae_ncbi_genbank_entry.xml",
    family_name = "Hytrosaviridae"
  )

  # Nimaviridae
  dfNimV <- process_virus_family(
    file_path = "data/xml/Nimaviridae_ncbi_genbank_entry.xml",
    family_name = "Nimaviridae"
  )

  df <- rbind(dfNV, dfBAC, dfHV, dfNimV)

  # Export to CSV. The target directory is created first because write.csv()
  # cannot write into a directory that does not exist.
  dir.create(dirname(file_path), recursive = TRUE, showWarnings = FALSE)
  write.csv(df, file = file_path, row.names = FALSE)

}

print(head(df))
```

### Visualisation of genome publications over time

After the data frame has been created, the time course of the publication of the consensus sequences can be displayed with ggplot2.

```{r}
# Convert the CreateDate column (day-month abbreviation-year) to Date objects.
# Note that %b is matched against the month names of the current locale.
df$CreateDate <- as.Date(df$CreateDate, format = "%d-%b-%Y")

# Records whose date could not be parsed are dropped explicitly: an NA in a
# logical row index would otherwise insert all-NA rows (and an NA family).
unparsed_dates <- is.na(df$CreateDate)
if (any(unparsed_dates)) {
  warning(sum(unparsed_dates),
          " record(s) without a parsable CreateDate were dropped",
          call. = FALSE)
}

# Restrict the data to entries created up to 30 July 2024
df <- df[!unparsed_dates & df$CreateDate <= as.Date("2024-07-30"), ]

# Cumulative number of entries per family
families <- unique(df$Family)
cumulative_values <- list()

# Daily grid shared by all families so that every curve ends on 30 July 2024
full_dates <- seq.Date(from = as.Date("1994-01-01"), to = as.Date("2024-07-30"), by = "day")

for (family in families) {
  family_data <- df[df$Family == family, ]
  family_data <- family_data[order(family_data$CreateDate), ]
  family_data$Value <- 1
  family_data$CumulativeValue <- cumsum(family_data$Value)

  # Left-join onto the daily grid; days without a new entry contribute 0
  complete_data <- merge(data.frame(CreateDate = full_dates), family_data, by = "CreateDate", all.x = TRUE)
  complete_data$Family <- family
  complete_data$Value[is.na(complete_data$Value)] <- 0
  complete_data$CumulativeValue <- cumsum(complete_data$Value)

  cumulative_values[[family]] <- complete_data
}

df_complete <- do.call(rbind, cumulative_values)

df_complete$Family <- factor(df_complete$Family, levels = c("Baculoviridae", "Nimaviridae", "Nudiviridae", "Hytrosaviridae"))

#----------PLOT------------

colorPalette <- "Oranges"

date_breaks <- seq(as.Date("1994-01-01"), as.Date("2024-07-30"), by = "2 years")

pGB <- ggplot(df_complete, aes(x = CreateDate, y = CumulativeValue,
                               fill = Family, group = Family)) +
  # Areas use the default position of geom_area(); position = "identity" was
  # tried and left disabled.
  geom_area(colour = "black",
            linewidth = 0.2, alpha = 0.6) +
  scale_fill_manual(values = setNames(brewer.pal(4, colorPalette),
                                      c("Baculoviridae", "Nimaviridae",
                                        "Nudiviridae", "Hytrosaviridae")),
                    # Legend labels carry the number of entries per family
                    labels = c(paste0("Baculoviridae = ", nrow(dfBAC)),
                               paste0("Nimaviridae = ", nrow(dfNimV),
                                      " (incl. MAGs)"),
                               paste0("Nudiviridae = ", nrow(dfNV)),
                               paste0("Hytrosaviridae = ", nrow(dfHV)))) +
  theme_bw() +
  # Both limits are combined as Date objects. Mixing a character string and a
  # Date in c() would turn the Date into a number string that as.Date()
  # cannot parse, leaving the upper limit NA.
  scale_x_date(breaks = date_breaks, date_labels = "%Y",
               limits = c(as.Date("1994-01-01"), max(df_complete$CreateDate))) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 700),
                     breaks = seq(0, 700, by = 50)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position  = c(0.4, 0.7),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ylab("Cumulative Frequency") +
  xlab("Year of NCBI Genbank Submission")

print(pGB)
```

```{r}
# Same cumulative GenBank plot as above, but with the y axis on the right
pGB_right <- ggplot(df_complete, aes(x = CreateDate, y = CumulativeValue,
                               fill = Family, group = Family)) +
  geom_area(colour = "black", # thin black outline between the areas
            linewidth = 0.2, alpha = 0.6) +
  scale_fill_manual(values = setNames(brewer.pal(4, colorPalette),
                                      c("Baculoviridae", "Nimaviridae",
                                        "Nudiviridae", "Hytrosaviridae")),
                    # Legend labels carry the number of entries per family
                    labels = c(paste0("Baculoviridae = ", nrow(dfBAC)),
                               paste0("Nimaviridae = ", nrow(dfNimV),
                                      " (incl. MAGs)"),
                               paste0("Nudiviridae = ", nrow(dfNV)),
                               paste0("Hytrosaviridae = ", nrow(dfHV)))) +
  theme_bw() +
  # Both limits are combined as Date objects. Mixing a character string and a
  # Date in c() would turn the Date into a number string that as.Date()
  # cannot parse, leaving the upper limit NA.
  scale_x_date(breaks = date_breaks, date_labels = "%Y",
               limits = c(as.Date("1994-01-01"), max(df_complete$CreateDate))) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 700),
                     breaks = seq(0, 700, by = 50),
                     position = "right") + # move the y axis to the right
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position  = c(0.4, 0.7),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ylab("Cumulative Frequency") +
  xlab("Year of NCBI Genbank Submission")

print(pGB_right)

# Scaling factor for the size of the saved figure
f <- 2
# Save the plot as a PNG file; make sure the output directory exists first
file_name <- "output/ncbi_stats/NALDV_stats_on_Genbank_Yright.png"
dir.create(dirname(file_name), recursive = TRUE, showWarnings = FALSE)
ggsave(file_name, pGB_right, width = 3*f, height = 2.5*f)
```


## Sequence data available from NCBI SRA

Analysing the availability of Illumina and Nanopore data sets on NCBI SRA is a bit easier. You can search and filter for the data sets on NCBI, and all hits can be downloaded directly as CSV files. These files then serve as input for the subsequent analysis. The result is a plot showing the time course of the published SRA data sets (Illumina and Nanopore) (as of July 2024).

```{r}
# Load the SRA run tables (one CSV per virus family) and stack them
sra_df_BAC <- read.csv("data/sra_table/SraRunInfo_BAC.csv")
sra_df_NV <- read.csv("data/sra_table/SraRunInfo_NV.csv")
sra_df_Nim <- read.csv("data/sra_table/SraRunInfo_Nim.csv")

sra_df <- do.call(rbind, list(sra_df_BAC, sra_df_NV, sra_df_Nim))

# Sort all runs chronologically by release date
sra_df$ReleaseDate <- as.Date(sra_df$ReleaseDate)
sra_df <- sra_df[order(sra_df$ReleaseDate), ]

# Split by sequencing platform; sra_NA collects every other platform
known_platforms <- c("ILLUMINA", "OXFORD_NANOPORE", "PACBIO_SMRT")
sra_IL <- sra_df[which(sra_df$Platform == "ILLUMINA"), , drop = FALSE]
sra_PB <- sra_df[which(sra_df$Platform == "PACBIO_SMRT"), , drop = FALSE]
sra_ON <- sra_df[which(sra_df$Platform == "OXFORD_NANOPORE"), , drop = FALSE]
sra_NA <- sra_df[!(sra_df$Platform %in% known_platforms), , drop = FALSE]

# Running count of released data sets within each platform
sra_IL$CumulativeCount <- seq_len(nrow(sra_IL))
sra_ON$CumulativeCount <- seq_len(nrow(sra_ON))
sra_PB$CumulativeCount <- seq_len(nrow(sra_PB))
sra_NA$CumulativeCount <- seq_len(nrow(sra_NA))

# Simplify the data: Illumina is the short-read group ...
sra_short <- sra_IL
sra_short$Platform <- "Short Read: Illumina"

# ... and ONT plus PacBio are merged into one long-read group, re-sorted and
# re-counted together
sra_long <- rbind(sra_ON, sra_PB)
sra_long <- sra_long[order(sra_long$ReleaseDate), ]
sra_long$Platform <- "Long Read: ONT and PacBio"
sra_long$CumulativeCount <- seq_len(nrow(sra_long))

# Combine both groups; the short-read level comes first
sra_platform <- rbind(sra_long, sra_short)
sra_platform$Platform <- factor(sra_platform$Platform,
                                levels = c("Short Read: Illumina",
                                           "Long Read: ONT and PacBio"))

pSRA <- ggplot(sra_platform, aes(x = ReleaseDate, y = CumulativeCount,
                                 fill = Platform, group = Platform)) +
  # Overlaid (not stacked) areas, one per platform group
  geom_area(colour = "black",
            position = "identity",
            linewidth = 0.2, alpha = 0.8) +
  scale_fill_brewer(palette = "Greens") +
  theme_bw() +
  scale_x_date(breaks = date_breaks, date_labels = "%Y",
               limits = as.Date(c("2012-01-01", "2024-12-31"))) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 250),
                     breaks = seq(0, 250, by = 50)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = c(0.3, 0.85),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(x = "Year of NCBI SRA Release",
       y = "Cumulative Frequency") +
  # Text note placed inside the panel
  annotate("text", x = as.Date("2020-09-01"), y = 100,
           label = "+153 LdMNPV (2018/02)",
           size = 3, color = "black")

print(pSRA)
```

The library patchwork can be used to create beautiful graphics. Here, the NALDV statistics from NCBI Genbank and NCBI SRA are compared.

```{r}
# Place the GenBank and SRA plots side by side and tag the panels A, B
combined_plot <- (pGB + pSRA) + plot_annotation(tag_levels = 'A')

# Show the combined plot
print(combined_plot)

# Scaling factor for the size of the saved figure
f <- 1.2
# Save the combined plot as a PNG file; make sure the output directory
# exists first
file_name <- "output/ncbi_stats/NALDV_stats_on_Genbank_SRA.png"
dir.create(dirname(file_name), recursive = TRUE, showWarnings = FALSE)
ggsave(file_name, combined_plot, width = 8*f, height = 3*f)
```

# Error probability NCBI statistics

```{r}
data_ncbi <- read.csv(file = "data/read_length_quality/NCBI_read_statistics.csv")

# Data for the reference curve (Phred 0-40). phred_to_probability() returns
# 1 - 10^(-Q/10), so the column named ErrorProbability actually holds the
# complement of the error probability; the plots below use
# 1 - ErrorProbability to obtain the error probability itself.
all_phred_scores <- 0:40
all_error_probabilities <- phred_to_probability(all_phred_scores)
data_all <- data.frame(Phred=all_phred_scores, ErrorProbability=all_error_probabilities)

# Manual y axis breaks and labels for the log-scaled plot
y_breaks <- c(1, 0.1, 0.01, 0.001, 0.0001)
y_labels <- c("100%", "10%", "1%", "0.1%", "0.01%")

# Mean quality score per ISO, in order of first appearance in the table.
# vapply() guarantees a numeric vector regardless of the input size.
isos <- unique(data_ncbi$ISO)
mean_quality_scores <- vapply(isos, function(iso) {
  mean(data_ncbi$Mean_Quality_Score[data_ncbi$ISO == iso])
}, numeric(1))
mean_quality_probabilities <- phred_to_probability(mean_quality_scores)

# Display labels, listed in the same order as `isos`
LAB <- c("2015: AcMNPV-WP10 (SRR1118212)\nIllumina Genome Analyzer IIx",
         "2024: BmNPV-Th2 (SRR25338386)\nIllumina NextSeq500", 
         "2024: HearNPV-IIPR05 (SRR21146807)\nIllumina HiSeq2500", 
         "2024: OpbuNPV (SRR27473752)\nIllumina MiSeq", 
         "2024: BmNPV-Th2 (SRR27030578)\nONT Rapid Ligation Kit", 
         "2023: PaviNPV-Gz05 (SRR17312069)\nONT Ligation Kit"
         )

# data.frame() would silently recycle the labels if the number of ISOs
# divides the number of labels, attaching labels to the wrong rows.
if (length(isos) != length(LAB)) {
  stop("Expected ", length(LAB), " ISO values in NCBI_read_statistics.csv, found ",
       length(isos), call. = FALSE)
}

data_means <- data.frame(ISO = isos, 
                         Mean_Quality_Score = mean_quality_scores, 
                         Mean_Quality_Probability = mean_quality_probabilities,
                         LAB)

# Order in which the labels appear in the legend
data_labels <- c("2024: BmNPV-Th2 (SRR27030578)\nONT Rapid Ligation Kit", 
                 "2023: PaviNPV-Gz05 (SRR17312069)\nONT Ligation Kit",
                 "2015: AcMNPV-WP10 (SRR1118212)\nIllumina Genome Analyzer IIx",
                 "2024: BmNPV-Th2 (SRR25338386)\nIllumina NextSeq500", 
                 "2024: HearNPV-IIPR05 (SRR21146807)\nIllumina HiSeq2500", 
                 "2024: OpbuNPV (SRR27473752)\nIllumina MiSeq"
)

data_means$LAB <- factor(data_means$LAB, levels = c(data_labels))
custom_colors <- brewer.pal(length(LAB), "Purples")

# Offsets that are not used by the plots in this chunk
x_offsets <- c(4, -4, 2, 3.5, -4.5)
y_offsets <- c(0.002, -0.00055, 0.0005, 0.09, -0.02)

# All figures of this chunk are written here; create the directory if needed
dir.create("output/ncbi_stats", recursive = TRUE, showWarnings = FALSE)

# Plot with a linear y axis
pQP <- ggplot() +
  geom_line(data = data_all, aes(x = Phred, y = 1-ErrorProbability), color="darkblue") +
  geom_point(data = data_means, aes(x = Mean_Quality_Score, y = 1 - Mean_Quality_Probability, fill = LAB), 
             shape=22, size=4, colour="black", stroke = 0.2) +
  geom_segment(aes(x = 0, y = 0.01, xend = 20, yend = 0.01), 
               linetype = "dotted", color = "black") + # horizontal guide at 1% error
  geom_segment(aes(x = 20, y = 0, xend = 20, yend = 0.01), 
               linetype = "dotted", color = "black") + # vertical guide at Q20
  scale_y_continuous(expand = c(0, 0), limits = c(-0.03, 1), breaks = c(0, 0.25, 0.5, 0.75, 1), labels = c("0%", "25%", "50%", "75%", "100%")) +
  scale_x_continuous(expand = c(0,0)) +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Phred Quality Score (Q)", y = "Error Probability (%)") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "right",
        legend.title = element_blank(),
        legend.text = element_text(size = 8),
        legend.key.height = unit(1.5, "lines"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) 

# Save the linear plot (scaling factor f applies to width and height)
f <- 2

ggsave(plot = pQP, filename = "output/ncbi_stats/phred_vs_probability.png",
       units = "cm", 
       dpi = 300, 
       width = 9 * f, 
       height = 5 * f)


# Plot with a log-transformed y axis
pQP_log <- ggplot() +
  geom_line(data = data_all, aes(x = Phred, y = 1-ErrorProbability), color="darkblue") +
  geom_point(data = data_means, aes(x = Mean_Quality_Score, y = 1 - Mean_Quality_Probability, fill = LAB), 
             shape=22, size=4, colour="black", stroke = 0.2) +
  geom_segment(aes(x = 20, y = 0.01, xend = 0, yend = 0.01), 
               linetype = "dotted", color = "black") + # horizontal guide at 1% error
  geom_segment(aes(x = 20, y = 0.01, xend = 20, yend = 0.0001), 
               linetype = "dotted", color = "black") + # vertical guide at Q20
  scale_y_log10(expand = c(0,0), breaks = y_breaks, labels = y_labels) +
  scale_x_continuous(expand = c(0,0)) +
  scale_fill_manual(values = custom_colors) +
  labs(x="Phred Quality Score (Q)",
       y="Error Probability (%)") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "right",
        legend.title = element_blank(),
        legend.text = element_text(size = 8),
        legend.key.height = unit(1.5, "lines"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) 


# Save the log-scaled plot
f <- 2

ggsave(plot = pQP_log, filename = "output/ncbi_stats/phred vs probability_log.png", 
       units = "cm", 
       dpi = 300, 
       width = 9 * f, 
       height = 5 * f)


######

# Combine both plots; the legend is shown only once (next to the log plot)
pQP <- pQP + theme(legend.position = "none")
combined_plot <- (pQP + pQP_log) + plot_annotation(tag_levels = 'A')

# Show the combined plot
print(combined_plot)

f <- 1.2
# Save the combined plot as a PNG file
file_name <- "output/ncbi_stats/phred_vs_probability_combined.png"
ggsave(file_name, combined_plot, width = 8*f, height = 3*f)
```