Step 6: Visualize the Top Differentially Expressed Genes and their Functions

Using the table of annotated differentially expressed genes (generated in step 5), make some insightful data visualizations to understand the differing key biological processes between coral embryos and coral recruits

Author

Sarah Tanja

Published

June 1, 2023

Install Packages

if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse')
if ("ggplot2" %in% rownames(installed.packages()) == 'FALSE') install.packages('ggplot2')
if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2')
if ("kableExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('kableExtra')
if ("RColorBrewer" %in% rownames(installed.packages()) == 'FALSE') install.packages('RColorBrewer')
if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap')

Load Libraries

library(tidyverse)
library(ggplot2)
library(DESeq2)
library(kableExtra)
library(RColorBrewer)
library(pheatmap)

Read in Data

annote_counts <- read.delim(file = "../output/annote_counts.tab", sep = " ", header = TRUE)
head(annote_counts)

Let’s only look at the differential expression that is significant (adjusted p value is less than 0.05)

This code chunk subsets the deseq2.res data frame, selecting the rows where both of the following conditions are satisfied: !is.na(annote_counts$padj) adjusted p-value is not missing annote_counts$padj <= 0.05 adjusted p-value is less than or equal to 0.05

In summary, the code selects rows from the deseq2.res data frame where the padj values are not missing and are less than or equal to 0.05. It then calculates the dimensions (number of rows and columns) of this subset, giving us the number of significantly differentially expressed genes between our comparative groups ‘embryos’ and ‘recruits’

annote_counts_sig <- (annote_counts[!is.na(annote_counts$padj) & annote_counts$padj <= 0.05, ])

head(annote_counts_sig)

Now that we have our annotated counts dataframe subset to only significant p-values, we can order them in descending adjusted p value

acs_ordered <- annote_counts_sig[order(annote_counts_sig$padj), ]

Make it a matrix so we can give it rownames by GO term

acs_ordered_matrix <- as.matrix(acs_ordered) # make this a matrix again so it can take the rownames() function

GO_term <- acs_ordered$Gene.Ontology..biological.process. # making a vector of names

rownames(acs_ordered_matrix) <- GO_term # rename all rows

We want the heatmap to be a matrix that has GO term rownames, biological sample columns, and log2 transformed count data as values!

# Select top differentially expressed genes from the ordered matrix
top_genes <- acs_ordered_matrix[1:20, ]

Select only the biological sample columns from the top_genes matrix to work with

bio_samples <-  c("SRR22293447",
                  "SRR22293448",
                  "SRR22293449",
                  "SRR22293450",
                  "SRR22293451",
                  "SRR22293452",
                  "SRR22293453",
                  "SRR22293454")

heat_matrix <- top_genes[ , bio_samples ]

Log2 transform the matrix data

heat_matrix_num <- matrix(as.numeric(heat_matrix),
                          ncol = ncol(heat_matrix))

colnames(heat_matrix_num) <- colnames(heat_matrix)

#scrub up protein names list so that each entry is not as long 
protein_names <- rownames(heat_matrix) %>%
  sub(' \\([^)]+\\).*$', '', .) %>%
  sub('\\[.*?\\]', ' ', .)

rownames(heat_matrix_num) <- protein_names

heat_matrix_log2 <- log2(heat_matrix_num + 1)

#define some color palettes to play with from RColorBrewer
BrBG <- colorRampPalette(brewer.pal(8, "BrBG"))(25)
blue <- colorRampPalette(brewer.pal(8, "Blues"))(25)
blind <- brewer.pal(8, "Set1")

# generating heatmaps
heatmap(heat_matrix_log2,
          scale = "column",  # Scale the columns
          col = BrBG,        # Set the color palette
          cexCol = 0.8,       # adjust size of column labels
          cexRow = 0.8)

# Customize the heatmap using pheatmap
pheatmap(heat_matrix_log2,
          labels_col = c("embryo", "embryo", "embryo", "embryo", "recruit", "recruit", "recruit", "recruit"),
          scale="column",
          cellwidth = 15,
          cellheight = 11,
          fontsize = 12,
          show_rownames = FALSE,
          cexRow=0.8,
          color = BrBG,
          ylab = "GO Terms",         # y-axis label
          xlab = "Sample Run ID",    # x-axis label
          key=TRUE,                  # Add a legend
          trace="none",              # Remove the trace lines
          cexCol=0.8)                # Adjust column label font size

--- title: "Step 6: Visualize the Top Differentially Expressed Genes and their Functions" subtitle: "Using the table of annotated differentially expressed genes **(generated in step 5)**, make some insightful data visualizations to understand the differing key biological processes between coral embryos and coral recruits" author: "Sarah Tanja" date: "`r format(Sys.time(), '%d %B, %Y')`" format: html: df-print: paged toc: true toc-location: left smooth-scroll: true link-external-icon: true link-external-newwindow: true code-fold: false code-tools: true code-copy: true highlight-style: breeze code-overflow: wrap theme: minty --- ```{r setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages #fig.width = 6, # Set plot width in inches #fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` ### Install Packages ```{r install packages, filename='r'} if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse') if ("ggplot2" %in% rownames(installed.packages()) == 'FALSE') install.packages('ggplot2') if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2') if ("kableExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('kableExtra') if ("RColorBrewer" %in% rownames(installed.packages()) == 'FALSE') install.packages('RColorBrewer') if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap') ``` ### Load Libraries ```{r load libraries, filename='r'} library(tidyverse) library(ggplot2) library(DESeq2) library(kableExtra) library(RColorBrewer) library(pheatmap) ``` ### Read in Data ```{r, filename='r'} annote_counts <- read.delim(file = "../output/annote_counts.tab", sep = " ", header = TRUE) head(annote_counts) ``` Let's only look at the differential expression that is significant (adjusted p value is less than 0.05) This code chunk subsets the deseq2.res data frame, selecting the rows where both of the following conditions are satisfied: `!is.na(annote_counts$padj)` adjusted p-value is not missing `annote_counts$padj <= 0.05` adjusted p-value is less than or equal to 0.05 In summary, the code selects rows from the deseq2.res data frame where the padj values are not missing and are less than or equal to 0.05. It then calculates the dimensions (number of rows and columns) of this subset, giving us the number of significantly differentially expressed genes between our comparative groups 'embryos' and 'recruits' ```{r} annote_counts_sig <- (annote_counts[!is.na(annote_counts$padj) & annote_counts$padj <= 0.05, ]) head(annote_counts_sig) ``` Now that we have our annotated counts dataframe subset to only significant p-values, we can order them in descending adjusted p value ```{r} acs_ordered <- annote_counts_sig[order(annote_counts_sig$padj), ] ``` Make it a matrix so we can give it rownames by GO term ```{r} acs_ordered_matrix <- as.matrix(acs_ordered) # make this a matrix again so it can take the rownames() function GO_term <- acs_ordered$Gene.Ontology..biological.process. # making a vector of names rownames(acs_ordered_matrix) <- GO_term # rename all rows ``` We want the heatmap to be a matrix that has GO term rownames, biological sample columns, and log2 transformed count data as values! ```{r} # Select top differentially expressed genes from the ordered matrix top_genes <- acs_ordered_matrix[1:20, ] ``` Select only the biological sample columns from the top_genes matrix to work with ```{r} bio_samples <- c("SRR22293447", "SRR22293448", "SRR22293449", "SRR22293450", "SRR22293451", "SRR22293452", "SRR22293453", "SRR22293454") heat_matrix <- top_genes[ , bio_samples ] ``` Log2 transform the matrix data ```{r} heat_matrix_num <- matrix(as.numeric(heat_matrix), ncol = ncol(heat_matrix)) colnames(heat_matrix_num) <- colnames(heat_matrix) #scrub up protein names list so that each entry is not as long protein_names <- rownames(heat_matrix) %>% sub(' \$[^)]+\$.*$', '', .) %>% sub('\\[.*?\\]', ' ', .) rownames(heat_matrix_num) <- protein_names ``` ```{r} heat_matrix_log2 <- log2(heat_matrix_num + 1) ``` ```{r} #define some color palettes to play with from RColorBrewer BrBG <- colorRampPalette(brewer.pal(8, "BrBG"))(25) blue <- colorRampPalette(brewer.pal(8, "Blues"))(25) blind <- brewer.pal(8, "Set1") ``` ```{r} # generating heatmaps heatmap(heat_matrix_log2, scale = "column", # Scale the columns col = BrBG, # Set the color palette cexCol = 0.8, # adjust size of column labels cexRow = 0.8) ``` ```{r} # Customize the heatmap using pheatmap pheatmap(heat_matrix_log2, labels_col = c("embryo", "embryo", "embryo", "embryo", "recruit", "recruit", "recruit", "recruit"), scale="column", cellwidth = 15, cellheight = 11, fontsize = 12, show_rownames = FALSE, cexRow=0.8, color = BrBG, ylab = "GO Terms", # y-axis label xlab = "Sample Run ID", # x-axis label key=TRUE, # Add a legend trace="none", # Remove the trace lines cexCol=0.8) # Adjust column label font size ```