title: “Running DESeq2 pipeline” output: html_notebook
# Running DESeq2
This code performs differential expression analysis to identify deferentially expressed genes (DEGs).
First, it reads in a count matrix of isoform counts generated by `kallisto` and `trinity`, with row names set to the gene/transcript IDs and the first column removed. It then rounds the counts to whole numbers.
The `results()` function is used to extract the results table, which is ordered by gene/transcript ID.
The code then prints the top few rows of the results table and calculates the number of DEGs with an adjusted p-value less than or equal to 0.05. It plots the log2 fold changes versus the mean normalized counts for all genes, highlighting significant DEGs in red and adding horizontal lines at 2-fold upregulation and downregulation. Finally, it writes the list of significant DEGs to a file called "DEGlist.tab".
### Install Packages
if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2')
if ("kableExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('kableExtra')
Load Libraries
library(DESeq2)
library(kableExtra)
library(tidyverse)
library(ggplot2)
Read in count matrix
countmatrix <- read.delim("../output/trinity-matrix/mcap.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
##Round integers up to whole numbers for analysis
countmatrix <- round(countmatrix, 0)
head(countmatrix)
#ggplot(countmatrix) + geom_histogram(aes(x=sex), stat="bin", bins= 200) + xlab("Raw expression counts") + ylab("Number of genes") + xlim(0, 25000) + ylim(0, 5000)
Create Dataframe
# make a dataframe of 4 embryos and 4 recruits
deseq2.colData <- data.frame(condition=factor(c(rep("embryos", 4), rep("recruits", 4))),
type=factor(rep("single-read", 8)))
# set row names to match the column names in the count matrix
rownames(deseq2.colData) <- colnames(data)
# DESeqDataSet object created using the `DESeqDataSetFromMatrix` function
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
Negative Binomial: DESeq2 Results
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
dim(deseq2.res)
summary(deseq2.res)
# deseq2.res will be used in step 6.. is there a way to save this and read it in, rather than coming back and re-running this code?
deseq2.sig.res <- (deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
head(deseq2.sig.res)
# calculate the dimensions of the resulting subset. The dim() function returns a vector with two elements: the number of rows and the number of columns in the subset.
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
Plotting
PCA
vsd <- vst(deseq2.dds, blind = FALSE)
pca <- plotPCA(vsd, intgroup = "condition")
pca
ggsave("../figs/pca.png", plot = pca, width = 8, height = 4, dpi = 600)
Heatmap
Install Packages
if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap')
Load Libraries
library(pheatmap)
Top 50 Differentially Expressed Genes
# Select top 20 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:20]
# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]
# Log-transform counts
log_counts_top <- log2(counts_top + 1)
# Generate heatmap
heatmap_20 <- pheatmap(log_counts_top, scale = "row")
ggsave("../figs/heatmap_20.png", plot = heatmap_20, width = 8, height = 5, dpi = 600)
heatmap_20
Fancy Volcano
# The main plot
fancy_volcano <- plot(deseq2.res$baseMean, deseq2.res$log2FoldChange,
pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Coral Early Life History (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
points(deseq2.sig.res$baseMean, deseq2.sig.res$log2FoldChange, pch=20, cex=0.45, col="red")
abline(h=c(-1,1), col="blue")
dev.print(png, file = "../figs/fancy-volcano.png", width = 6, height = 4, units = "in", res = 300)
Basic Volcano
# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)
# Create volcano plot
basic_volcano <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("grey", "red")) +
labs(title = "Volcano Plot",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-value",
color = "Significantly\nDifferentially Expressed") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "top")
# Save plot
ggsave(path = "../figs", filename="basic-volcano.png")
# Print plot
print(basic_volcano)
Save the list of Differentially Expressed Genes (DEGs)!
Write the output to a table
mkdir -p ../output/deseq
write.table(deseq2.res, "../output/deseq/DEGlist.tab", sep = '\t', row.names = T)
Let’s look at this list
deglist <- read.csv("../output/deseq/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns
head(deglist2)
#rank-sum Wilcoxon test for differential expressed genes
#Run the rank-sum Wilcoxon test for each gene
pvalues <- sapply(1:nrow(count_norm),function(i){
data<-cbind.data.frame(gene=as.numeric(t(count_norm[i,])),conditions)
p=wilcox.test(gene~conditions, data)$p.value
return(p)
})
fdr=p.adjust(pvalues,method = "fdr")
#Calculate fold-change for each gene
conditionsLevel<-levels(conditions)
dataCon1=count_norm[,c(which(conditions==conditionsLevel[1]))]
dataCon2=count_norm[,c(which(conditions==conditionsLevel[2]))]
foldChanges=log2(rowMeans(dataCon2)/rowMeans(dataCon1))
###Output results base on FDR threshold
outRst<-data.frame(log2foldChange=foldChanges, pValues=pvalues, FDR=fdr)
rownames(outRst)=rownames(count_norm)
---
output:
  html_notebook: default
  html_document: default
---
---
title: "Running DESeq2 pipeline"
output: html_notebook

```{r}
# Running DESeq2

This code performs differential expression analysis to identify deferentially expressed genes (DEGs).

First, it reads in a count matrix of isoform counts generated by `kallisto` and `trinity`, with row names set to the gene/transcript IDs and the first column removed. It then rounds the counts to whole numbers.

The `results()` function is used to extract the results table, which is ordered by gene/transcript ID.

The code then prints the top few rows of the results table and calculates the number of DEGs with an adjusted p-value less than or equal to 0.05. It plots the log2 fold changes versus the mean normalized counts for all genes, highlighting significant DEGs in red and adding horizontal lines at 2-fold upregulation and downregulation. Finally, it writes the list of significant DEGs to a file called "DEGlist.tab".

### Install Packages

```{r install-packages, filename='r'}
if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2')
if ("kableExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('kableExtra')
```

### Load Libraries

```{r, filename = 'r'}
library(DESeq2)
library(kableExtra)
library(tidyverse)
library(ggplot2)
```

### Read in count matrix

```{r, filename = 'r'}
countmatrix <- read.delim("../output/trinity-matrix/mcap.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
```

##Round integers up to whole numbers for analysis

```{r, filename = 'r'}
countmatrix <- round(countmatrix, 0)
head(countmatrix)
#ggplot(countmatrix) + geom_histogram(aes(x=sex), stat="bin", bins= 200) + xlab("Raw expression counts") + ylab("Number of genes") + xlim(0, 25000) + ylim(0, 5000)
```

### Create Dataframe



```{r, filename = 'r'}
# make a dataframe of 4 embryos and 4 recruits
deseq2.colData <- data.frame(condition=factor(c(rep("embryos", 4), rep("recruits", 4))), 
                             type=factor(rep("single-read", 8)))

# set row names to match the column names in the count matrix
rownames(deseq2.colData) <- colnames(data)

# DESeqDataSet object created using the `DESeqDataSetFromMatrix` function
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)
```

### Negative Binomial: DESeq2 Results



```{r, filename='r'}
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
dim(deseq2.res)
summary(deseq2.res)
# deseq2.res will be used in step 6.. is there a way to save this and read it in, rather than coming back and re-running this code? 

```


```{r}
deseq2.sig.res <- (deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])

head(deseq2.sig.res)

# calculate the dimensions of the resulting subset. The dim() function returns a vector with two elements: the number of rows and the number of columns in the subset.
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

# Plotting

## PCA

```{r}
vsd <- vst(deseq2.dds, blind = FALSE)
pca <- plotPCA(vsd, intgroup = "condition")
pca

ggsave("../figs/pca.png", plot = pca, width = 8, height = 4, dpi = 600)
```

## Heatmap

### Install Packages

```{r}
if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap') 
```

### Load Libraries

```{r}
library(pheatmap)
```

### Top 50 Differentially Expressed Genes

```{r, fig.height=8., fig.width=8}

# Select top 20 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:20]

# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
heatmap_20 <- pheatmap(log_counts_top, scale = "row")

ggsave("../figs/heatmap_20.png", plot = heatmap_20, width = 8, height = 5, dpi = 600)

heatmap_20

```

## Fancy Volcano


```{r, fig.height=5, fig.width=8}

# The main plot
fancy_volcano <- plot(deseq2.res$baseMean, deseq2.res$log2FoldChange, 
                          pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
                          main="DEG Coral Early Life History  (pval <= 0.05)",
                          xlab="mean of normalized counts",
                          ylab="Log2 Fold Change") 
                  points(deseq2.sig.res$baseMean, deseq2.sig.res$log2FoldChange, pch=20, cex=0.45, col="red") 
                  abline(h=c(-1,1), col="blue")
                  
dev.print(png, file = "../figs/fancy-volcano.png", width = 6, height = 4, units = "in", res = 300)
```


## Basic Volcano

```{r, fig.width=5, fig.height=8}
# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)

# Create volcano plot
basic_volcano <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
                 geom_point(alpha = 0.6, size = 1.5) +
                 scale_color_manual(values = c("grey", "red")) +
                 labs(title = "Volcano Plot",
                        x = "Log2 Fold Change",
                        y = "-Log10 Adjusted P-value",
                        color = "Significantly\nDifferentially Expressed") +
                 theme_minimal() +
                 theme(panel.grid.major = element_blank(),
                       panel.grid.minor = element_blank(),
                       legend.position = "top")

# Save plot
ggsave(path = "../figs", filename="basic-volcano.png")

# Print plot
print(basic_volcano)
```

# Save the list of Differentially Expressed Genes (DEGs)!

Write the output to a table

```{r, engine='bash'}
mkdir -p ../output/deseq
```

```{r}
write.table(deseq2.res, "../output/deseq/DEGlist.tab", sep = '\t', row.names = T)
```

Let's look at this list

```{r}
deglist <- read.csv("../output/deseq/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns
head(deglist2)
```

```{r}
#rank-sum Wilcoxon test for differential expressed genes
#Run the rank-sum Wilcoxon test for each gene
pvalues <- sapply(1:nrow(count_norm),function(i){
     data<-cbind.data.frame(gene=as.numeric(t(count_norm[i,])),conditions)
     p=wilcox.test(gene~conditions, data)$p.value
     return(p)
   })
fdr=p.adjust(pvalues,method = "fdr")
#Calculate fold-change for each gene
conditionsLevel<-levels(conditions)
dataCon1=count_norm[,c(which(conditions==conditionsLevel[1]))]
dataCon2=count_norm[,c(which(conditions==conditionsLevel[2]))]
foldChanges=log2(rowMeans(dataCon2)/rowMeans(dataCon1))
###Output results base on FDR threshold
outRst<-data.frame(log2foldChange=foldChanges, pValues=pvalues, FDR=fdr)
rownames(outRst)=rownames(count_norm)
```




