DESeq2 and DEG analysis

Install or load packages

library(DESeq2)
library(pheatmap)
library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(corrplot)
library(PoiClaClu)
library(plotly)
library(stats)

Load envrioment

NOTE: This step will be executed only if you’ve saved working.RData file, otherwise should just skip

load("/Users/leon/Documents/Project/Sorghum/Sorghum_Range28.RData")

Imports FeatureCounts data

#Input list of genes from one dataset been modififed 
FC1 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Sorghum_combine_96-clean.csv", header = T)
info <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Sorghum_96-info.csv", header = T)

#Place Gene_IDs into appropriate format for DESeq2
row.names(FC1) <- FC1[,1]
#FC <- FC1[,3:102]
FC <- FC1[,3:98]

head(FC)
info$Treatment <- as.factor(info$Treatment)
rownames(info) <- info$Sample

info$Group <- paste0(info$Accession, "_", info$Treatment, "_", info$Time)
info
### Test of all rownames from info (Coldata) matched the colnames in FC (counts_data)
all(colnames(FC) %in% rownames(info))
## [1] TRUE
### Test of all rownames from info (Coldata) matched the order of colnames in FC (counts_data)
all(colnames(FC) == rownames(info))
## [1] TRUE

Experimental design and DEGs analysis

#start normalization
dds <- DESeqDataSetFromMatrix(countData = FC,
                              colData = info, 
                              design = ~ Group)

## Keep the lines with great than 1 count (at least one count per gene)
keep <- rownames(dds)[rowSums(counts(dds)) > 96]
dds <- dds[keep,]

### check numbers of genes remained for analysis
nrow(dds)
###Relevel the factor
dds$Treatment <- relevel(dds$Treatment, ref = "WW")
### Reads normalization and DEGs analysis
dds <- DESeq(dds)
### Export data for A accessions
resA1 <- as.data.frame(results(dds, contrast=c("Group","A_WL_TP1","A_WW_TP1")))
resA1$Sample = "A_TP1"
resA1$Name = rownames(resA1)
resA3 <- as.data.frame(results(dds, contrast=c("Group","A_WL_TP3","A_WW_TP3")))
resA3$Sample = "A_TP3"
resA3$Name = rownames(resA3)
resA5 <- as.data.frame(results(dds, contrast=c("Group","A_WL_TP5","A_WW_TP5")))
resA5$Sample = "A_TP5"
resA5$Name = rownames(resA5)
resA7 <- as.data.frame(results(dds, contrast=c("Group","A_WL_TP7","A_WW_TP7")))
resA7$Sample = "A_TP7"
resA7$Name = rownames(resA7)
write.csv(rbind(resA1, resA3, resA5, resA7), "/Users/Leon/Desktop/DEGs/DEGs_A.csv", row.names = F)

### Export data for B accessions
resB1 <- as.data.frame(results(dds, contrast=c("Group","B_WL_TP1","B_WW_TP1")))
resB1$Sample = "B_TP1"
resB1$Name = rownames(resB1)
resB3 <- as.data.frame(results(dds, contrast=c("Group","B_WL_TP3","B_WW_TP3")))
resB3$Sample = "B_TP3"
resB3$Name = rownames(resB3)
resB5 <- as.data.frame(results(dds, contrast=c("Group","B_WL_TP5","B_WW_TP5")))
resB5$Sample = "B_TP5"
resB5$Name = rownames(resB5)
resB7 <- as.data.frame(results(dds, contrast=c("Group","B_WL_TP7","B_WW_TP7")))
resB7$Sample = "B_TP7"
resB7$Name = rownames(resB7)
write.csv(rbind(resB1, resB3, resB5, resB7), "/Users/Leon/Desktop/DEGs/DEGs_B.csv",row.names = F)

### Export data for C accessions
resC1 <- as.data.frame(results(dds, contrast=c("Group","C_WL_TP1","C_WW_TP1")))
resC1$Sample = "C_TP1"
resC1$Name = rownames(resC1)
resC3 <- as.data.frame(results(dds, contrast=c("Group","C_WL_TP3","C_WW_TP3")))
resC3$Sample = "C_TP3"
resC3$Name = rownames(resC3)
resC5 <- as.data.frame(results(dds, contrast=c("Group","C_WL_TP5","C_WW_TP5")))
resC5$Sample = "C_TP5"
resC5$Name = rownames(resC5)
resC7 <- as.data.frame(results(dds, contrast=c("Group","C_WL_TP7","C_WW_TP7")))
resC7$Sample = "C_TP7"
resC7$Name = rownames(resC7)
write.csv(rbind(resC1, resC3, resC5, resC7), "/Users/Leon/Desktop/DEGs/DEGs_C.csv",row.names = F)

### Export data for D accessions
resD1 <- as.data.frame(results(dds, contrast=c("Group","D_WL_TP1","D_WW_TP1")))
resD1$Sample = "D_TP1"
resD1$Name = rownames(resD1)
resD3 <- as.data.frame(results(dds, contrast=c("Group","D_WL_TP3","D_WW_TP3")))
resD3$Sample = "D_TP3"
resD3$Name = rownames(resD3)
resD5 <- as.data.frame(results(dds, contrast=c("Group","D_WL_TP5","D_WW_TP5")))
resD5$Sample = "D_TP5"
resD5$Name = rownames(resD5)
resD7 <- as.data.frame(results(dds, contrast=c("Group","D_WL_TP7","D_WW_TP7")))
resD7$Sample = "D_TP7"
resD7$Name = rownames(resD7)
write.csv(rbind(resD1, resD3, resD5, resD7), "/Users/Leon/Desktop/DEGs/DEGs_D.csv",row.names = F)

### Export data for E accessions
resE1 <- as.data.frame(results(dds, contrast=c("Group","E_WL_TP1","E_WW_TP1")))
resE1$Sample = "E_TP1"
resE1$Name = rownames(resE1)
resE3 <- as.data.frame(results(dds, contrast=c("Group","E_WL_TP3","E_WW_TP3")))
resE3$Sample = "E_TP3"
resE3$Name = rownames(resE3)
resE5 <- as.data.frame(results(dds, contrast=c("Group","E_WL_TP5","E_WW_TP5")))
resE5$Sample = "E_TP5"
resE5$Name = rownames(resE5)
resE7 <- as.data.frame(results(dds, contrast=c("Group","E_WL_TP7","E_WW_TP7")))
resE7$Sample = "E_TP7"
resE7$Name = rownames(resE7)
write.csv(rbind(resE1, resE3, resE5, resE7), "/Users/Leon/Desktop/DEGs/DEGs_E.csv",row.names = F)

### Export data for F accessions
resF1 <- as.data.frame(results(dds, contrast=c("Group","F_WL_TP1","F_WW_TP1")))
resF1$Sample = "F_TP1"
resF1$Name = rownames(resF1)
resF3 <- as.data.frame(results(dds, contrast=c("Group","F_WL_TP3","F_WW_TP3")))
resF3$Sample = "F_TP3"
resF3$Name = rownames(resF3)
resF5 <- as.data.frame(results(dds, contrast=c("Group","F_WL_TP5","F_WW_TP5")))
resF5$Sample = "F_TP5"
resF5$Name = rownames(resF5)
resF7 <- as.data.frame(results(dds, contrast=c("Group","F_WL_TP7","F_WW_TP7")))
resF7$Sample = "F_TP7"
resF7$Name = rownames(resF7)
write.csv(rbind(resF1, resF3, resF5, resF7), "/Users/Leon/Desktop/DEGs/DEGs_F.csv",row.names = F)
head(rbind(resF1, resF3, resF5, resF7), 20)

Export data for inter-accession comparison (Using the C accession as reference accession for both conditions)

For example, for a simple two-group comparison, this would return the log2 fold changes of the second group over the first group (the reference level). Please see examples below and in the vignette.

Count Normalization

#Export the count information
normalized_counts <- counts(dds, normalized=TRUE)
normalized_counts <- data.frame(normalized_counts)

normalized_counts$Name <- rownames(normalized_counts)
normalized_counts <- left_join(normalized_counts, FC1[,c(1,2)], by = "Name")
NFC  <- normalized_counts[,c(97,98,1:96)]
#NFC  <- normalized_counts[,c(101,102,1:100)]

### 30248 genes retained for analysis
write.table(NFC, file="/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Sorghum_NFC_96-PTM.csv", sep="\t", quote=F, row.names = F)

head(NFC, 20)

PCA based on vst transfomration

### Use vsd transformation
vsd <- vst(dds, blind = F)

### Modified version of PCA to extract PC1, PC2, and PC3
plotPCA <- function (object, intgroup=c("Accession", "Treatment"), ntop = 2346, returnData=TRUE) 
{
    rv <- rowVars(assay(object))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, 
        length(rv)))]
    pca <- prcomp(t(assay(object)[select, ]))
    percentVar <- pca$sdev^2/sum(pca$sdev^2)
    if (!all(intgroup %in% names(colData(object)))) {
        stop("the argument 'intgroup' should specify columns of colData(dds)")
    }
    intgroup.df <- as.data.frame(colData(object)[, intgroup, 
        drop = FALSE])
    group <- if (length(intgroup) > 1) {
        factor(apply(intgroup.df, 1, paste, collapse = " : "))
    }
    else {
        colData(object)[[intgroup]]
    }
    d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group, 
        intgroup.df, name = colnames(object))
    if (returnData) {
        attr(d, "percentVar") <- percentVar[1:4]
        return(d)
    }
    ggplot(data = d, aes_string(x = "PC3", y = "PC4", color = "group")) + 
        geom_point(size = 3) + xlab(paste0("PC3: ", round(percentVar[1] * 
        100), "% variance")) + ylab(paste0("PC4: ", round(percentVar[2] * 
        100), "% variance")) + coord_fixed()
}

### extract the PC1, PC2, PC3, and PC4 data
pcaData <- plotPCA(vsd, intgroup=c("Accession", "Time"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
percentVar
## [1] 19 14 12 11
### Export PCA values for each component
write.csv(pcaData, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/PCA/vsd_PCA_top10SD_100.csv")

pcaData
#Subset different Accessions for independent WGCNA 
vsd_sub1 <- vsd[ , vsd$Accession == "A" ]
plotPCA(vsd_sub1, intgroup=c("Time"))
vsd_sub2 <- vsd[ , vsd$Accession == "B" ]
plotPCA(vsd_sub2, intgroup=c("Time"))
vsd_sub3 <- vsd[ , vsd$Accession == "C" ]
plotPCA(vsd_sub3, intgroup=c("Time"))
vsd_sub4 <- vsd[ , vsd$Accession == "D" ]
plotPCA(vsd_sub4, intgroup=c("Time"))
vsd_sub5 <- vsd[ , vsd$Accession == "E" ]
plotPCA(vsd_sub5, intgroup=c("Time"))
vsd_sub6 <- vsd[ , vsd$Accession == "F" ]
plotPCA(vsd_sub6, intgroup=c("Time"))

Distance Matrix clustering under WL and WW condition

vsd_WL <- vsd[, vsd$Treatment == "WL"]

sampleDists <- dist(t(assay(vsd_WL)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd_WL$Accession, vsd_WL$Time, sep="_")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(brewer.pal(15, "RdYlBu"))(255)
pheatmap(sampleDistMatrix,color = colors,  
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,border_color = NA)

vsd_WW <- vsd[, vsd$Treatment == "WW"]
sampleDists <- dist(t(assay(vsd_WW)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd_WW$Accession, vsd_WW$Time, sep="_")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(brewer.pal(15, "RdYlBu"))(255)
pheatmap(sampleDistMatrix,color = colors,  
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         border_color = NA)

Transform into TPM

##Calculate the TPM based on definition 
kb <- NFC$Length / 1000
countdata <- NFC[,3:98]
rpk <- countdata / kb
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
TPM <- data.frame("Name" = NFC$Name, tpm)

write.csv(TPM, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Sorghum_20220629_clean_TPM_update96.csv", row.names = F, quote=FALSE)

head(TPM, 20)

Pearson correaltion for each two samples based on normalized featureCounts/TPM

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:Biobase':
## 
##     contents
## The following objects are masked from 'package:base':
## 
##     format.pval, units
vsd_counts <- data.frame(assay(vsd, normalized=TRUE))
nrow(vsd_counts)
## [1] 22314
#### Plot correlation
P_sample <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_Salmon/Exp_correlation.list.csv", header = T)
P_sample 

Load the function for calculation

### Calcualte the P-value effects, score of all correlations
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

Plot the correlation plot using VSD

# Pearson correlation based on Vsd
for (i in 1:nrow(P_sample)){
  Correlation.df <- vsd_counts[,grepl(P_sample[i,1],colnames(vsd_counts))]
  Correlation.plot <- corrplot.mixed(cor(Correlation.df), order = 'AOE')
                      
  assign(P_sample[i,1], Correlation.df)
  assign(paste0(P_sample[i,1],"_Plot"), Correlation.plot)
}

Plot the correlation plot using TPM

# Pearson correlation based on TPM
for (i in 1:nrow(P_sample)){
  Correlation.df <- TPM [,grepl(P_sample[i,1],colnames(TPM ))]
  Correlation.plot <- corrplot.mixed(cor(Correlation.df), order = 'AOE')
                      
  assign(P_sample[i,1], Correlation.df)
  assign(paste0(P_sample[i,1],"_Plot"), Correlation.plot)
}

Generate the report table

df1 <- data.frame(matrix(ncol = 4 , nrow = 48))
colnames(df1) <- c("row", "column", "R", "P-value")

for (i in 1:(nrow(df1))){
  Correlation.df <- vsd_counts[,grepl(P_sample[i,1],colnames(vsd_counts))]
  res <- rcorr(Correlation.df[,1], Correlation.df[,2, ],  type=c("pearson"))
  df1[i,1:4] <- flattenCorrMatrix(res$r, res$P)
}

write.csv(df1, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Pearson-correlation_vsd-20220311.csv", row.names = F)

Correlation based on Normalized featureCounts

for (i in 1:nrow(P_sample)){
  Correlation.df <- NFC[,grepl(P_sample$Sample[i],colnames(NFC))]
  Correlation.plot <- corrplot.mixed(cor(Correlation.df), order = 'AOE')
  
  assign(P_sample$Sample[i], Correlation.df)
  assign(paste0(P_sample$Sample[i],"_Plot"), Correlation.plot)
}