Initialize working directory and libraries
setwd("C:/Users/Fei_lab/Documents/BTI2023/Reruns")
library(DESeq2)
library(tidyverse)
library(airway)
library(RColorBrewer)
library(pheatmap)
library(countToFPKM)
library(ComplexHeatmap)
library(ggpubr)
library(dplyr)
library(UpSetR)
library(ComplexUpset)
library(devtools)
Load raw read counts, sample info, and test id consistency
# Load raw counts
Pennellii <- read.table("Data/Pennellii_M82_RawReads.txt", header = TRUE, row.names = 1)
# Data frame
dim(Pennellii)
## [1] 36638 66
Pennellii[20:30,1:5]
## Jelly_Br_rep1_M82 Jelly_Br_rep2_M82 Jelly_Br_rep3_M82
## Slym00g375810 304 294 349
## Slym00g375820 1 0 0
## Slym00g375830 0 0 0
## Slym00g375840 0 0 0
## Slym00g375850 0 0 0
## Slym00g375860 1 0 0
## Slym00g375870 0 0 0
## Slym00g375880 0 0 0
## Slym00g375890 0 0 0
## Slym00g375920 805 684 740
## Slym00g375930 2 0 1
## Jelly_MG_rep2_M82 Jelly_MG_rep3_M82
## Slym00g375810 273 330
## Slym00g375820 0 0
## Slym00g375830 0 0
## Slym00g375840 0 0
## Slym00g375850 0 0
## Slym00g375860 0 1
## Slym00g375870 0 0
## Slym00g375880 0 0
## Slym00g375890 0 0
## Slym00g375920 368 341
## Slym00g375930 2 0
#Load sample info
Pennellii.info <- read.table("Data/Pennellii_M82.Info.txt", header = TRUE, row.names = 1)
# Test of id consistency
all(colnames(Pennellii) %in% rownames(Pennellii.info))
## [1] TRUE
all(colnames(Pennellii) == rownames(Pennellii.info))
## [1] TRUE
Perform PCA
#start normalization
dds <- DESeqDataSetFromMatrix(countData = Pennellii,
colData = Pennellii.info,
design = ~ Group)
# Reads normalization and DEGs analysis
dds_norm <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
normalized_counts_mean <- counts(dds_norm, normalized=TRUE)
normalized_counts_mean <- data.frame(normalized_counts_mean)
# Calculate the mean value of three replicates
normalized_counts_mean_mean <- sapply(seq(1, ncol(normalized_counts_mean), 3), function(j) rowMeans(normalized_counts_mean[, j+(0:2)]))
# Check the data format
normalized_counts_mean_mean[20:30,1:6]
## [,1] [,2] [,3] [,4] [,5]
## Slym00g375810 337.8059926 395.4422984 436.1542374 441.7229056 465.275995
## Slym00g375820 0.3453461 0.0000000 0.3720359 0.0000000 0.000000
## Slym00g375830 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375840 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375850 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375860 0.3453461 0.8444576 0.0000000 0.6306598 5.117827
## Slym00g375870 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375880 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375890 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## Slym00g375920 792.9075440 474.2608905 986.4670803 1342.7377613 684.057651
## Slym00g375930 1.0625800 0.8404029 0.0000000 0.0000000 0.000000
## [,6]
## Slym00g375810 515.8421646
## Slym00g375820 0.0000000
## Slym00g375830 0.0000000
## Slym00g375840 0.0000000
## Slym00g375850 0.0000000
## Slym00g375860 0.4371832
## Slym00g375870 0.0000000
## Slym00g375880 0.0000000
## Slym00g375890 0.0000000
## Slym00g375920 1265.1698671
## Slym00g375930 0.2202460
# Add uniq sample names as column ids
id <- read.table("Data/Pennellii_M82.uniq.ids.txt", header = FALSE, colClasses="character")
colnames(normalized_counts_mean_mean) <- id
vsd <- vst(dds, blind = F)
AA <- data.frame(assay(vsd, blind=FALSE))
write.table(AA, "Data/M82_vsd.txt")
# Modified version of PCA to extract PC1, PC2, and PC3, using top25%
plotPCA <- function (object, intgroup=c("Tissue", "Stage", "Group1"), ntop = 9160, returnData=TRUE)
{
rv <- rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup,
drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
else {
colData(object)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group,
intgroup.df, name = colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:4]
return(d)
}
ggplot(data = d, aes_string(x = "PC3", y = "PC4", color = "group")) +
geom_point(size = 3) + xlab(paste0("PC3: ", round(percentVar[1] *
100), "% variance")) + ylab(paste0("PC4: ", round(percentVar[2] *
100), "% variance")) + coord_fixed()
}
# Extract the PC1, PC2, PC3, and PC4 data
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Stage", "Group", "Parent"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
percentVar
## [1] 32 22 11 9
# PC1_PC2
#pdf("Pennellii_PC1_PC2_1A.pdf", height = 5, width = 5)
ggscatter(pcaData, x = "PC1", y="PC2",color = "Stage", shape = "Tissue") +
scale_color_manual(values=c("darkgreen", "#FF9933", "blue", "red")) +
scale_shape_manual(values=c(0,1,2)) +
theme(legend.position="top") + xlab("PC1 (32.3%)") + ylab("PC2 (22.12%)")

#dev.off()
#pdf("Pennellii_PC1_PC2_2A.pdf", height = 5, width = 5)
ggscatter(pcaData, x = "PC1", y="PC2", shape = "Stage", color = "Stage") +
scale_color_manual(values=c("darkgreen", "#FF9933", "blue", "red")) +
scale_shape_manual(values=c(3,11,5,10)) +
theme(legend.position="top") + xlab("PC1 (32.3%)") + ylab("PC2 (22.12%)")

#dev.off()
Create heatmap of sample correlations
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette(brewer.pal(15, "RdYlBu"))(255)
## Warning in brewer.pal(15, "RdYlBu"): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colors
pheatmap(sampleDistMatrix,color = colors,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,border_color = NA,
fontsize = 7)

# pdf("Pennellii_Heatmap.pdf", height = 13, width = 15)
# dev.off()
FPKM Analysis
# Load all gene annotation features
gene <- read.table("Data/Pennellii_M82.gene.length.txt", header = TRUE, row.names = 1)
# Extract row ids
gene_ids <- row.names(Pennellii)
# Extract the corresponding genes
gene.info <- gene[row.names(gene) %in% gene_ids, ]
# Check whether the gene ids are consistent
all(rownames(Pennellii) == rownames(gene.info))
## [1] TRUE
# Prepare the data for FPKM calculation
featureLength <- gene$Length
meanFragmentLength <- Pennellii.info$FragmentLength
counts <- as.matrix(Pennellii)
# Return FPKM into a numeric matrix.
fpkm_matrix <- fpkm(counts, featureLength, meanFragmentLength)
write.csv(fpkm_matrix,"M82_Pennellii_FPKMEXAMPLE.csv")
Write normalized read counts to a results file
Jelly_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_Br_M82","Jelly_Br_Pennellii")))
Jelly_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_MG_M82","Jelly_MG_Pennellii")))
Jelly_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_Ripe_M82","Jelly_Ripe_Pennellii")))
Pericarp_21dpa_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_21dpa_M82","Pericarp_21dpa_Pennellii")))
Pericarp_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_Br_M82","Pericarp_Br_Pennellii")))
Pericarp_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_MG_M82","Pericarp_MG_Pennellii")))
Pericarp_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_Ripe_M82","Pericarp_Ripe_Pennellii")))
Placenta_21dpa_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_21dpa_M82","Placenta_21dpa_Pennellii")))
Placenta_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_Br_M82","Placenta_Br_Pennellii")))
Placenta_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_MG_M82","Placenta_MG_Pennellii")))
Placenta_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_Ripe_M82","Placenta_Ripe_Pennellii")))
Jelly_Br_F1A_out <- subset(Jelly_Br_F1A, select = c ("log2FoldChange","padj"))
Jelly_MG_F1A_out <- subset(Jelly_MG_F1A, select = c ("log2FoldChange","padj"))
Jelly_Ripe_F1A_out <- subset(Jelly_Ripe_F1A, select = c ("log2FoldChange","padj"))
Pericarp_21dpa_F1A_out <- subset(Pericarp_21dpa_F1A, select = c ("log2FoldChange","padj"))
Pericarp_Br_F1A_out <- subset(Pericarp_Br_F1A, select = c ("log2FoldChange","padj"))
Pericarp_MG_F1A_out <- subset(Pericarp_MG_F1A, select = c ("log2FoldChange","padj"))
Pericarp_Ripe_F1A_out <- subset(Pericarp_Ripe_F1A, select = c ("log2FoldChange","padj"))
Placenta_21dpa_F1A_out <- subset(Placenta_21dpa_F1A, select = c ("log2FoldChange","padj"))
Placenta_Br_F1A_out <- subset(Placenta_Br_F1A, select = c ("log2FoldChange","padj"))
Placenta_MG_F1A_out <- subset(Placenta_MG_F1A, select = c ("log2FoldChange","padj"))
Placenta_Ripe_F1A_out <- subset(Placenta_Ripe_F1A, select = c ("log2FoldChange","padj"))
out <-cbind(Jelly_MG_F1A_out, Jelly_Br_F1A_out, Jelly_Ripe_F1A_out,
Pericarp_21dpa_F1A_out, Pericarp_MG_F1A_out, Pericarp_Br_F1A_out, Pericarp_Ripe_F1A_out,
Placenta_21dpa_F1A_out, Placenta_MG_F1A_out, Placenta_Br_F1A_out, Placenta_Ripe_F1A_out)
colnames(out) <- c("Jelly_MG_log2","Jelly_MG_padj",
"Jelly_Br_log2", "Jelly_Br_padj",
"Jelly_Ripe_log2","Jelly_Ripe_padj",
"Pericarp_21dpa_log2", "Pericarp_21dpa_padj",
"Pericarp_MG_log2", "Pericarp_MG_padj",
"Pericarp_Br_log2", "Pericarp_Br_padj",
"Pericarp_Ripe_log2", "Pericarp_Ripe_padj",
"Placenta_21dpa_log2", "Placenta_21dpa_padj",
"Placenta_MG_log2", "Placenta_MG_padj",
"Placenta_Br_log2", "Placenta_Br_padj",
"Placenta_Ripe_log2", "Placenta_Ripe_padj")
write.table(out,"Pennellii_M82_DESeq2EXAMPLE.txt", sep = "\t")
Create upset plot showing how differentially expressed genes are
distributed across samples
# padjvalues.csv is a csv file of all genes along with their adjusted p values for expression in all samples
setwd("C:/Users/Fei_lab/Documents/BTI2023/Reruns")
padjvalues <- read.csv("padjvalues.csv", header = TRUE)
df <- data.frame(padjvalues)
df[, -1][df[, -1] == 0 | df[, -1] >= 0.05] <- 0
df[, -1][df[, -1] != 0] <- 1
write.csv(df, "binarydataEXAMPLE.csv", row.names = TRUE)
samples = colnames(df)[2:12]
view(samples)
upsetplot <- upset(df, samples, name = "DEGs", min_size = 60, min_degree = 1)
upsetplot

#ggsave("Upsetplot.pdf", plot = upsetplot, height = 10, width = 14)
Create a csv file with genes that are differentially expressed in
all samples
expressedinall_df <- df[rowSums(df == 1) == 11, ]
write.csv(expressedinall_df, "expressedinallEXAMPLE.csv", row.names = TRUE)