Initialize working directory and libraries

setwd("C:/Users/Fei_lab/Documents/BTI2023/Reruns")
library(DESeq2)
library(tidyverse)
library(airway)
library(RColorBrewer)
library(pheatmap)
library(countToFPKM)
library(ComplexHeatmap)
library(ggpubr)
library(dplyr)
library(UpSetR)
library(ComplexUpset)
library(devtools)

Load raw read counts, sample info, and test id consistency

# Load raw counts
Pennellii <- read.table("Data/Pennellii_M82_RawReads.txt", header = TRUE, row.names = 1)

# Data frame
dim(Pennellii)
## [1] 36638    66
Pennellii[20:30,1:5]
##               Jelly_Br_rep1_M82 Jelly_Br_rep2_M82 Jelly_Br_rep3_M82
## Slym00g375810               304               294               349
## Slym00g375820                 1                 0                 0
## Slym00g375830                 0                 0                 0
## Slym00g375840                 0                 0                 0
## Slym00g375850                 0                 0                 0
## Slym00g375860                 1                 0                 0
## Slym00g375870                 0                 0                 0
## Slym00g375880                 0                 0                 0
## Slym00g375890                 0                 0                 0
## Slym00g375920               805               684               740
## Slym00g375930                 2                 0                 1
##               Jelly_MG_rep2_M82 Jelly_MG_rep3_M82
## Slym00g375810               273               330
## Slym00g375820                 0                 0
## Slym00g375830                 0                 0
## Slym00g375840                 0                 0
## Slym00g375850                 0                 0
## Slym00g375860                 0                 1
## Slym00g375870                 0                 0
## Slym00g375880                 0                 0
## Slym00g375890                 0                 0
## Slym00g375920               368               341
## Slym00g375930                 2                 0
#Load sample info
Pennellii.info <- read.table("Data/Pennellii_M82.Info.txt", header = TRUE, row.names = 1)

# Test of id consistency
all(colnames(Pennellii) %in% rownames(Pennellii.info))
## [1] TRUE
all(colnames(Pennellii) == rownames(Pennellii.info))
## [1] TRUE

Perform PCA

#start normalization
dds <- DESeqDataSetFromMatrix(countData = Pennellii,
                              colData = Pennellii.info,
                              design = ~ Group)

# Reads normalization and DEGs analysis
dds_norm <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
normalized_counts_mean <- counts(dds_norm, normalized=TRUE)
normalized_counts_mean <- data.frame(normalized_counts_mean)

# Calculate the mean value of three replicates
normalized_counts_mean_mean <- sapply(seq(1, ncol(normalized_counts_mean), 3), function(j) rowMeans(normalized_counts_mean[, j+(0:2)]))

# Check the data format
normalized_counts_mean_mean[20:30,1:6]
##                      [,1]        [,2]        [,3]         [,4]       [,5]
## Slym00g375810 337.8059926 395.4422984 436.1542374  441.7229056 465.275995
## Slym00g375820   0.3453461   0.0000000   0.3720359    0.0000000   0.000000
## Slym00g375830   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375840   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375850   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375860   0.3453461   0.8444576   0.0000000    0.6306598   5.117827
## Slym00g375870   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375880   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375890   0.0000000   0.0000000   0.0000000    0.0000000   0.000000
## Slym00g375920 792.9075440 474.2608905 986.4670803 1342.7377613 684.057651
## Slym00g375930   1.0625800   0.8404029   0.0000000    0.0000000   0.000000
##                       [,6]
## Slym00g375810  515.8421646
## Slym00g375820    0.0000000
## Slym00g375830    0.0000000
## Slym00g375840    0.0000000
## Slym00g375850    0.0000000
## Slym00g375860    0.4371832
## Slym00g375870    0.0000000
## Slym00g375880    0.0000000
## Slym00g375890    0.0000000
## Slym00g375920 1265.1698671
## Slym00g375930    0.2202460
# Add uniq sample names as column ids
id <- read.table("Data/Pennellii_M82.uniq.ids.txt", header = FALSE, colClasses="character")
colnames(normalized_counts_mean_mean) <- id

vsd <- vst(dds, blind = F)

AA <- data.frame(assay(vsd, blind=FALSE))
write.table(AA, "Data/M82_vsd.txt")

# Modified version of PCA to extract PC1, PC2, and PC3, using top25%
plotPCA <- function (object, intgroup=c("Tissue", "Stage", "Group1"), ntop = 9160, returnData=TRUE)
{
    rv <- rowVars(assay(object))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
        length(rv)))]
    pca <- prcomp(t(assay(object)[select, ]))
    percentVar <- pca$sdev^2/sum(pca$sdev^2)
    if (!all(intgroup %in% names(colData(object)))) {
        stop("the argument 'intgroup' should specify columns of colData(dds)")
    }
    intgroup.df <- as.data.frame(colData(object)[, intgroup,
        drop = FALSE])
    group <- if (length(intgroup) > 1) {
        factor(apply(intgroup.df, 1, paste, collapse = " : "))
    }
    else {
        colData(object)[[intgroup]]
    }
    d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group,
        intgroup.df, name = colnames(object))
    if (returnData) {
        attr(d, "percentVar") <- percentVar[1:4]
        return(d)
    }
    ggplot(data = d, aes_string(x = "PC3", y = "PC4", color = "group")) +
        geom_point(size = 3) + xlab(paste0("PC3: ", round(percentVar[1] *
        100), "% variance")) + ylab(paste0("PC4: ", round(percentVar[2] *
        100), "% variance")) + coord_fixed()
}

# Extract the PC1, PC2, PC3, and PC4 data
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Stage", "Group", "Parent"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
percentVar
## [1] 32 22 11  9
# PC1_PC2
#pdf("Pennellii_PC1_PC2_1A.pdf", height = 5, width = 5)
ggscatter(pcaData, x = "PC1", y="PC2",color = "Stage", shape = "Tissue") +
  scale_color_manual(values=c("darkgreen", "#FF9933", "blue", "red")) +
  scale_shape_manual(values=c(0,1,2)) +
  theme(legend.position="top") + xlab("PC1 (32.3%)") + ylab("PC2 (22.12%)")

#dev.off()

#pdf("Pennellii_PC1_PC2_2A.pdf", height = 5, width = 5)
ggscatter(pcaData, x = "PC1", y="PC2", shape = "Stage", color = "Stage") +
  scale_color_manual(values=c("darkgreen", "#FF9933", "blue", "red")) +
  scale_shape_manual(values=c(3,11,5,10)) +
  theme(legend.position="top") + xlab("PC1 (32.3%)") + ylab("PC2 (22.12%)")

#dev.off()

Create heatmap of sample correlations

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette(brewer.pal(15, "RdYlBu"))(255)
## Warning in brewer.pal(15, "RdYlBu"): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colors
pheatmap(sampleDistMatrix,color = colors,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,border_color = NA,
         fontsize = 7)

# pdf("Pennellii_Heatmap.pdf", height = 13, width = 15)
# dev.off()

FPKM Analysis

# Load all gene annotation features
gene <- read.table("Data/Pennellii_M82.gene.length.txt", header = TRUE, row.names = 1)

# Extract row ids
gene_ids <- row.names(Pennellii)

# Extract the corresponding genes
gene.info <- gene[row.names(gene) %in% gene_ids, ]

# Check whether the gene ids are consistent
all(rownames(Pennellii) == rownames(gene.info))
## [1] TRUE
# Prepare the data for FPKM calculation
featureLength <- gene$Length
meanFragmentLength <- Pennellii.info$FragmentLength
counts <- as.matrix(Pennellii)

# Return FPKM into a numeric matrix.
fpkm_matrix <- fpkm(counts, featureLength, meanFragmentLength)
write.csv(fpkm_matrix,"M82_Pennellii_FPKMEXAMPLE.csv")

Write normalized read counts to a results file

Jelly_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_Br_M82","Jelly_Br_Pennellii")))
Jelly_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_MG_M82","Jelly_MG_Pennellii")))
Jelly_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Jelly_Ripe_M82","Jelly_Ripe_Pennellii")))
Pericarp_21dpa_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_21dpa_M82","Pericarp_21dpa_Pennellii")))
Pericarp_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_Br_M82","Pericarp_Br_Pennellii")))
Pericarp_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_MG_M82","Pericarp_MG_Pennellii")))
Pericarp_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Pericarp_Ripe_M82","Pericarp_Ripe_Pennellii")))
Placenta_21dpa_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_21dpa_M82","Placenta_21dpa_Pennellii")))
Placenta_Br_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_Br_M82","Placenta_Br_Pennellii")))
Placenta_MG_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_MG_M82","Placenta_MG_Pennellii")))
Placenta_Ripe_F1A <- as.data.frame(results(dds_norm, contrast=c("Group","Placenta_Ripe_M82","Placenta_Ripe_Pennellii")))

Jelly_Br_F1A_out <- subset(Jelly_Br_F1A, select = c ("log2FoldChange","padj"))
Jelly_MG_F1A_out <- subset(Jelly_MG_F1A, select = c ("log2FoldChange","padj"))
Jelly_Ripe_F1A_out <- subset(Jelly_Ripe_F1A, select = c ("log2FoldChange","padj"))
Pericarp_21dpa_F1A_out <- subset(Pericarp_21dpa_F1A, select = c ("log2FoldChange","padj"))
Pericarp_Br_F1A_out <- subset(Pericarp_Br_F1A, select = c ("log2FoldChange","padj"))
Pericarp_MG_F1A_out <- subset(Pericarp_MG_F1A, select = c ("log2FoldChange","padj"))
Pericarp_Ripe_F1A_out <- subset(Pericarp_Ripe_F1A, select = c ("log2FoldChange","padj"))
Placenta_21dpa_F1A_out <- subset(Placenta_21dpa_F1A, select = c ("log2FoldChange","padj"))
Placenta_Br_F1A_out <- subset(Placenta_Br_F1A, select = c ("log2FoldChange","padj"))
Placenta_MG_F1A_out <- subset(Placenta_MG_F1A, select = c ("log2FoldChange","padj"))
Placenta_Ripe_F1A_out <- subset(Placenta_Ripe_F1A, select = c ("log2FoldChange","padj"))

out <-cbind(Jelly_MG_F1A_out, Jelly_Br_F1A_out, Jelly_Ripe_F1A_out, 
            Pericarp_21dpa_F1A_out, Pericarp_MG_F1A_out, Pericarp_Br_F1A_out, Pericarp_Ripe_F1A_out,
            Placenta_21dpa_F1A_out, Placenta_MG_F1A_out, Placenta_Br_F1A_out, Placenta_Ripe_F1A_out)

colnames(out) <- c("Jelly_MG_log2","Jelly_MG_padj",
                   "Jelly_Br_log2", "Jelly_Br_padj",
                   "Jelly_Ripe_log2","Jelly_Ripe_padj",
                   "Pericarp_21dpa_log2", "Pericarp_21dpa_padj",
                   "Pericarp_MG_log2", "Pericarp_MG_padj",
                   "Pericarp_Br_log2", "Pericarp_Br_padj",
                   "Pericarp_Ripe_log2", "Pericarp_Ripe_padj",
                   "Placenta_21dpa_log2", "Placenta_21dpa_padj",
                   "Placenta_MG_log2", "Placenta_MG_padj",
                   "Placenta_Br_log2", "Placenta_Br_padj",
                   "Placenta_Ripe_log2", "Placenta_Ripe_padj")

write.table(out,"Pennellii_M82_DESeq2EXAMPLE.txt", sep = "\t")

Create upset plot showing how differentially expressed genes are distributed across samples

# padjvalues.csv is a csv file of all genes along with their adjusted p values for expression in all samples
setwd("C:/Users/Fei_lab/Documents/BTI2023/Reruns")
padjvalues <- read.csv("padjvalues.csv", header = TRUE)
df <- data.frame(padjvalues)

df[, -1][df[, -1] == 0 | df[, -1] >= 0.05] <- 0
df[, -1][df[, -1] != 0] <- 1
write.csv(df, "binarydataEXAMPLE.csv", row.names = TRUE)
samples = colnames(df)[2:12]
view(samples)
upsetplot <- upset(df, samples, name = "DEGs", min_size = 60, min_degree = 1)
upsetplot

#ggsave("Upsetplot.pdf", plot = upsetplot, height = 10, width = 14)

Create a csv file with genes that are differentially expressed in all samples

expressedinall_df <- df[rowSums(df == 1) == 11, ]
write.csv(expressedinall_df, "expressedinallEXAMPLE.csv", row.names = TRUE)