Here we explore the use of DeSeq2 to extract differential expressed genes in epithelial cells treated under four conditions: Media only control, Polymer alone, TNF-α and TGF-β1 (TNT) alone, TNT + Polymer. The purpose of this pipeline is to replicate the results published by Parikh et al., 2022 which showed that upregulation of genes that are important for the NRF2 signalling pathway is associated with the presence of the polymer and not necessarily TNT.
library(GEOquery)
library(DESeq2)
library(dplyr)
library(ComplexHeatmap)
library(tidyverse)
library(vsn)
library(data.table)
library(ggplot2)
library(RColorBrewer)
library(pheatmap)
library(ggrepel)
library(AnnotationDbi)
library(org.Hs.eg.db)
# Set the working directory
setwd("/Users/seymour/Desktop/Bioinformatics/Advanced_Computational_Biology_BIOL0050/Week3/Gene_Exp_analysis_3/")
gse <- getGEO(filename = "GSE176513_series_matrix.txt.gz", GSEMatrix = TRUE, getGPL = FALSE)
# Create a data frame with samples and conditions
sample_info <- pData(gse)
condition_df <- data.frame(Sample_ID = sample_info$description, Condition_name = sample_info$title) %>%
mutate(Sample_ID = gsub("gene_count_matrix.csv --", "", Sample_ID)) %>%
mutate(Condition = substr(Sample_ID, 1, 1)) %>%
mutate(Duration = substr(Sample_ID, 4, nchar(Sample_ID))) %>%
mutate(cond_duration = paste(Condition, Duration, sep = "_")) %>%
mutate(Condition_name = substr(Condition_name, 1, nchar(Condition_name) - 4)) %>%
mutate(Condition_name_short = gsub(" \\(8h\\)| \\(24h\\)", "", Condition_name))
#Separate the 8h and 24h samples
condition_df_8h <- condition_df[condition_df$Duration == "8h", ]
condition_df_24h <- condition_df[condition_df$Duration == "24h", ]
#Read in the count data
count_data <- read.csv(gzfile("GSE176513_gene_count_matrix.csv.gz"))
col_order<- condition_df$Sample_ID
rownames(count_data) <- count_data$EnsemblID
aligned_count_data <- count_data[, col_order]
all_dds <- DESeqDataSetFromMatrix(countData = aligned_count_data, colData = condition_df, design = ~ Condition)
all_dds <- all_dds[ rowSums(counts(all_dds)) > 1, ]
all_dds <- estimateSizeFactors(all_dds)
vsd <- vst(all_dds, blind = FALSE)
sampleDists_vsd <- dist(t(assay(vsd)))
sampleDistMatrix_vsd <- as.matrix( sampleDists_vsd )
rownames(sampleDistMatrix_vsd) <- paste(vsd$Condition_name)
colnames(sampleDistMatrix_vsd) <- paste(vsd$Condition_name)
Distance matix of gene expression
profiles from RNA sequencing of all four groups (Media only, 1 wt%
poly(CEP), TNT and 1 wt% poly(CEP) + TNT at two time points (8 and 24
h), each done in triplicates. 24 h), each done in triplicates. The
intensity of each box corresponds to the distance value shown in the
legend.
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
heatmap <- pheatmap(sampleDistMatrix_vsd,
clustering_distance_rows = sampleDists_vsd,
clustering_distance_cols = sampleDists_vsd,
col = colors,cellwidth=8,
cellheight=8,treeheight_row = 8, treeheight_col = 8, fontsize_row = 7,fontsize_col = 7)
pca_data_condition<-plotPCA(vsd, intgroup="Condition_name_short")
pca_data_condition
pca_data_duration<-plotPCA(vsd, intgroup="Duration")
pca_data_duration
library(ggfortify)
library(patchwork)
pca_data_duration <- plotPCA(vsd, intgroup = "Duration")
pca_data_condition <- plotPCA(vsd, intgroup = "Condition_name_short")
#match(pca_data_duration$data$name,pca_data_condition$data$name)
pca_data_condition$data$Duration<-pca_data_duration$data$Duration
pcaData<-pca_data_condition$data
ggplot(pcaData, aes(x= PC1, y = PC2))+geom_point(size= 3, aes(shape=Duration, fill=Condition_name_short))+scale_shape_manual(values=c(21, 25))+ scale_fill_manual(values = c("#E69F00", "#56B4E9","Red","Green"),guide = guide_legend(override.aes = list(shape = 22)))+scale_color_discrete()+labs(x = paste0("PC", pca_data_condition$x[, 1]), y = paste0("PC", pca_data_condition$x[, 2])) +ggtitle("PCA Plot - Condition") +theme_minimal()
#Create a function to investigate number of significant genes that are differentially expressed between conditions
create_dds <- function(condition_xh) {
col_order <- condition_xh$Sample_ID
aligned_count_data <- count_data[, col_order]
dds <- DESeqDataSetFromMatrix(countData = aligned_count_data, colData = condition_xh, design = ~ Condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- estimateSizeFactors(dds)
dds <- DESeq(dds)
res<-results(dds, contrast=c("Condition","A","B"))
sigs<-na.omit(res)
sigs<-sigs[sigs$padj<0.05,]
cat("Number of genes sig in poly(CEP) alone compared to media only:", nrow(sigs), "\n")
#print(nrow(sigs))
res1<-results(dds, contrast=c("Condition","C","D"))
sigs1<-na.omit(res1)
sigs1<-sigs1[sigs1$padj<0.05,]
cat("Number of genes sig in poly(CEP)+TNT compared to TNT:", nrow(sigs1), "\n")
return(dds)
}
dds_8h <- create_dds(condition_df_8h)
## Number of genes sig in poly(CEP) alone compared to media only: 3472
## Number of genes sig in poly(CEP)+TNT compared to TNT: 3790
dds_24h <- create_dds(condition_df_24h)
## Number of genes sig in poly(CEP) alone compared to media only: 6646
## Number of genes sig in poly(CEP)+TNT compared to TNT: 5162
res_8h1<-results(dds_8h,contrast=c("Condition","B","A"))
res_8h2<-results(dds_8h,contrast=c("Condition","C","A"))
res_8h3<-results(dds_8h,contrast=c("Condition","D","A"))
create_sigs_df <- function(res_xh) {
sigs_xh <- na.omit(res_xh)
sigs_xh <- sigs_xh[sigs_xh$padj < 0.05,]
df_sigs_xh <- as.data.frame(sigs_xh)
return(df_sigs_xh)
}
sigs_8h1 <- create_sigs_df(res_8h1)
sigs_8h2 <- create_sigs_df(res_8h2)
sigs_8h3 <- create_sigs_df(res_8h3)
mat_8h1<-counts(dds_8h,normalized=T)[rownames(sigs_8h1),]
common_row_names1 <- intersect(rownames(mat_8h1), rownames(sigs_8h2))
mat_8h2<-mat_8h1[common_row_names1,]
common_row_names2 <- intersect(rownames(mat_8h2), rownames(sigs_8h3))
mat_8h3<-mat_8h2[common_row_names2,]
z_mat_8h3<-t(apply(mat_8h3,1,scale))
colnames(z_mat_8h3) <- condition_df_8h$Condition_name_short
pheatmap(z_mat_8h3,clustering_distance_rows = "euclidean",clustering_distance_cols = "euclidean",clustering_method = "ward.D2",treeheight_row = 7,treeheight_col = 20,show_rownames = F, cellwidth=20, cellheight=0.2,fontsize=5)
# Your existing lists
Media_data <- rownames(sigs_8h1)
Polymer_data <- rownames(sigs_8h1)
print(length(Media_data))
## [1] 3472
TNT_data <- rownames(sigs_8h2)
Polymer_and_TNT_data <- rownames(sigs_8h3)
# Update Media with new rownames from sigs_8h2 and sigs_8h3
Media_data <- union(Media_data, TNT_data)
print(length(Media_data))
## [1] 7543
Media_data <- union(Media_data, Polymer_and_TNT_data)
print(length(Media_data))
## [1] 9358
venn_df_Media <- data.frame(Media=Media_data)
venn_df_Polymer <- data.frame(Polymer=Polymer_data)
venn_df_Media$RowName <- venn_df_Media$Media
venn_df_Polymer$RowName <-venn_df_Polymer$Polymer
dat1 <- merge(venn_df_Media , venn_df_Polymer, by="RowName", all = T)
venn_df_TNT <- data.frame(TNT=TNT_data)
venn_df_Polymer_and_TNT <- data.frame(Polymer_and_TNT=Polymer_and_TNT_data)
venn_df_TNT$RowName <- venn_df_TNT$TNT
venn_df_Polymer_and_TNT$RowName <-venn_df_Polymer_and_TNT$Polymer_and_TNT
dat2 <- merge(venn_df_TNT, venn_df_Polymer_and_TNT, by="RowName", all = T)
dat3 <- merge(dat1, dat2, by="RowName", all = T)
library(VennDiagram)
#venn.diagram(dat3, filename = "venn-4-dimensions.png")
#Error in `[[<-.data.frame`(`*tmp*`, i, value = c(NA, "ENSG00000000460", :
#replacement has 3473 rows, data has 9358
write.csv(dat3, file = "dat3.csv", row.names = FALSE)
prepro_res <- function(res_xh) {
sigs_xh <- res_xh
sigs_xh <- sigs_xh[sigs_xh$padj < 0.05 | is.na(sigs_xh$padj), ]
df_sigs_xh <- as.data.frame(sigs_xh)
return(df_sigs_xh)
}
volc_8h1 <- prepro_res(res_8h1)
volc_8h2 <- prepro_res(res_8h2)
volc_8h3 <- prepro_res(res_8h3)
library(fgsea)
library(tibble)
pathways.hallmark <- gmtPathways("h.all.v7.2.symbols.gmt")
plot_pathways<-function(res_xhx,plot_title){
ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
key=rownames(res_xhx),
columns="SYMBOL",
keytype="ENSEMBL")
ens2symbol <- as_tibble(ens2symbol)
sigs_xhx_results <- res_xhx[order(res_xhx$pvalue),]
sigs_xhx_results$ENSEMBL <- rownames(sigs_xhx_results)
sigs_xhx_results <- inner_join(sigs_xhx_results, ens2symbol, by=c("ENSEMBL"="ENSEMBL"))
sigs_xhx_results <- sigs_xhx_results %>%
mutate(updown = ifelse(log2FoldChange > 0, "upregulated", "downregulated"))
res2 <- sigs_xhx_results %>%
dplyr::select(SYMBOL, stat,updown) %>%
na.omit() %>%
distinct() %>%
group_by(SYMBOL) %>%
summarize(stat=mean(stat), updown = first(updown))
ranks <- deframe(res2)
fgseaRes <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
gene.in.pathway <- pathways.hallmark %>%
enframe("pathway", "SYMBOL") %>%
unnest(cols = c(SYMBOL)) %>%
inner_join(res2, by="SYMBOL")
gene.in.pathway <- gene.in.pathway %>%
inner_join(res2, by = "SYMBOL")
fgseaResTidy <- fgseaResTidy %>%
inner_join(gene.in.pathway, by = "pathway")
size_breaks <- c(10, 50, 100, 200,300, Inf)
fgseaResTidy$gene_count_group <- cut(fgseaResTidy$size, breaks = size_breaks,
labels = c("10", "50", "100", "200","300"))
colors <- c("downregulated" = "blue", "upregulated" = "brown")
fgseaResTidy$circle_color <- colors[fgseaResTidy$updown.x]
ggplot(fgseaResTidy, aes(-log10(padj), reorder(pathway, NES))) +
geom_point(aes(color = circle_color, size = gene_count_group)) +
scale_color_manual(values = c("blue", "brown")) +
scale_size_manual(values = c("10" = 1, "50" = 1.6, "100" = 2.4, "200" = 5, "300" = 6)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
axis.text.y = element_text(size = 6),
legend.text = element_text(size = 12))+
labs(x = "-log10(pval)", y = "Pathway",
title = plot_title) +
guides(size = guide_legend(title = "Gene Count Group"),
color = guide_legend(title = "Up/Down Regulation"))
}
plot_pathways(volc_8h1,"Polymer only, 8hr")
plot_pathways(volc_8h2,"TNT alone, 8hr")
plot_pathways(volc_8h3,"TNT+ploymer, 8hr")
res_xhx=volc_8h3
ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
key=rownames(res_xhx),
columns="SYMBOL",
keytype="ENSEMBL")
sigs_xhx_results <- res_xhx[order(res_xhx$pvalue),]
sigs_xhx_results$ENSEMBL <- rownames(sigs_xhx_results)
sigs_xhx_results <- inner_join(sigs_xhx_results, ens2symbol, by=c("ENSEMBL"="ENSEMBL"))
res2 <- sigs_xhx_results %>%
dplyr::select(SYMBOL, stat) %>%
na.omit() %>%
distinct() %>%
group_by(SYMBOL) %>%
summarize(stat=mean(stat))
ranks <- deframe(res2)
fgseaRes <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
gene.in.pathway <- pathways.hallmark %>%
enframe("pathway", "SYMBOL") %>%
unnest(cols = c(SYMBOL)) %>%
inner_join(res2, by="SYMBOL")
gene.in.pathway <- gene.in.pathway %>%
inner_join(res2, by = "SYMBOL")
fgseaResTidy <- fgseaResTidy %>%
inner_join(gene.in.pathway, by = "pathway")
most_significant_pathway <- fgseaResTidy[1, ]
most_significant_pathway_name <- most_significant_pathway$pathway
print(most_significant_pathway_name)
## [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
genes_most_significant <- gene.in.pathway %>%
filter(pathway == most_significant_pathway_name) %>%
dplyr::select(SYMBOL)
gene_symbols_most_significant <- genes_most_significant$SYMBOL
print(gene_symbols_most_significant)
## [1] "JUNB" "NFKBIA" "TNFAIP3" "PTGS2" "CXCL1" "IER3"
## [7] "CCL20" "CXCL3" "MAFF" "NFKB2" "TNFAIP2" "HBEGF"
## [13] "KLF6" "BIRC3" "PLAUR" "ICAM1" "JUN" "IL1B"
## [19] "BCL2A1" "PPP1R15A" "ZC3H12A" "IL1A" "RELB" "TRAF1"
## [25] "BTG2" "DUSP1" "MAP3K8" "F3" "SDC4" "EGR1"
## [31] "IL6" "KDM6B" "NFKB1" "LIF" "PTX3" "FOSL1"
## [37] "JAG1" "GCH1" "CCL2" "RCAN1" "EHD1" "CFLAR"
## [43] "RIPK2" "NFKBIE" "PHLDA1" "IER5" "TNFSF9" "GADD45A"
## [49] "CXCL10" "PLK2" "BHLHE40" "SLC2A6" "PTGER4" "DUSP5"
## [55] "SERPINB2" "NFIL3" "SERPINE1" "TRIB1" "RELA" "BIRC2"
## [61] "CXCL6" "LITAF" "TNFAIP6" "CD44" "INHBA" "PLAU"
## [67] "MYC" "TNFRSF9" "SGK1" "TNIP1" "NAMPT" "FOSL2"
## [73] "PNRC1" "ID2" "CD69" "IL7R" "EFNA1" "PHLDA2"
## [79] "PFKFB3" "YRDC" "IFNGR2" "SQSTM1" "BTG3" "GADD45B"
## [85] "KYNU" "G0S2" "BTG1" "MCL1" "VEGFA" "MAP2K3"
## [91] "CDKN1A" "CCN1" "TANK" "IFIT2" "IL18" "TUBB2A"
## [97] "IRF1" "FOS" "OLR1" "RHOB" "NINJ1" "ZBTB10"
## [103] "PLPP3" "CXCL11" "SAT1" "CSF1" "GPR183" "PMEPA1"
## [109] "PTPRE" "TLR2" "ACKR3" "MARCKS" "LAMB3" "CEBPB"
## [115] "TRIP10" "F2RL1" "LDLR" "TGIF1" "RNF19B" "DRAM1"
## [121] "B4GALT1" "DNAJB4" "PDE4B" "SNN" "DENND5A" "SPHK1"
## [127] "CD80" "TNFAIP8" "FUT4" "SPSB1" "TSC22D1" "B4GALT5"
## [133] "SIK1" "CLCF1" "NFE2L2" "NFAT5" "ATP2B1" "IL6ST"
## [139] "ABCA1" "HES1" "IRS2" "SLC2A3" "IL23A" "TAP1"
## [145] "MSC" "IFIH1" "IL15RA" "TNIP2" "PANX1" "EDN1"
## [151] "EIF1" "BMP2" "DUSP4" "PDLIM5" "ICOSLG" "GFPT2"
## [157] "TNC" "SERPINB8" "MXD1"
make_volc_plot <- function(volc_xhx, plot_name) {
volc_xhx$diffexpressed <- "NO"
volc_xhx$diffexpressed[volc_xhx$log2FoldChange > 0.6 & volc_xhx$pval < 0.05] <- "UP"
volc_xhx$diffexpressed[volc_xhx$log2FoldChange < -0.6 & volc_xhx$pval < 0.05] <- "DOWN"
head(volc_xhx[order(volc_xhx$padj) & volc_xhx$diffexpressed == 'DOWN', ])
volc_xhx <- volc_xhx %>%
mutate(ENSEMBL = rownames(volc_xhx))
volc_xhx <- inner_join(volc_xhx, ens2symbol, by = c("ENSEMBL" = "ENSEMBL"))
volc_xhx$delabel <- ifelse(volc_xhx$SYMBOL %in% head(volc_xhx[order(volc_xhx$padj), "SYMBOL"], 30), volc_xhx$SYMBOL, NA)
ggplot(data = volc_xhx, aes(x = log2FoldChange, y = -log10(padj), col = diffexpressed, label = delabel)) +
geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.0001), col = "gray", linetype = 'dashed') +
geom_point(size = 1) +
scale_color_manual(values = c("#00AFBB", "grey", "#bb0c00"),
labels = c("Downregulated", "Not significant", "Upregulated")) +
coord_cartesian(ylim = c(0, 450), xlim = c(-10, 10)) + # since some genes can have minuslog10padj of inf, we set these limits
labs(color = '',
x = expression("log"[2]*"FC"), y = expression("-log"[10]*"p-value")) +
scale_x_continuous(breaks = seq(-10, 10, 2)) +
ggtitle(plot_name) +
geom_text_repel(size = 3,max.overlaps = 12)
}
make_volc_plot(volc_8h1,"Media vs Polymer")
make_volc_plot(volc_8h2,"Media vs TNT alone")
make_volc_plot(volc_8h3,"Media vs TNT+Polymer ")