A bio-functional polymer that prevents retinal scarring through modulation of NRF2 signalling pathway

Here we explore the use of DeSeq2 to extract differential expressed genes in epithelial cells treated under four conditions: Media only control, Polymer alone, TNF-α and TGF-β1 (TNT) alone, TNT + Polymer. The purpose of this pipeline is to replicate the results published by Parikh et al., 2022 which showed that upregulation of genes that are important for the NRF2 signalling pathway is associated with the presence of the polymer and not necessarily TNT.

library(GEOquery)
library(DESeq2)
library(dplyr)
library(ComplexHeatmap)

library(tidyverse)
library(vsn)
library(data.table)

library(ggplot2)
library(RColorBrewer)
library(pheatmap)
library(ggrepel)

library(AnnotationDbi)
library(org.Hs.eg.db)

# Set the working directory
setwd("/Users/seymour/Desktop/Bioinformatics/Advanced_Computational_Biology_BIOL0050/Week3/Gene_Exp_analysis_3/")

gse <- getGEO(filename = "GSE176513_series_matrix.txt.gz", GSEMatrix = TRUE, getGPL = FALSE)

# Create a data frame with samples and conditions
sample_info <- pData(gse)
condition_df <- data.frame(Sample_ID = sample_info$description, Condition_name = sample_info$title) %>%
  mutate(Sample_ID = gsub("gene_count_matrix.csv --", "", Sample_ID)) %>%
  mutate(Condition = substr(Sample_ID, 1, 1)) %>%
  mutate(Duration = substr(Sample_ID, 4, nchar(Sample_ID))) %>%
  mutate(cond_duration = paste(Condition, Duration, sep = "_"))  %>%
  mutate(Condition_name = substr(Condition_name, 1, nchar(Condition_name) - 4)) %>%
  mutate(Condition_name_short = gsub(" \\(8h\\)| \\(24h\\)", "", Condition_name))

#Separate the 8h and 24h samples
condition_df_8h <- condition_df[condition_df$Duration == "8h", ]
condition_df_24h <- condition_df[condition_df$Duration == "24h", ]

#Read in the count data
count_data <- read.csv(gzfile("GSE176513_gene_count_matrix.csv.gz"))

col_order<- condition_df$Sample_ID
rownames(count_data) <- count_data$EnsemblID
aligned_count_data <- count_data[, col_order]

all_dds <- DESeqDataSetFromMatrix(countData = aligned_count_data, colData = condition_df, design = ~ Condition)
all_dds <- all_dds[ rowSums(counts(all_dds)) > 1, ]
all_dds <- estimateSizeFactors(all_dds)

vsd <- vst(all_dds, blind = FALSE)
sampleDists_vsd <- dist(t(assay(vsd)))
sampleDistMatrix_vsd <- as.matrix( sampleDists_vsd )
rownames(sampleDistMatrix_vsd) <- paste(vsd$Condition_name)
colnames(sampleDistMatrix_vsd) <- paste(vsd$Condition_name)

Distance matix of gene expression profiles from RNA sequencing of all four groups (Media only, 1 wt% poly(CEP), TNT and 1 wt% poly(CEP) + TNT at two time points (8 and 24 h), each done in triplicates. 24 h), each done in triplicates. The intensity of each box corresponds to the distance value shown in the legend.

colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
heatmap <- pheatmap(sampleDistMatrix_vsd,
                    clustering_distance_rows = sampleDists_vsd,
                    clustering_distance_cols = sampleDists_vsd,
                    col = colors,cellwidth=8,
                    cellheight=8,treeheight_row = 8, treeheight_col = 8, fontsize_row = 7,fontsize_col = 7)

pca_data_condition<-plotPCA(vsd, intgroup="Condition_name_short")
pca_data_condition

pca_data_duration<-plotPCA(vsd, intgroup="Duration")
pca_data_duration

library(ggfortify)
library(patchwork)

pca_data_duration <- plotPCA(vsd, intgroup = "Duration")
pca_data_condition <- plotPCA(vsd, intgroup = "Condition_name_short")
#match(pca_data_duration$data$name,pca_data_condition$data$name)

pca_data_condition$data$Duration<-pca_data_duration$data$Duration
pcaData<-pca_data_condition$data
ggplot(pcaData, aes(x= PC1, y = PC2))+geom_point(size= 3, aes(shape=Duration, fill=Condition_name_short))+scale_shape_manual(values=c(21, 25))+ scale_fill_manual(values = c("#E69F00", "#56B4E9","Red","Green"),guide = guide_legend(override.aes = list(shape = 22)))+scale_color_discrete()+labs(x = paste0("PC", pca_data_condition$x[, 1]), y = paste0("PC", pca_data_condition$x[, 2])) +ggtitle("PCA Plot - Condition") +theme_minimal()

#Create a function to investigate number of significant genes that are differentially expressed between conditions
create_dds <- function(condition_xh) {
  col_order <- condition_xh$Sample_ID
  aligned_count_data <- count_data[, col_order]
  dds <- DESeqDataSetFromMatrix(countData = aligned_count_data, colData = condition_xh, design = ~ Condition)
  dds <- dds[ rowSums(counts(dds)) > 1, ]
  dds <- estimateSizeFactors(dds)
  
  dds <- DESeq(dds)
  
  res<-results(dds, contrast=c("Condition","A","B"))
  sigs<-na.omit(res)
  sigs<-sigs[sigs$padj<0.05,]
  cat("Number of genes sig in poly(CEP) alone compared to media only:", nrow(sigs), "\n")
  #print(nrow(sigs))
  
  res1<-results(dds, contrast=c("Condition","C","D"))
  sigs1<-na.omit(res1)
  sigs1<-sigs1[sigs1$padj<0.05,]
  cat("Number of genes sig in poly(CEP)+TNT compared to TNT:", nrow(sigs1), "\n")
  
  return(dds)
}

dds_8h <- create_dds(condition_df_8h)

## Number of genes sig in poly(CEP) alone compared to media only: 3472 
## Number of genes sig in poly(CEP)+TNT compared to TNT: 3790

dds_24h <- create_dds(condition_df_24h)

## Number of genes sig in poly(CEP) alone compared to media only: 6646 
## Number of genes sig in poly(CEP)+TNT compared to TNT: 5162

res_8h1<-results(dds_8h,contrast=c("Condition","B","A"))
res_8h2<-results(dds_8h,contrast=c("Condition","C","A"))
res_8h3<-results(dds_8h,contrast=c("Condition","D","A"))

create_sigs_df <- function(res_xh) {
  sigs_xh <- na.omit(res_xh)
  sigs_xh <- sigs_xh[sigs_xh$padj < 0.05,]
  df_sigs_xh <- as.data.frame(sigs_xh)
  return(df_sigs_xh)
}

sigs_8h1 <- create_sigs_df(res_8h1)
sigs_8h2 <- create_sigs_df(res_8h2)
sigs_8h3 <- create_sigs_df(res_8h3)

mat_8h1<-counts(dds_8h,normalized=T)[rownames(sigs_8h1),]
common_row_names1 <- intersect(rownames(mat_8h1), rownames(sigs_8h2))

mat_8h2<-mat_8h1[common_row_names1,]
common_row_names2 <- intersect(rownames(mat_8h2), rownames(sigs_8h3))

mat_8h3<-mat_8h2[common_row_names2,]
z_mat_8h3<-t(apply(mat_8h3,1,scale))

colnames(z_mat_8h3) <- condition_df_8h$Condition_name_short
pheatmap(z_mat_8h3,clustering_distance_rows = "euclidean",clustering_distance_cols = "euclidean",clustering_method = "ward.D2",treeheight_row = 7,treeheight_col = 20,show_rownames = F, cellwidth=20, cellheight=0.2,fontsize=5)

# Your existing lists
Media_data <- rownames(sigs_8h1)
Polymer_data <- rownames(sigs_8h1)

print(length(Media_data))

## [1] 3472

TNT_data <- rownames(sigs_8h2)
Polymer_and_TNT_data <- rownames(sigs_8h3)

# Update Media with new rownames from sigs_8h2 and sigs_8h3
Media_data <- union(Media_data, TNT_data)
print(length(Media_data))

## [1] 7543

Media_data <- union(Media_data, Polymer_and_TNT_data)
print(length(Media_data))

## [1] 9358

venn_df_Media <- data.frame(Media=Media_data)
venn_df_Polymer <- data.frame(Polymer=Polymer_data)

venn_df_Media$RowName <- venn_df_Media$Media
venn_df_Polymer$RowName <-venn_df_Polymer$Polymer

dat1 <- merge(venn_df_Media , venn_df_Polymer, by="RowName", all = T)


venn_df_TNT <- data.frame(TNT=TNT_data)
venn_df_Polymer_and_TNT <- data.frame(Polymer_and_TNT=Polymer_and_TNT_data)

venn_df_TNT$RowName <- venn_df_TNT$TNT
venn_df_Polymer_and_TNT$RowName <-venn_df_Polymer_and_TNT$Polymer_and_TNT

dat2 <- merge(venn_df_TNT, venn_df_Polymer_and_TNT, by="RowName", all = T)
dat3 <- merge(dat1, dat2, by="RowName", all = T)

library(VennDiagram)
#venn.diagram(dat3, filename = "venn-4-dimensions.png")

#Error in `[[<-.data.frame`(`*tmp*`, i, value = c(NA, "ENSG00000000460", :
#replacement has 3473 rows, data has 9358

write.csv(dat3, file = "dat3.csv", row.names = FALSE)

Fig

prepro_res <- function(res_xh) {
  sigs_xh <- res_xh
  sigs_xh <- sigs_xh[sigs_xh$padj < 0.05 | is.na(sigs_xh$padj), ]
  df_sigs_xh <- as.data.frame(sigs_xh)
  return(df_sigs_xh)
}

volc_8h1 <- prepro_res(res_8h1)
volc_8h2 <- prepro_res(res_8h2)
volc_8h3 <- prepro_res(res_8h3)

library(fgsea)
library(tibble)

pathways.hallmark <- gmtPathways("h.all.v7.2.symbols.gmt")

plot_pathways<-function(res_xhx,plot_title){
 
  ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                      key=rownames(res_xhx), 
                                      columns="SYMBOL",
                                      keytype="ENSEMBL")
  ens2symbol <- as_tibble(ens2symbol)
  
  sigs_xhx_results <- res_xhx[order(res_xhx$pvalue),]
  sigs_xhx_results$ENSEMBL <- rownames(sigs_xhx_results)
  sigs_xhx_results <- inner_join(sigs_xhx_results, ens2symbol, by=c("ENSEMBL"="ENSEMBL"))
  sigs_xhx_results <- sigs_xhx_results %>%
    mutate(updown = ifelse(log2FoldChange > 0, "upregulated", "downregulated"))
  
  res2 <- sigs_xhx_results %>% 
  dplyr::select(SYMBOL, stat,updown) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(SYMBOL) %>% 
  summarize(stat=mean(stat), updown = first(updown))

  ranks <- deframe(res2)
  
  fgseaRes <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)


  fgseaResTidy <- fgseaRes %>%
    as_tibble() %>%
    arrange(desc(NES))
  

  gene.in.pathway <- pathways.hallmark %>% 
    enframe("pathway", "SYMBOL") %>% 
    unnest(cols = c(SYMBOL)) %>% 
    inner_join(res2, by="SYMBOL")
  
  gene.in.pathway <- gene.in.pathway %>%
    inner_join(res2, by = "SYMBOL")
  

  fgseaResTidy <- fgseaResTidy %>%
    inner_join(gene.in.pathway, by = "pathway")
  

  size_breaks <- c(10, 50, 100, 200,300, Inf)
  
  fgseaResTidy$gene_count_group <- cut(fgseaResTidy$size, breaks = size_breaks, 
                                       labels = c("10", "50", "100", "200","300"))
  
  colors <- c("downregulated" = "blue", "upregulated" = "brown")
  
  fgseaResTidy$circle_color <- colors[fgseaResTidy$updown.x]
  
  ggplot(fgseaResTidy, aes(-log10(padj), reorder(pathway, NES))) +
    geom_point(aes(color = circle_color, size = gene_count_group)) +
    scale_color_manual(values = c("blue", "brown")) +
    scale_size_manual(values = c("10" = 1,  "50" = 1.6, "100" = 2.4, "200" = 5, "300" = 6)) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          axis.text.y = element_text(size = 6),
    legend.text = element_text(size = 12))+
    labs(x = "-log10(pval)", y = "Pathway", 
         title = plot_title) +
    guides(size = guide_legend(title = "Gene Count Group"), 
           color = guide_legend(title = "Up/Down Regulation"))
  
    
}

plot_pathways(volc_8h1,"Polymer only, 8hr")

plot_pathways(volc_8h2,"TNT alone, 8hr")

plot_pathways(volc_8h3,"TNT+ploymer, 8hr")

res_xhx=volc_8h3
ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                    key=rownames(res_xhx), 
                                    columns="SYMBOL",
                                    keytype="ENSEMBL")

sigs_xhx_results <- res_xhx[order(res_xhx$pvalue),]
sigs_xhx_results$ENSEMBL <- rownames(sigs_xhx_results)
sigs_xhx_results <- inner_join(sigs_xhx_results, ens2symbol, by=c("ENSEMBL"="ENSEMBL"))

res2 <- sigs_xhx_results %>% 
  dplyr::select(SYMBOL, stat) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(SYMBOL) %>% 
  summarize(stat=mean(stat))

ranks <- deframe(res2)

fgseaRes <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)

fgseaResTidy <- fgseaRes %>%
  as_tibble() %>%
  arrange(desc(NES))


gene.in.pathway <- pathways.hallmark %>% 
  enframe("pathway", "SYMBOL") %>% 
  unnest(cols = c(SYMBOL)) %>% 
  inner_join(res2, by="SYMBOL")

gene.in.pathway <- gene.in.pathway %>%
  inner_join(res2, by = "SYMBOL")

fgseaResTidy <- fgseaResTidy %>%
  inner_join(gene.in.pathway, by = "pathway")

most_significant_pathway <- fgseaResTidy[1, ]
most_significant_pathway_name <- most_significant_pathway$pathway
print(most_significant_pathway_name)

## [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"

genes_most_significant <- gene.in.pathway %>%
  filter(pathway == most_significant_pathway_name) %>%
  dplyr::select(SYMBOL)

gene_symbols_most_significant <- genes_most_significant$SYMBOL
print(gene_symbols_most_significant)

##   [1] "JUNB"     "NFKBIA"   "TNFAIP3"  "PTGS2"    "CXCL1"    "IER3"    
##   [7] "CCL20"    "CXCL3"    "MAFF"     "NFKB2"    "TNFAIP2"  "HBEGF"   
##  [13] "KLF6"     "BIRC3"    "PLAUR"    "ICAM1"    "JUN"      "IL1B"    
##  [19] "BCL2A1"   "PPP1R15A" "ZC3H12A"  "IL1A"     "RELB"     "TRAF1"   
##  [25] "BTG2"     "DUSP1"    "MAP3K8"   "F3"       "SDC4"     "EGR1"    
##  [31] "IL6"      "KDM6B"    "NFKB1"    "LIF"      "PTX3"     "FOSL1"   
##  [37] "JAG1"     "GCH1"     "CCL2"     "RCAN1"    "EHD1"     "CFLAR"   
##  [43] "RIPK2"    "NFKBIE"   "PHLDA1"   "IER5"     "TNFSF9"   "GADD45A" 
##  [49] "CXCL10"   "PLK2"     "BHLHE40"  "SLC2A6"   "PTGER4"   "DUSP5"   
##  [55] "SERPINB2" "NFIL3"    "SERPINE1" "TRIB1"    "RELA"     "BIRC2"   
##  [61] "CXCL6"    "LITAF"    "TNFAIP6"  "CD44"     "INHBA"    "PLAU"    
##  [67] "MYC"      "TNFRSF9"  "SGK1"     "TNIP1"    "NAMPT"    "FOSL2"   
##  [73] "PNRC1"    "ID2"      "CD69"     "IL7R"     "EFNA1"    "PHLDA2"  
##  [79] "PFKFB3"   "YRDC"     "IFNGR2"   "SQSTM1"   "BTG3"     "GADD45B" 
##  [85] "KYNU"     "G0S2"     "BTG1"     "MCL1"     "VEGFA"    "MAP2K3"  
##  [91] "CDKN1A"   "CCN1"     "TANK"     "IFIT2"    "IL18"     "TUBB2A"  
##  [97] "IRF1"     "FOS"      "OLR1"     "RHOB"     "NINJ1"    "ZBTB10"  
## [103] "PLPP3"    "CXCL11"   "SAT1"     "CSF1"     "GPR183"   "PMEPA1"  
## [109] "PTPRE"    "TLR2"     "ACKR3"    "MARCKS"   "LAMB3"    "CEBPB"   
## [115] "TRIP10"   "F2RL1"    "LDLR"     "TGIF1"    "RNF19B"   "DRAM1"   
## [121] "B4GALT1"  "DNAJB4"   "PDE4B"    "SNN"      "DENND5A"  "SPHK1"   
## [127] "CD80"     "TNFAIP8"  "FUT4"     "SPSB1"    "TSC22D1"  "B4GALT5" 
## [133] "SIK1"     "CLCF1"    "NFE2L2"   "NFAT5"    "ATP2B1"   "IL6ST"   
## [139] "ABCA1"    "HES1"     "IRS2"     "SLC2A3"   "IL23A"    "TAP1"    
## [145] "MSC"      "IFIH1"    "IL15RA"   "TNIP2"    "PANX1"    "EDN1"    
## [151] "EIF1"     "BMP2"     "DUSP4"    "PDLIM5"   "ICOSLG"   "GFPT2"   
## [157] "TNC"      "SERPINB8" "MXD1"

make_volc_plot <- function(volc_xhx, plot_name) {

  volc_xhx$diffexpressed <- "NO"
  volc_xhx$diffexpressed[volc_xhx$log2FoldChange > 0.6 & volc_xhx$pval < 0.05] <- "UP"
  volc_xhx$diffexpressed[volc_xhx$log2FoldChange < -0.6 & volc_xhx$pval < 0.05] <- "DOWN"
  
  head(volc_xhx[order(volc_xhx$padj) & volc_xhx$diffexpressed == 'DOWN', ])
  
  volc_xhx <- volc_xhx %>%
    mutate(ENSEMBL = rownames(volc_xhx))
  
  volc_xhx <- inner_join(volc_xhx, ens2symbol, by = c("ENSEMBL" = "ENSEMBL"))
  
  volc_xhx$delabel <- ifelse(volc_xhx$SYMBOL %in% head(volc_xhx[order(volc_xhx$padj), "SYMBOL"], 30), volc_xhx$SYMBOL, NA)
  ggplot(data = volc_xhx, aes(x = log2FoldChange, y = -log10(padj), col = diffexpressed, label = delabel)) +
  geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.0001), col = "gray", linetype = 'dashed') + 
  geom_point(size = 1) + 
  scale_color_manual(values = c("#00AFBB", "grey", "#bb0c00"),  
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  coord_cartesian(ylim = c(0, 450), xlim = c(-10, 10)) + # since some genes can have minuslog10padj of inf, we set these limits
  labs(color = '',  
       x = expression("log"[2]*"FC"), y = expression("-log"[10]*"p-value")) + 
  scale_x_continuous(breaks = seq(-10, 10, 2)) +
  ggtitle(plot_name) + 
  geom_text_repel(size = 3,max.overlaps = 12)
}

make_volc_plot(volc_8h1,"Media vs Polymer")

make_volc_plot(volc_8h2,"Media vs TNT alone")

make_volc_plot(volc_8h3,"Media vs TNT+Polymer ")