library(IHW)
library(apeglm)
library(DT)
library(dplyr)
library(biomaRt)
library(tximport)
library(rhdf5)
library(gplots)
library(DESeq2)
library(circlize)
library(RColorBrewer)
library(ComplexHeatmap)
library(pheatmap)
library(PCAtools)
library(clusterProfiler)
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(BiocParallel)
library(ggvenn)
library(RColorBrewer)Document analyzing the concordance of differentially expressed genes in LNCaP Clone1 vs. Control samples and neuroendocrine prostate cancer (NEPCa) gene signature sets derived from Cheng et al. and Beltran et al..
Cheng et al. derived a 12 signature gene set from a literature review of genes involved in NEPCa progression and a 304 gene signature set derived of overlapping DEGs from 3 microarray studies analyzing NEPCa vs. Adenocarcinoma prostate cancer (AdPCa). The two gene sets were merged, resulting in a 310 gene signature set.
Beltran et al. provided an integrated 70 gene signature NEPCa classifier developed by using expression data of genes that were prioritized by genomic, transcriptomic or epigenomic status.
Firstly, we assess the number of overlapping genes in LNCaP Clone1 vs. Control DEGs compared to the two gene signature lists provided by Cheng et al. and Beltran et al.
mtx <- read.table("/data/projects/marvin_lim/mRNA_counts.txt", header=T, row.names = "gene_symbols")
samples <- c("Clone1_N1", "Clone1_N2", "Clone1_N3",
"Clone9_N1", "Clone9_N2", "Clone9_N3",
"Control_N1", "Control_N2", "Control_N3")
colnames(mtx) <- samples
## Create sample DF
sample_vec <- c()
replicates_vec <- c()
sample_df <- as.data.frame(matrix(ncol=2, nrow=0))
for(i in samples){
split <- strsplit(i, "\\_")
sample <- split[[1]][1]
rep <- split[[1]][2]
sample_df[i,1] <- sample
sample_df[i,2] <- rep
}
colnames(sample_df) <- c("Condition", "Replicate")
sample_df$Condition <- as.factor(sample_df$Condition)
sample_df$Replicate <- as.factor(sample_df$Replicate)
## Create DDS object
dds <- DESeqDataSetFromMatrix(mtx, colData = sample_df, design = ~ Replicate + Condition)
dds$Condition <- relevel(dds$Condition, ref = "Control")
keep <- rowSums(counts(dds)) >= 20
dds <- dds[keep,]
dds <- DESeq(dds)
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
res1 <- results(dds, filterFun=ihw, alpha=0.05, c("Condition", "Clone1", "Control"))
LFC1 <- lfcShrink(dds = dds, res= res1, coef = 4, type = "apeglm")
LFC1_df <- as.data.frame(LFC1)
output_col <- c("Gene", "Chromosome", "Start", "Stop", "Strand", "Description", "Log2FC", "P-value", "Adj P-value")
up_key <- intersect(rownames(LFC1)[which(LFC1$log2FoldChange>=1)],
rownames(LFC1)[which(LFC1$padj<=0.05)])
up_df <- as.data.frame((LFC1)[which(rownames(LFC1) %in% up_key),])
up_df <- tibble::rownames_to_column(up_df, "external_gene_name")
info <- getBM(attributes=c("external_gene_name",
"chromosome_name",
"start_position",
"end_position",
"strand",
"entrezgene_description"),
filters = c("external_gene_name"),
values = up_df$external_gene_name,
mart = mart,
useCache = F)
tmp <- merge(info, up_df, by="external_gene_name")
tmp <- tmp[order(-tmp$log2FoldChange),]
## col out index
index <- c(1, 2, 3, 4, 5, 6, 8, 10, 11)
tmp <- tmp[,index]
colnames(tmp) <- output_col
tmp$Strand <- gsub("-1", "-", tmp$Strand)
tmp$Strand <- gsub("1", "+", tmp$Strand)
# remove dups
tmp <- tmp[!duplicated(tmp$Gene),]
up_regulated_table1 <- tmp
output_col <- c("Gene", "Chromosome", "Start", "Stop", "Strand", "Description", "Log2FC", "P-value", "Adj P-value")
down_key <- intersect(rownames(LFC1)[which(LFC1$log2FoldChange<=-1)],
rownames(LFC1)[which(LFC1$padj<=0.05)])
down_df <- as.data.frame((LFC1)[which(rownames(LFC1) %in% down_key),])
down_df <- tibble::rownames_to_column(down_df, "external_gene_name")
info <- getBM(attributes=c("external_gene_name",
"chromosome_name",
"start_position",
"end_position",
"strand",
"entrezgene_description"),
filters = c("external_gene_name"),
values = down_df$external_gene_name,
mart = mart,
useCache = F)
tmp <- merge(info, down_df, by="external_gene_name")
tmp <- tmp[order(tmp$log2FoldChange),]
## col out index
index <- c(1, 2, 3, 4, 5, 6, 8, 10, 11)
tmp <- tmp[,index]
colnames(tmp) <- output_col
tmp$Strand <- gsub("-1", "-", tmp$Strand)
tmp$Strand <- gsub("1", "+", tmp$Strand)
# remove dups
tmp <- tmp[!duplicated(tmp$Gene),]
down_regulated_table1 <- tmp
cheng_up <- read.table("/data/projects/marvin_lim/cheng_et_al/up_nepca.txt")
cheng_down <- read.table("/data/projects/marvin_lim/cheng_et_al/down_nepca.txt")
cheng <- rbind(cheng_up, cheng_down)
beltran_up <- read.table("/data/projects/marvin_lim/beltrans_et_al/up_nepca.txt")
beltran_down <- read.table("/data/projects/marvin_lim/beltrans_et_al/down_nepca.txt")
beltran <- rbind(beltran_up, beltran_down)
deg <- rbind(up_regulated_table1, down_regulated_table1)
up_venn <- list("Lim et al." = deg$Gene,
"Cheng et al." = cheng$V1,
"Beltran et al." = beltran$V1)
ggvenn(data=up_venn, show_percentage = F) + ggtitle("Intersection of DE genes (LNCaP Clone 1 vs. Control) \n vs. NEPCa gene signatures") + theme(plot.title = element_text(hjust = 0.50, size = 16, face = "bold"))The venn diagram reveals 10 genes present in \(\text{Lim et al.} \cap \text{Beltran et al.} \cap \text{Cheng et al.}\)
intersection <- subset(deg, deg$Gene %in% beltran$V1 & deg$Gene %in% cheng$V1)
up <- subset(intersection, intersection$Log2FC > 0 )
DT::datatable(up, rownames = FALSE, options=list(scrollX=T))down <- subset(intersection, intersection$Log2FC < 0 )
DT::datatable(down, rownames = FALSE, options=list(scrollX=T))vsd <- assay(vst(dds, blind=FALSE))
# remove clone 9
subs_vsd <- subset(vsd, select=-c(Clone9_N1, Clone9_N2, Clone9_N3))
# sub 10 gene signature
sub_sig <- subset(subs_vsd, rownames(subs_vsd) %in% intersection$Gene)
mat <- t(sub_sig)
mat <- scale(mat, center = T, scale = T)
mat <- t(mat)
df2 = data.frame(Cell_Line = c(rep("Clone1", 3), rep("Control", 3)))
ha = HeatmapAnnotation(df = df2, col = list(Cell_Line = c("Control" = "black", "Clone1" = "gold")), annotation_legend_param = list(title_gp = gpar(fontsize = 12, fontface = "bold"), labels_gp = gpar(fontsize = 12)))
hm <- Heatmap(mat,
col= colorRamp2(c(-2,-1,0,1,2),c("blue","skyblue","white","lightcoral","red")),
heatmap_legend_param=list(at=c(-1.2,-0.6,0,0.6,1.2),color_bar="continuous",
legend_direction="vertical", legend_width=unit(15,"cm"),
title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),
name = "Z-score",
#Row annotation configurations
cluster_rows=TRUE,
show_row_dend=TRUE,
row_title_side="left",
row_title_gp=gpar(fontsize=10),
show_row_names=TRUE,
row_names_side="right",
#Column annotation configuratiions
cluster_columns=TRUE,
show_column_dend=TRUE,
column_title="NEPCa 10 gene signature\nClone1 vs Control",
column_title_side="top",
column_title_gp=gpar(fontsize=15, fontface="bold"),
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 12, fontface="bold"),
#Dendrogram configurations: columns
clustering_distance_columns="euclidean",
clustering_method_columns="complete",
column_dend_height=unit(10,"mm"),
#Dendrogram configurations: rows
clustering_distance_rows="euclidean",
clustering_method_rows="complete",
row_dend_width=unit(1,"cm"),
row_dend_side = "left",
row_dend_reorder = TRUE,
#Splits
border=T,
#row_km = 1,
#column_km = 2,
#plot params
#width = unit(5, "inch"),
#height = unit(4, "inch"),
#height = unit(0.8, "cm")*nrow(mat),
#Annotations
top_annotation = ha)
draw(hm, annotation_legend_side = "right", heatmap_legend_side="right")174 genes were found in \(\text{Lim et al.} \cap \text{Beltran et al.} \text{ AND } \text{Lim et al.} \cap \text{Cheng et al.}\)
beltran_int <- subset(deg, deg$Gene %in% beltran$V1)
cheng_int <- subset(deg, deg$Gene %in% cheng$V1)
int <- rbind(beltran_int, cheng_int)
up <- subset(int, int$Log2FC > 0 )
up <- up[!duplicated(up), ]
DT::datatable(up, rownames = FALSE, options=list(scrollX=T))down <- subset(int, int$Log2FC < 0 )
down <- down[!duplicated(down), ]
DT::datatable(down, rownames = FALSE, options=list(scrollX=T))beltran_int <- subset(deg, deg$Gene %in% beltran$V1)
cheng_int <- subset(deg, deg$Gene %in% cheng$V1)
int <- rbind(beltran_int, cheng_int)
sub_sig <- subset(subs_vsd, rownames(subs_vsd) %in% int$Gene)
# remove low sd
sub_sig <- sub_sig[apply(sub_sig, MARGIN = 1, FUN = function(x) sd(x) != 0),]
mat <- t(sub_sig)
mat <- scale(mat, center = T, scale = T)
mat <- t(mat)
df2 = data.frame(Cell_Line = c(rep("Clone1", 3), rep("Control", 3)))
ha = HeatmapAnnotation(df = df2, col = list(Cell_Line = c("Control" = "black", "Clone1" = "gold")), annotation_legend_param = list(title_gp = gpar(fontsize = 12, fontface = "bold"), labels_gp = gpar(fontsize = 12)))
hm <- Heatmap(mat,
col= colorRamp2(c(-2,-1,0,1,2),c("blue","skyblue","white","lightcoral","red")),
heatmap_legend_param=list(at=c(-1.8,-0.9,0,0.9,1.8),color_bar="continuous",
legend_direction="vertical", legend_width=unit(15,"cm"),
title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),
name = "Z-score",
#Row annotation configurations
cluster_rows=TRUE,
show_row_dend=TRUE,
row_title_side="left",
row_title_gp=gpar(fontsize=10),
show_row_names=FALSE,
row_names_side="right",
#Column annotation configuratiions
cluster_columns=TRUE,
show_column_dend=TRUE,
column_title="NEPCa 174 gene signature\nClone1 vs Control",
column_title_side="top",
column_title_gp=gpar(fontsize=15, fontface="bold"),
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 12, fontface="bold"),
#Dendrogram configurations: columns
clustering_distance_columns="euclidean",
clustering_method_columns="complete",
column_dend_height=unit(10,"mm"),
#Dendrogram configurations: rows
clustering_distance_rows="euclidean",
clustering_method_rows="complete",
row_dend_width=unit(1,"cm"),
row_dend_side = "left",
row_dend_reorder = TRUE,
#Splits
border=T,
#row_km = 1,
#column_km = 2,
#plot params
#width = unit(5, "inch"),
#height = unit(4, "inch"),
#height = unit(0.8, "cm")*nrow(mat),
#Annotations
top_annotation = ha)
draw(hm, annotation_legend_side = "right", heatmap_legend_side="right")