Document creation Libraries
library(DESeq2)
library(Rsubread)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(cluster)
library(factoextra)
library(amap)
library(NbClust)
library(dplyr)
Set working directory to local workspace Save R markup, featurecounts results and full gtf there RNA featurecounts generated with ‘Homo_sapiens.GRCh38.99.gtf.gz’ file
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts")
getwd()
[1] "/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts"
RNA
Load featurecounts results, configure as data frame object Setup metadata information and check naming consistency Remove non-counts columns in fc object
fc <- read.table("RNA_ALCL_wholegenecounts", sep="\t", header=TRUE, row.names=1)
fc2 <- as.data.frame(fc)
orig_names <- colnames(fc2[6:14])
colData_rna <- data.frame(row.names=orig_names, sample=c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A"), subtype=c("ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "ALK-ALCL", "ALK-ALCL"))
all(rownames(colData_rna) %in% colnames(fc2))
[1] TRUE
fc3 <- subset(fc2, select=-c(Chr, Start, End, Strand, Length))
colnames(fc3) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
Create DEseq object and save file Generate normalization factor with DEseq default
dds_rna <- DESeqDataSetFromMatrix(countData=fc3, colData=colData_rna, design=~sample)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
row.names(deseq_sizes_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
deseq_sizes_rna
saveRDS(dds_rna, file="RNA_dds.rds")
Generate norm counts files and save to local directory
rawcounts_rna <- as.data.frame(counts(dds_rna, normalized=FALSE))
normcounts_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
colnames(rawcounts_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(normcounts_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
#rownames(rawcounts_rna) <- sapply(strsplit(rownames(rawcounts_rna), split="[.]"), head, 1)
#rownames(normcounts_rna) <- sapply(strsplit(rownames(normcounts_rna), split="[.]"), head, 1)
write.table(rawcounts_rna, file="RNA_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_rna, file="RNA_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
Data transformation for plot generation Top 500 contributors is the default for PCA Set ‘ntop’ to number of observations in the dds object
rld_rna <- rlog(dds_rna, blind=TRUE)
sampleDistsrna <- dist(t(assay(rld_rna)))
sampleDistMtxrna <- as.matrix(sampleDistsrna)
rownames(sampleDistMtxrna) <- paste0(rld_rna$sample)
colnames(sampleDistMtxrna) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_rna, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq PCA") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 500 PCA
plotPCA(rld_rna, intgroup = "subtype") + ggtitle("RNA-seq PCA Top 500") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 1000
plotPCA(rld_rna, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq PCA Top 1000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 5000
plotPCA(rld_rna, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq PCA Top 5000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 10000
plotPCA(rld_rna, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq PCA Top 10000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 15000
plotPCA(rld_rna, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq PCA Top 15000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxrna, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="ALCL RNA Sample Distance Clustering")

plot(hclust(sampleDistsrna), labels=rownames(sampleDistMtxrna), main="ALCL RNA Sample Distance Dendrogram")

Assemble counts results
avg_rna <- data.frame(EnsemblID=rownames(normcounts_rna))
avg_rna$KARPAS299 <- (normcounts_rna$KARPAS299)
avg_rna$L82 <- (normcounts_rna$L82)
avg_rna$SUPM2 <- (normcounts_rna$SUPM2)
avg_rna$SUDHL1 <- (normcounts_rna$SUDHL1)
avg_rna$KIJK <- (normcounts_rna$KIJK)
avg_rna$SR786 <- (normcounts_rna$SR786)
avg_rna$FEPD <- (normcounts_rna$FEPD)
avg_rna$DL40 <- (normcounts_rna$DL40)
avg_rna$MAC2A <- (normcounts_rna$MAC2A)
avg_rna$max <- apply(avg_rna[c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")], 1, max, na.rm=TRUE)
rel_rna <- data.frame(EnsemblID=avg_rna$EnsemblID)
rel_rna$KARPAS299 <- avg_rna$KARPAS299 / avg_rna$max
rel_rna$L82 <- avg_rna$L82 / avg_rna$max
rel_rna$SUPM2 <- avg_rna$SUPM2 / avg_rna$max
rel_rna$SUDHL1 <- avg_rna$SUDHL1 / avg_rna$max
rel_rna$KIJK <- avg_rna$KIJK / avg_rna$max
rel_rna$SR786 <- avg_rna$SR786 / avg_rna$max
rel_rna$FEPD <- avg_rna$FEPD / avg_rna$max
rel_rna$DL40 <- avg_rna$DL40 / avg_rna$max
rel_rna$MAC2A <- avg_rna$MAC2A / avg_rna$max
Data transform for variance assessment and sub-clustering
rv1 <- rowVars(assay(rld_rna))
select1 <- order(rv1, decreasing=TRUE)
pc1 <- prcomp(t(assay(rld_rna)[select1,]))
loadings1 <- as.data.frame(pc1$rotation)
aload1 <- abs(loadings1)
pcRank <- data.frame("EnsemblID"=rownames(loadings1), "PCA.rank"=seq.int(nrow(loadings1)))
sweep(aload1, 2, colSums(aload1), "/")
peakVar <- data.frame("EnsemblID"=rownames(assay(rld_rna)))
peakVar$rlog.variance <- rowVars(as.matrix(assay(rld_rna)))
peakVar <- merge(pcRank, peakVar, by.x="EnsemblID", by.y="EnsemblID")
ggplot(peakVar, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("ALCL Variance per PC Contributor")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar <- peakVar[order(peakVar$PCA.rank),]
peakVar$cumulative.Variance <- cumsum(peakVar$rlog.variance)/sum(peakVar$rlog.variance)
ggplot(peakVar, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("ALCL Cumulative Variance per PC Rank")

PCA top contributors selection (5K to 20K)
rv <- rowVars(assay(rld_rna))
select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))]
#Changing the 1000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 1000 ALCL RNA PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top1000 <- rel_rna[(rel_rna$EnsemblID %in% rownames(aload)),]
write.table(top10000, file="Top1000.txt", sep='\t')
RNA_rel <- rel_rna[(rel_rna$EnsemblID %in% top1000$EnsemblID),]
saveRDS(RNA_rel, file="RNA_rel.rds")
colnames(RNA_rel) <- c("EnsemblID", "KARPAS299.gene", "L82.gene", "SUPM2.gene", "SUDHL1.gene", "KIJK.gene", "SR786.gene", "FEPD.gene", "DL40.gene", "MAC2A.gene")
head(top1000)
head(RNA_rel)
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
gap_stat <- clusGap(RNA_rel[2:10], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
Clustering k = 1,2,..., K.max (= 24): .. done
Bootstrapping, b = 1,2,..., B (= 50) [one "." per sample]:
.................................................. 50
#... = number of samples + 1
#Cluster Stat Plots
fviz_nbclust(top10000[2:10], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")

fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")

fviz_nbclust(top10000[2:10], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")

#k-means Clustering
CLARAk2 <- clara(top1000[2:10],2,medoids.x=TRUE)
CLARAk3 <- clara(top1000[2:10],3,medoids.x=TRUE)
CLARAk4 <- clara(top1000[2:10],4,medoids.x=TRUE)
CLARAk5 <- clara(top1000[2:10],5,medoids.x=TRUE)
CLARAk6 <- clara(top1000[2:10],6,medoids.x=TRUE)
CLARAk7 <- clara(top1000[2:10],7,medoids.x=TRUE)
CLARAk8 <- clara(top1000[2:10],8,medoids.x=TRUE)
top1000$k2 <- CLARAk2$clustering
top1000$k3 <- CLARAk3$clustering
top1000$k4 <- CLARAk4$clustering
top1000$k5 <- CLARAk5$clustering
top1000$k6 <- CLARAk6$clustering
top1000$k7 <- CLARAk7$clustering
top1000$k8 <- CLARAk8$clustering
#Displaying Heatmaps
#k-means 20
pheatmap(top1000[order(top1000[, 10+1]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+1])

#k-means 26
pheatmap(top1000[order(top1000[, 10+2]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+2])

#k-means 30
pheatmap(top1000[order(top1000[, 10+3]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+3])

#k-means 36
pheatmap(top1000[order(top1000[, 10+4]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+4])

#k-means 15
pheatmap(top1000[order(top1000[, 10+5]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+5])

#k-means 25
pheatmap(top1000[order(top1000[, 10+6]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+6])

#k-means 22
pheatmap(top1000[order(top1000[, 10+7]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+7])

–RNA END–
PRO
#Prepare dREG peak saf for featureCounts
dREG <- read.table("ALCL_s0.5p0.025_filtered.bed")
colnames(dREG) <- c("chromosome", "start", "end", "dREGpeakID", "dREGscore", "strand", "centerStart",
"centerEnd")
write.table(dREG, file="dREG readtable.txt", sep="\t", quote=F, row.names=F)
dREGsaf <- data.frame("GeneID"=dREG$dREGpeakID, "Chr"=dREG$chromosome, "Start"=dREG$start, "End"=dREG$end, "Strand"="+")
write.table(dREGsaf, file="dREG.saf", sep="\t", quote=F, row.names=F)
#Load sample SAM file list
#profiles <- c("")
#Run feautreCounts
#fc_dREG <- featureCounts(profiles, annot.ext=dREGsaf, isGTFAnnotationFile=FALSE, minMQS=10, countChimericFragments=FALSE, isPairedEnd=FALSE, strandSpecific=0, nthreads=8)
fc_pro <- read.table("PRO_ALCL_dREGcounts", header=T, row.names=1)
fc_pro <- as.data.frame(fc_pro)
fc_pro <- subset(fc_pro, select=-c(Chr, Start, End, Strand, Length))
colnames(fc_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
#Load sample metadata (can add as much sample metadata as desired including cell type, drug treatment, etc)
colData_pro <- data.frame(row.names=colnames(fc_pro$counts), sample=c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A"), subtype=c("ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "ALK-ALCL", "ALK-ALCL"))
all((colData_pro$sample) %in% colnames(fc_pro))
[1] TRUE
#Create DESeq object and generate normalized counts
dds_pro <- DESeqDataSetFromMatrix(countData=fc_pro, colData=colData_pro, design=~sample)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
variable.names(dds_pro)
[1] "KARPAS299" "L82" "SUPM2" "SUDHL1" "KIJK" "SR786" "FEPD" "DL40"
[9] "MAC2A"
keep <- rowMax(counts(dds_pro, normalize=TRUE)) >= 20
dds_pro <- dds_pro[keep,]
saveRDS(dds_pro, file="dREG_dds.rds")
dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds_pro),]
write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
quote=FALSE)
Generate norm counts files and save to local directory
rawcounts_pro <- as.data.frame(counts(dds_pro, normalized=FALSE))
normcounts_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(rawcounts_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(normcounts_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
write.table(rawcounts_pro, file="pro_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
Data transformation for plot generation Top 500 contributors is the default for PCA Set ‘ntop’ to number of observations in the dds object
rld_pro <- rlog(dds_pro, blind=TRUE)
sampleDistspro <- dist(t(assay(rld_pro)))
sampleDistMtxpro <- as.matrix(sampleDistspro)
rownames(sampleDistMtxpro) <- paste0(rld_pro$sample)
colnames(sampleDistMtxpro) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_pro, intgroup = "subtype", ntop=62744) + ggtitle("pro-seq PCA") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 500 PCA
plotPCA(rld_pro, intgroup = "subtype") + ggtitle("pro-seq PCA Top 500") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 1000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq PCA Top 1000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 5000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq PCA Top 5000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 10000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq PCA Top 10000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 15000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=15000) + ggtitle("pro-seq PCA Top 15000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 20000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=20000) + ggtitle("pro-seq PCA Top 20000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxpro, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="ALCL Sample Distance Clustering")

plot(hclust(sampleDistspro), labels=rownames(sampleDistMtxpro), main="ALCL Sample Distance Dendrogram")

Assemble counts results
avg_pro <- data.frame(peakID=rownames(normcounts_pro))
avg_pro$KARPAS299 <- (normcounts_pro$KARPAS299)
avg_pro$L82 <- (normcounts_pro$L82)
avg_pro$SUPM2 <- (normcounts_pro$SUPM2)
avg_pro$SUDHL1 <- (normcounts_pro$SUDHL1)
avg_pro$KIJK <- (normcounts_pro$KIJK)
avg_pro$SR786 <- (normcounts_pro$SR786)
avg_pro$FEPD <- (normcounts_pro$FEPD)
avg_pro$DL40 <- (normcounts_pro$DL40)
avg_pro$MAC2A <- (normcounts_pro$MAC2A)
avg_pro$max <- apply(avg_pro[c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")], 1, max, na.rm=TRUE)
rel_pro <- data.frame(peakID=avg_pro$peakID)
rel_pro$KARPAS299 <- avg_pro$KARPAS299 / avg_pro$max
rel_pro$L82 <- avg_pro$L82 / avg_pro$max
rel_pro$SUPM2 <- avg_pro$SUPM2 / avg_pro$max
rel_pro$SUDHL1 <- avg_pro$SUDHL1 / avg_pro$max
rel_pro$KIJK <- avg_pro$KIJK / avg_pro$max
rel_pro$SR786 <- avg_pro$SR786 / avg_pro$max
rel_pro$FEPD <- avg_pro$FEPD / avg_pro$max
rel_pro$DL40 <- avg_pro$DL40 / avg_pro$max
rel_pro$MAC2A <- avg_pro$MAC2A / avg_pro$max
PCA top contributors selection (5000 tried first)
dREG_annot <- read.table("ALCL_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
PRO_rel <- rel_pro
#check this
colnames(PRO_rel) <- c("dREGpeakID", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "SUDHL1.enhancer", "KIJK.enhancer", "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer", "MAC2A.ehancer")
saveRDS(PRO_rel, file="pro_rel.rds")
head(PRO_rel)
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))]
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top5000, file="Top5000.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")

ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
pro_5000_rel <- rel_pro[(rel_pro$peakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
gap_stat <- clusGap(pro_5000_rel[2:10], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
Clustering k = 1,2,..., K.max (= 24): .. done
Bootstrapping, b = 1,2,..., B (= 50) [one "." per sample]:
.................................................. 50
#... = number of samples + 1
#Cluster Stat Plots
fviz_nbclust(pro_5000_rel[2:10], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")

fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")

fviz_nbclust(pro_5000_rel[2:10], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")

#k-means Clustering
CLARAk4 <- clara(pro_5000_rel[2:10],4,medoids.x=TRUE)
CLARAk5 <- clara(pro_5000_rel[2:10],5,medoids.x=TRUE)
CLARAk6 <- clara(pro_5000_rel[2:10],6,medoids.x=TRUE)
CLARAk7 <- clara(pro_5000_rel[2:10],7,medoids.x=TRUE)
CLARAk8 <- clara(pro_5000_rel[2:10],8,medoids.x=TRUE)
pro_5000_rel$k4 <- CLARAk4$clustering
pro_5000_rel$k5 <- CLARAk5$clustering
pro_5000_rel$k6 <- CLARAk6$clustering
pro_5000_rel$k7 <- CLARAk7$clustering
pro_5000_rel$k8 <- CLARAk8$clustering
#Displaying Heatmaps
#k-means 4
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+1]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+1])

#k-means 5
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+2]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+2])

#k-means 6
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+3]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+3])

#k-means 7
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+4]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+4])

#k-means 8
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+5]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+5])

–PRO END–
Combined PRO and RNA analysis
Load DESeq objects and annotations
#dds_rna <- readRDS("RNA_dds.rds")
#dds_pro <- readRDS("dREG_dds.rds")
dREGann <- read.table("ALCL_p1000_m1000000_gene-centric_closest-dREGpeaks.txt", sep="\t", header=T)
write.table(deseq_sizes_pro, file="DEseq size factors.txt", sep="\t", quote=F, row.names=T)
Make prelim dataframes
norm_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
rld_rna2 <- rlog(dds_rna, blind=TRUE)
colnames(norm_rna) <- dds_rna$sample #Simplified names from dds metadata
rel_rna2 <- data.frame("geneID"=rownames(norm_rna))
rel_rna2$max.gene.counts <- apply(norm_rna[1:ncol(norm_rna)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_rna)){
rel_rna2[paste0(dds_rna$sample[i],".gene.rel")] <- norm_rna[,i]/rel_rna2$max.gene.counts
i=i+1
}
#reorder
rel_rna2 <- rel_rna2[,c(1,3:11,2)]
norm_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(norm_pro) <- dds_pro$sample #Simplified names from dds metadata
norm_pro <- norm_pro[names(norm_rna)] #reorder sample columns to match RNA-seq data
colnames(PRO_rel)[1] <- ("closest_peak")
#rel_pro2 <- data.frame("closest_peak"=rownames(norm_pro))
PRO_rel$max.dREG.counts <- apply(norm_pro[1:ncol(norm_pro)], 1, max, na.rm=TRUE)
#PRO_rel <- PRO_rel[,c(1,27,2:26)] #reorder columns
#i=1
#while (i<=ncol(norm_pro)){
# rel_pro2[paste0(dds_pro$sample[i],".dREG.rel")] <- norm_pro[,i]/rel_pro2$max.dREG.counts
# i=i+1
#}
Selecting gene PCA contributors by variance
#Select top 1000 PCA contributors
rv <- rowVars(assay(rld_rna2))
select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))] #Changing the 500 here will select a different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna2)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
sweep(aload, 2, colSums(aload), "/")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 500 elements with their annotations
top1k_rel_rna <- rel_rna2[(rel_rna2$geneID %in% rownames(aload)),] #filter rel gene dataframe for top 10k
Determining clusters (k=3)
CLARA <- clara(top1k_rel_rna[2:10],3,medoids.x=TRUE)
top1k_rel_rna$k3 <- CLARA$clustering
pheatmap(top1k_rel_rna[order(top1k_rel_rna[,11]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1k_rel_rna[11])

Data integration
gene_res <- rel_rna2[(!rel_rna2$geneID %in% top1k_rel_rna$geneID),] #filter out genes not in top 10k
gene_res$k3 <- NA #add value for genes not clustered
gene_res <- rbind(top1k_rel_rna, gene_res) #combine with top 10k genes with cluster designations
#gene_res$geneID <- sapply(strsplit(gene_res$geneID, split="[.]"), head, 1) #simplify geneIDs by removing version numbers
comb_res <- unique(merge(dREGann, gene_res, by.x="geneID", by.y="geneID", all.y=T))
colnames(comb_res)[1] <- "geneID"
comb_res <- merge(comb_res, PRO_rel, by.x="closest_peak", by.y="closest_peak", all.x=T) #combine with dREG peak data, keep genes without a closest dREG peak call
comb_res <- comb_res[,c(2:9,13:23,1,10:12,24:33)] #reorder columns
Gene-to-Enhancer concordance analysis – this generates a stat ranging from -1 (anti-correlated) to 0 (not correlated) to +1 (correlated) reported as Spearman rho
#note this will produce a lot of errors for rows that don't have a gene and a an enhancer, so I've silenced them here
spear <- c()
i <- 1
while(i<=nrow(comb_res)){
speari <- cor(as.numeric(factor(comb_res[i,9:17])),as.numeric(factor(comb_res[i,24:32])),method="spearman")
spear <- c(spear, speari)
i <- i+1
}
comb_res$Spearman.rho <- spear
Save output
write.table(comb_res, file="ALCL.final.table.txt", sep="\t", quote=F, row.names=F)
dREGcountsann <- merge(PRO_rel, dREG_readFilt, by.x="closest_peak", by.y="dREGpeakID")
dREGcountsann <- dREGcountsann[,c(1,12:18,2:11)]
write.table(dREGcountsann, file="ALCL.dREGs.txt", sep="\t", quote=F, row.names=F)
Correct output to include metadata for genes without nearest enhancer calls
library(dplyr)
#remove columns with annotation issue
comb_res_woEnh <- comb_res %>% filter(is.na(geneName))
comb_res_wEnh <- comb_res %>% filter(!is.na(geneName))
#Load original unfiltered annotations
comp_gtf <- read.table("Homo_sapiens.GRCh38.99.ucsc.gtf", sep="\t")
colnames(comp_gtf) <- c("chr", "source", "type", "start", "end", "score", "strand", "phase", "attributes")
#Function to extract metadata from gtf annotation
extract_attributes <- function(gtf_attributes, att_of_interest){
att <- strsplit(as.character(gtf_attributes), "; ")
att <- gsub("\"","",unlist(att))
att <- gsub(";","",unlist(att))
if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
}else{
return(NA)}
}
#Extract desired information from gtf (geneID, geneName, biotype)
comp_gene <- filter(comp_gtf, type == "gene")
comp_gene$geneID <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_id"))
comp_gene$geneName <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_name"))
comp_gene$biotype <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_biotype"))
#Remove offending columns
#geneName,biotype,attributes,chr,start,end,strand
comb_res_woEnh <- comb_res_woEnh[,c(1,9:34)]
#Add back information
geneInfo <- data.frame("geneID"=comp_gene$geneID, "geneName"=comp_gene$geneName, "biotype"=comp_gene$biotype, "attributes"=comp_gene$attributes, "chr"=comp_gene$chr, "start"=comp_gene$start, "end"=comp_gene$end, "strand"=comp_gene$strand)
comb_res_woEnh_corr <- merge(geneInfo, comb_res_woEnh, by.x="geneID", by.y="geneID", all.y=T)
#Modify geneName and peak_counts
comb_res_woEnh_corr$peak_count <- 0
comb_res_woEnh_corr$geneName <- paste0(comb_res_woEnh_corr$geneName)
#Correcting geneNames in original
comb_res_wEnh$geneName <- substring(comb_res_wEnh$geneName, 1)
comb_res_wEnh$geneName <- paste0(comb_res_wEnh$geneName)
#Combining it all together
comb_res_corr <- rbind(comb_res_woEnh_corr, comb_res_wEnh)
write.table(comb_res_corr, file="ALCL.combined.results.corrected.txt", sep="\t", quote=F, row.names=F)
Combining full annotation to plot based on genomic category
filtered_res <- merge(dREG_annot, comb_res_corr, by.x="peakID", by.y="closest_peak", all.y=F)
write.table(filtered_res, file="Overlaping contribtors.txt", sep='\t', quote=F, row.names=F)
ggplot(filtered_res, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Overlapping PCA Contributors for Gene Expression and dREGpeak Annotations")

Combination completed, further investigation into clusters ongoing - 28JAN22
---
title: "Combined PRO_RNA web access"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Document creation
Libraries
```{r}
library(DESeq2)
library(Rsubread)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(cluster)
library(factoextra)
library(amap)
library(NbClust)
library(dplyr)
```

Set working directory to local workspace
Save R markup, featurecounts results and full gtf there
RNA featurecounts generated with 'Homo_sapiens.GRCh38.99.gtf.gz' file
```{r}
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts")
getwd()
```

RNA

Load featurecounts results, configure as data frame object
Setup metadata information and check naming consistency
Remove non-counts columns in fc object
```{r}
fc <- read.table("RNA_ALCL_wholegenecounts", sep="\t", header=TRUE, row.names=1)
fc2 <- as.data.frame(fc)
orig_names <- colnames(fc2[6:14])
colData_rna <- data.frame(row.names=orig_names, sample=c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A"), subtype=c("ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "ALK-ALCL", "ALK-ALCL"))
all(rownames(colData_rna) %in% colnames(fc2))
fc3 <- subset(fc2, select=-c(Chr, Start, End, Strand, Length))
colnames(fc3) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
```

Create DEseq object and save file 
Generate normalization factor with DEseq default
```{r}
dds_rna <- DESeqDataSetFromMatrix(countData=fc3, colData=colData_rna, design=~sample)
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
row.names(deseq_sizes_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
deseq_sizes_rna
saveRDS(dds_rna, file="RNA_dds.rds")
```

Generate norm counts files and save to local directory
```{r}
rawcounts_rna <- as.data.frame(counts(dds_rna, normalized=FALSE))
normcounts_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
colnames(rawcounts_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(normcounts_rna) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
#rownames(rawcounts_rna) <- sapply(strsplit(rownames(rawcounts_rna), split="[.]"), head, 1)
#rownames(normcounts_rna) <- sapply(strsplit(rownames(normcounts_rna), split="[.]"), head, 1)
write.table(rawcounts_rna, file="RNA_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_rna, file="RNA_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
```

Data transformation for plot generation
Top 500 contributors is the default for PCA 
Set 'ntop' to number of observations in the dds object
```{r}
rld_rna <- rlog(dds_rna, blind=TRUE)
sampleDistsrna <- dist(t(assay(rld_rna)))
sampleDistMtxrna <- as.matrix(sampleDistsrna)
rownames(sampleDistMtxrna) <- paste0(rld_rna$sample)
colnames(sampleDistMtxrna) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_rna, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq PCA") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 500 PCA
plotPCA(rld_rna, intgroup = "subtype") + ggtitle("RNA-seq PCA Top 500") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 1000
plotPCA(rld_rna, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq PCA Top 1000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 5000
plotPCA(rld_rna, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq PCA Top 5000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 10000
plotPCA(rld_rna, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq PCA Top 10000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 15000
plotPCA(rld_rna, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq PCA Top 15000") + scale_color_manual(breaks = c("ALK+ALCL","ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxrna, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="ALCL RNA Sample Distance Clustering")
plot(hclust(sampleDistsrna), labels=rownames(sampleDistMtxrna), main="ALCL RNA Sample Distance Dendrogram")
```

Assemble counts results
```{r}
avg_rna <- data.frame(EnsemblID=rownames(normcounts_rna))
avg_rna$KARPAS299 <- (normcounts_rna$KARPAS299)
avg_rna$L82 <- (normcounts_rna$L82)
avg_rna$SUPM2 <- (normcounts_rna$SUPM2)
avg_rna$SUDHL1 <- (normcounts_rna$SUDHL1)
avg_rna$KIJK <- (normcounts_rna$KIJK)
avg_rna$SR786 <- (normcounts_rna$SR786)
avg_rna$FEPD <- (normcounts_rna$FEPD)
avg_rna$DL40 <- (normcounts_rna$DL40)
avg_rna$MAC2A <- (normcounts_rna$MAC2A)

avg_rna$max <- apply(avg_rna[c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")], 1, max, na.rm=TRUE)
rel_rna <- data.frame(EnsemblID=avg_rna$EnsemblID)
rel_rna$KARPAS299 <- avg_rna$KARPAS299 / avg_rna$max
rel_rna$L82 <- avg_rna$L82 / avg_rna$max
rel_rna$SUPM2 <- avg_rna$SUPM2 / avg_rna$max
rel_rna$SUDHL1 <- avg_rna$SUDHL1 / avg_rna$max
rel_rna$KIJK <- avg_rna$KIJK / avg_rna$max
rel_rna$SR786 <- avg_rna$SR786 / avg_rna$max
rel_rna$FEPD <- avg_rna$FEPD / avg_rna$max
rel_rna$DL40 <- avg_rna$DL40 / avg_rna$max
rel_rna$MAC2A <- avg_rna$MAC2A / avg_rna$max
```

Data transform for variance assessment and sub-clustering
```{r}
rv1 <- rowVars(assay(rld_rna))
select1 <- order(rv1, decreasing=TRUE)
pc1 <- prcomp(t(assay(rld_rna)[select1,]))
loadings1 <- as.data.frame(pc1$rotation)
aload1 <- abs(loadings1)
pcRank <- data.frame("EnsemblID"=rownames(loadings1), "PCA.rank"=seq.int(nrow(loadings1)))
sweep(aload1, 2, colSums(aload1), "/")

peakVar <- data.frame("EnsemblID"=rownames(assay(rld_rna)))
peakVar$rlog.variance <- rowVars(as.matrix(assay(rld_rna)))
peakVar <- merge(pcRank, peakVar, by.x="EnsemblID", by.y="EnsemblID")
ggplot(peakVar, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("ALCL Variance per PC Contributor")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar <- peakVar[order(peakVar$PCA.rank),]
peakVar$cumulative.Variance <- cumsum(peakVar$rlog.variance)/sum(peakVar$rlog.variance)
ggplot(peakVar, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("ALCL Cumulative Variance per PC Rank")
```

PCA top contributors selection (5K to 20K)
```{r message=FALSE, warning=FALSE}
rv <- rowVars(assay(rld_rna))
select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))] 
#Changing the 1000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 1000 ALCL RNA PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top1000 <- rel_rna[(rel_rna$EnsemblID %in% rownames(aload)),]
write.table(top10000, file="Top1000.txt", sep='\t')

RNA_rel <- rel_rna[(rel_rna$EnsemblID %in% top1000$EnsemblID),]
saveRDS(RNA_rel, file="RNA_rel.rds")
colnames(RNA_rel) <- c("EnsemblID", "KARPAS299.gene", "L82.gene", "SUPM2.gene", "SUDHL1.gene", "KIJK.gene", "SR786.gene", "FEPD.gene", "DL40.gene", "MAC2A.gene")
head(top1000)
head(RNA_rel)
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
gap_stat <- clusGap(RNA_rel[2:10], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
fviz_nbclust(top10000[2:10], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
fviz_nbclust(top10000[2:10], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk2 <- clara(top1000[2:10],2,medoids.x=TRUE)
CLARAk3 <- clara(top1000[2:10],3,medoids.x=TRUE)
CLARAk4 <- clara(top1000[2:10],4,medoids.x=TRUE)
CLARAk5 <- clara(top1000[2:10],5,medoids.x=TRUE)
CLARAk6 <- clara(top1000[2:10],6,medoids.x=TRUE)
CLARAk7 <- clara(top1000[2:10],7,medoids.x=TRUE)
CLARAk8 <- clara(top1000[2:10],8,medoids.x=TRUE)

top1000$k2 <- CLARAk2$clustering
top1000$k3 <- CLARAk3$clustering
top1000$k4 <- CLARAk4$clustering
top1000$k5 <- CLARAk5$clustering
top1000$k6 <- CLARAk6$clustering
top1000$k7 <- CLARAk7$clustering
top1000$k8 <- CLARAk8$clustering


#Displaying Heatmaps
#k-means 20
pheatmap(top1000[order(top1000[, 10+1]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+1])
#k-means 26
pheatmap(top1000[order(top1000[, 10+2]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+2])
#k-means 30
pheatmap(top1000[order(top1000[, 10+3]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+3])
#k-means 36
pheatmap(top1000[order(top1000[, 10+4]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+4])
#k-means 15
pheatmap(top1000[order(top1000[, 10+5]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+5])
#k-means 25
pheatmap(top1000[order(top1000[, 10+6]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+6])
#k-means 22 
pheatmap(top1000[order(top1000[, 10+7]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1000[10+7])
```

--RNA END--

PRO

```{r}
#Prepare dREG peak saf for featureCounts
dREG <- read.table("ALCL_s0.5p0.025_filtered.bed")
colnames(dREG) <- c("chromosome", "start", "end", "dREGpeakID", "dREGscore", "strand", "centerStart",
                    "centerEnd")
write.table(dREG, file="dREG readtable.txt", sep="\t", quote=F, row.names=F)
dREGsaf <- data.frame("GeneID"=dREG$dREGpeakID, "Chr"=dREG$chromosome, "Start"=dREG$start, "End"=dREG$end, "Strand"="+")
write.table(dREGsaf, file="dREG.saf", sep="\t", quote=F, row.names=F)
#Load sample SAM file list
#profiles <- c("")

#Run feautreCounts
#fc_dREG <- featureCounts(profiles, annot.ext=dREGsaf, isGTFAnnotationFile=FALSE, minMQS=10, countChimericFragments=FALSE, isPairedEnd=FALSE, strandSpecific=0, nthreads=8)
fc_pro <- read.table("PRO_ALCL_dREGcounts", header=T, row.names=1)
fc_pro <- as.data.frame(fc_pro)
fc_pro <- subset(fc_pro, select=-c(Chr, Start, End, Strand, Length))
colnames(fc_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")

#Load sample metadata (can add as much sample metadata as desired including cell type, drug treatment, etc)
colData_pro <- data.frame(row.names=colnames(fc_pro$counts), sample=c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A"), subtype=c("ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "ALK-ALCL", "ALK-ALCL"))
all((colData_pro$sample) %in% colnames(fc_pro))

#Create DESeq object and generate normalized counts
dds_pro <- DESeqDataSetFromMatrix(countData=fc_pro, colData=colData_pro, design=~sample)
dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
variable.names(dds_pro)

keep <- rowMax(counts(dds_pro, normalize=TRUE)) >= 20
dds_pro <- dds_pro[keep,]
saveRDS(dds_pro, file="dREG_dds.rds")
dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds_pro),]
write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
            quote=FALSE)
```

Generate norm counts files and save to local directory
```{r}
rawcounts_pro <- as.data.frame(counts(dds_pro, normalized=FALSE))
normcounts_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(rawcounts_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(normcounts_pro) <- c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
write.table(rawcounts_pro, file="pro_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
```

Data transformation for plot generation
Top 500 contributors is the default for PCA 
Set 'ntop' to number of observations in the dds object
```{r}
rld_pro <- rlog(dds_pro, blind=TRUE)
sampleDistspro <- dist(t(assay(rld_pro)))
sampleDistMtxpro <- as.matrix(sampleDistspro)
rownames(sampleDistMtxpro) <- paste0(rld_pro$sample)
colnames(sampleDistMtxpro) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_pro, intgroup = "subtype", ntop=62744) + ggtitle("pro-seq PCA") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 500 PCA
plotPCA(rld_pro, intgroup = "subtype") + ggtitle("pro-seq PCA Top 500") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 1000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq PCA Top 1000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 5000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq PCA Top 5000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 10000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq PCA Top 10000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 15000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=15000) + ggtitle("pro-seq PCA Top 15000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 20000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=20000) + ggtitle("pro-seq PCA Top 20000") + scale_color_manual(breaks = c("ALK+ALCL", "ALK-ALCL"), values=c("chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxpro, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="ALCL Sample Distance Clustering")
plot(hclust(sampleDistspro), labels=rownames(sampleDistMtxpro), main="ALCL Sample Distance Dendrogram")
```

Assemble counts results
```{r}
avg_pro <- data.frame(peakID=rownames(normcounts_pro))
avg_pro$KARPAS299 <- (normcounts_pro$KARPAS299)
avg_pro$L82 <- (normcounts_pro$L82)
avg_pro$SUPM2 <- (normcounts_pro$SUPM2)
avg_pro$SUDHL1 <- (normcounts_pro$SUDHL1)
avg_pro$KIJK <- (normcounts_pro$KIJK)
avg_pro$SR786 <- (normcounts_pro$SR786)
avg_pro$FEPD <- (normcounts_pro$FEPD)
avg_pro$DL40 <- (normcounts_pro$DL40)
avg_pro$MAC2A <- (normcounts_pro$MAC2A)

avg_pro$max <- apply(avg_pro[c("KARPAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")], 1, max, na.rm=TRUE)
rel_pro <- data.frame(peakID=avg_pro$peakID)
rel_pro$KARPAS299 <- avg_pro$KARPAS299 / avg_pro$max
rel_pro$L82 <- avg_pro$L82 / avg_pro$max
rel_pro$SUPM2 <- avg_pro$SUPM2 / avg_pro$max
rel_pro$SUDHL1 <- avg_pro$SUDHL1 / avg_pro$max
rel_pro$KIJK <- avg_pro$KIJK / avg_pro$max
rel_pro$SR786 <- avg_pro$SR786 / avg_pro$max
rel_pro$FEPD <- avg_pro$FEPD / avg_pro$max
rel_pro$DL40 <- avg_pro$DL40 / avg_pro$max
rel_pro$MAC2A <- avg_pro$MAC2A / avg_pro$max

```

PCA top contributors selection (5000 tried first)
```{r message=FALSE, warning=FALSE}
dREG_annot <- read.table("ALCL_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)

PRO_rel <- rel_pro
#check this
colnames(PRO_rel) <- c("dREGpeakID", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "SUDHL1.enhancer", "KIJK.enhancer",  "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer",  "MAC2A.ehancer")
saveRDS(PRO_rel, file="pro_rel.rds") 
head(PRO_rel)

rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))] 
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top5000, file="Top5000.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")
ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
pro_5000_rel <- rel_pro[(rel_pro$peakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
gap_stat <- clusGap(pro_5000_rel[2:10], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
fviz_nbclust(pro_5000_rel[2:10], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
fviz_nbclust(pro_5000_rel[2:10], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk4 <- clara(pro_5000_rel[2:10],4,medoids.x=TRUE)
CLARAk5 <- clara(pro_5000_rel[2:10],5,medoids.x=TRUE)
CLARAk6 <- clara(pro_5000_rel[2:10],6,medoids.x=TRUE)
CLARAk7 <- clara(pro_5000_rel[2:10],7,medoids.x=TRUE)
CLARAk8 <- clara(pro_5000_rel[2:10],8,medoids.x=TRUE)

pro_5000_rel$k4 <- CLARAk4$clustering
pro_5000_rel$k5 <- CLARAk5$clustering
pro_5000_rel$k6 <- CLARAk6$clustering
pro_5000_rel$k7 <- CLARAk7$clustering
pro_5000_rel$k8 <- CLARAk8$clustering

#Displaying Heatmaps
#k-means 4
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+1]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+1])
#k-means 5
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+2]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+2])
#k-means 6
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+3]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+3])
#k-means 7
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+4]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+4])
#k-means 8
pheatmap(pro_5000_rel[order(pro_5000_rel[, 10+5]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[10+5])
```

--PRO END--

Combined PRO and RNA analysis

Load DESeq objects and annotations
```{r}
#dds_rna <- readRDS("RNA_dds.rds")
#dds_pro <- readRDS("dREG_dds.rds")
dREGann <- read.table("ALCL_p1000_m1000000_gene-centric_closest-dREGpeaks.txt", sep="\t", header=T)
write.table(deseq_sizes_pro, file="DEseq size factors.txt", sep="\t", quote=F, row.names=T)
```

Make prelim dataframes
```{r}
norm_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
rld_rna2 <- rlog(dds_rna, blind=TRUE)

colnames(norm_rna) <- dds_rna$sample #Simplified names from dds metadata

rel_rna2 <- data.frame("geneID"=rownames(norm_rna))
rel_rna2$max.gene.counts <- apply(norm_rna[1:ncol(norm_rna)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_rna)){
  rel_rna2[paste0(dds_rna$sample[i],".gene.rel")] <- norm_rna[,i]/rel_rna2$max.gene.counts
  i=i+1
}
#reorder
rel_rna2 <- rel_rna2[,c(1,3:11,2)]

norm_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))

colnames(norm_pro) <- dds_pro$sample #Simplified names from dds metadata

norm_pro <- norm_pro[names(norm_rna)] #reorder sample columns to match RNA-seq data

colnames(PRO_rel)[1] <- ("closest_peak")
#rel_pro2 <- data.frame("closest_peak"=rownames(norm_pro))
PRO_rel$max.dREG.counts <- apply(norm_pro[1:ncol(norm_pro)], 1, max, na.rm=TRUE)
#PRO_rel <- PRO_rel[,c(1,27,2:26)] #reorder columns
#i=1
#while (i<=ncol(norm_pro)){
#  rel_pro2[paste0(dds_pro$sample[i],".dREG.rel")] <- norm_pro[,i]/rel_pro2$max.dREG.counts
#  i=i+1
#}
```

Selecting gene PCA contributors by variance
```{r}
#Select top 1000 PCA contributors
rv <- rowVars(assay(rld_rna2))
select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))] #Changing the 500 here will select a different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna2)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
sweep(aload, 2, colSums(aload), "/")
 
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
 
#Merge top 500 elements with their annotations
top1k_rel_rna <- rel_rna2[(rel_rna2$geneID %in% rownames(aload)),] #filter rel gene dataframe for top 10k
```

Determining clusters (k=3)
```{r}
CLARA <- clara(top1k_rel_rna[2:10],3,medoids.x=TRUE)

top1k_rel_rna$k3 <- CLARA$clustering

pheatmap(top1k_rel_rna[order(top1k_rel_rna[,11]),][,2:10], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top1k_rel_rna[11])
```

Data integration
```{r}
gene_res <- rel_rna2[(!rel_rna2$geneID %in% top1k_rel_rna$geneID),] #filter out genes not in top 10k
gene_res$k3 <- NA #add value for genes not clustered
gene_res <- rbind(top1k_rel_rna, gene_res) #combine with top 10k genes with cluster designations
#gene_res$geneID <- sapply(strsplit(gene_res$geneID, split="[.]"), head, 1) #simplify geneIDs by removing version numbers

comb_res <- unique(merge(dREGann, gene_res, by.x="geneID", by.y="geneID", all.y=T))
colnames(comb_res)[1] <- "geneID" 
comb_res <- merge(comb_res, PRO_rel, by.x="closest_peak", by.y="closest_peak", all.x=T) #combine with dREG peak data, keep genes without a closest dREG peak call

comb_res <- comb_res[,c(2:9,13:23,1,10:12,24:33)] #reorder columns
```

Gene-to-Enhancer concordance analysis -- this generates a stat ranging from -1 (anti-correlated) to 0 (not correlated) to +1 (correlated) reported as Spearman rho
```{r message=F, warning=F}
#note this will produce a lot of errors for rows that don't have a gene and a an enhancer, so I've silenced them here
spear <- c()
i <- 1
while(i<=nrow(comb_res)){
  speari <- cor(as.numeric(factor(comb_res[i,9:17])),as.numeric(factor(comb_res[i,24:32])),method="spearman")
  spear <- c(spear, speari)
  i <- i+1
}

comb_res$Spearman.rho <- spear
```

Save output
```{r}
write.table(comb_res, file="ALCL.final.table.txt", sep="\t", quote=F, row.names=F)
dREGcountsann <- merge(PRO_rel, dREG_readFilt, by.x="closest_peak", by.y="dREGpeakID")
dREGcountsann <- dREGcountsann[,c(1,12:18,2:11)]
write.table(dREGcountsann, file="ALCL.dREGs.txt", sep="\t", quote=F, row.names=F)
```

Correct output to include metadata for genes without nearest enhancer calls
```{r}
library(dplyr)

#remove columns with annotation issue
comb_res_woEnh <- comb_res %>% filter(is.na(geneName))
comb_res_wEnh <- comb_res %>% filter(!is.na(geneName))

#Load original unfiltered annotations
comp_gtf <- read.table("Homo_sapiens.GRCh38.99.ucsc.gtf", sep="\t")
colnames(comp_gtf) <- c("chr", "source", "type", "start", "end", "score", "strand", "phase", "attributes")

#Function to extract metadata from gtf annotation
extract_attributes <- function(gtf_attributes, att_of_interest){
  att <- strsplit(as.character(gtf_attributes), "; ")
  att <- gsub("\"","",unlist(att))
  att <- gsub(";","",unlist(att))
  if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
    return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
  }else{
    return(NA)}
}

#Extract desired information from gtf (geneID, geneName, biotype)
comp_gene <- filter(comp_gtf, type == "gene")
comp_gene$geneID <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_id"))
comp_gene$geneName <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_name"))
comp_gene$biotype <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_biotype"))

#Remove offending columns
#geneName,biotype,attributes,chr,start,end,strand
comb_res_woEnh <- comb_res_woEnh[,c(1,9:34)]

#Add back information
geneInfo <- data.frame("geneID"=comp_gene$geneID, "geneName"=comp_gene$geneName, "biotype"=comp_gene$biotype, "attributes"=comp_gene$attributes, "chr"=comp_gene$chr, "start"=comp_gene$start, "end"=comp_gene$end, "strand"=comp_gene$strand)
comb_res_woEnh_corr <- merge(geneInfo, comb_res_woEnh, by.x="geneID", by.y="geneID", all.y=T)

#Modify geneName and peak_counts
comb_res_woEnh_corr$peak_count <- 0
comb_res_woEnh_corr$geneName <- paste0(comb_res_woEnh_corr$geneName)

#Correcting geneNames in original
comb_res_wEnh$geneName <- substring(comb_res_wEnh$geneName, 1)
comb_res_wEnh$geneName <- paste0(comb_res_wEnh$geneName)

#Combining it all together
comb_res_corr <- rbind(comb_res_woEnh_corr, comb_res_wEnh)

write.table(comb_res_corr, file="ALCL.combined.results.corrected.txt", sep="\t", quote=F, row.names=F)
```

Combining full annotation to plot based on genomic category
```{r}
filtered_res <- merge(dREG_annot, comb_res_corr, by.x="peakID", by.y="closest_peak", all.y=F)
write.table(filtered_res, file="Overlaping contribtors.txt", sep='\t', quote=F, row.names=F)
ggplot(filtered_res, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Overlapping PCA Contributors for Gene Expression and dREGpeak Annotations")
```


Combination completed, further investigation into clusters ongoing - 28JAN22