Document creation Libraries
library(DESeq2)
library(Rsubread)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ tibble 3.1.4 ✓ purrr 0.3.4
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.1 ✓ forcats 0.5.1
── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::collapse() masks IRanges::collapse()
x gridExtra::combine() masks dplyr::combine(), Biobase::combine(), BiocGenerics::combine()
x dplyr::count() masks matrixStats::count()
x dplyr::desc() masks IRanges::desc()
x tidyr::expand() masks S4Vectors::expand()
x plotly::filter() masks dplyr::filter(), stats::filter()
x dplyr::first() masks S4Vectors::first()
x dplyr::lag() masks stats::lag()
x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
x plotly::rename() masks dplyr::rename(), S4Vectors::rename()
x plotly::slice() masks dplyr::slice(), IRanges::slice()
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
library(ggpubr)
Attaching package: ‘ggpubr’
The following object is masked from ‘package:dendextend’:
rotate
library(cluster)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(amap)
library(NbClust)
library(dplyr)
Set working directory to local workspace Save R markup, featurecounts results and full gtf there RNA featurecounts generated with ‘Homo_sapiens.GRCh38.99.gtf.gz’ file
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/25 TCLs/MKJ R scripts")
getwd()
[1] "/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/25 TCLs/MKJ R scripts"
RNA
Load featurecounts results, configure as data frame object Setup metadata information and check naming consistency Remove non-counts columns in fc object
fc <- read.table("WholeGeneCounts_Weinstock002_RNA", sep="\t", header=TRUE, row.names=1)
fc2 <- as.data.frame(fc)
orig_names <- colnames(fc2[6:30])
colData_rna <- data.frame(row.names=orig_names, sample=c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2"), subtype=c("HS_TCL", "ALK-ALCL", "ALK-ALCL", "CTCL", "CTCL", "ATLL", "ALK+ALCL", "SQGD_TCL", "NKL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "CTCL", "T_LGL", "NKT", "CTCL", "NKT", "PTCL_NOS", "CTCL", "PTCL_NOS", "ALK+ALCL", "ATLL", "ATLL", "ALK+ALCL", "ALK+ALCL"))
all(rownames(colData_rna) %in% colnames(fc2))
[1] TRUE
fc3 <- subset(fc2, select=-c(Chr, Start, End, Strand, Length))
colnames(fc3) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")
Create DEseq object and save file Generate normalization factor with DEseq default
dds_rna <- DESeqDataSetFromMatrix(countData=fc3, colData=colData_rna, design=~sample)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
row.names(deseq_sizes_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")
deseq_sizes_rna
saveRDS(dds_rna, file="RNA_dds.rds")
Generate norm counts files and save to local directory
rawcounts_rna <- as.data.frame(counts(dds_rna, normalized=FALSE))
normcounts_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
colnames(rawcounts_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")
colnames(normcounts_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")
rownames(rawcounts_rna) <- sapply(strsplit(rownames(rawcounts_rna), split="[.]"), head, 1)
rownames(normcounts_rna) <- sapply(strsplit(rownames(normcounts_rna), split="[.]"), head, 1)
write.table(rawcounts_rna, file="RNA_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_rna, file="RNA_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
Data transformation for plot generation Top 500 contributors is the default for PCA Set ‘ntop’ to number of observations in the dds object
rld_rna <- rlog(dds_rna, blind=TRUE)
sampleDistsrna <- dist(t(assay(rld_rna)))
sampleDistMtxrna <- as.matrix(sampleDistsrna)
rownames(sampleDistMtxrna) <- paste0(rld_rna$sample)
colnames(sampleDistMtxrna) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_rna, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq PCA") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 500 PCA
plotPCA(rld_rna, intgroup = "subtype") + ggtitle("RNA-seq PCA Top 500") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 1000
plotPCA(rld_rna, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 5000
plotPCA(rld_rna, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 10000
plotPCA(rld_rna, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

#top 15000
plotPCA(rld_rna, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)

Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxrna, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="RNA Sample Distance Clustering")

plot(hclust(sampleDistsrna), labels=rownames(sampleDistMtxrna), main="RNA Sample Distance Dendrogram")

Assemble counts results
avg_rna <- data.frame(EnsemblID=rownames(normcounts_rna))
avg_rna$DERL2 <- (normcounts_rna$DERL2)
avg_rna$DL40 <- (normcounts_rna$DL40)
avg_rna$FEPD <- (normcounts_rna$FEPD)
avg_rna$HH <- (normcounts_rna$HH)
avg_rna$HUT78 <- (normcounts_rna$HUT78)
avg_rna$HUT102 <- (normcounts_rna$HUT102)
avg_rna$KARPAS299 <- (normcounts_rna$KARPAS299)
avg_rna$KARPAS384 <- (normcounts_rna$KARPAS384)
avg_rna$KHYG1 <- (normcounts_rna$KHYG1)
avg_rna$KIJK <- (normcounts_rna$KIJK)
avg_rna$L82 <- (normcounts_rna$L82)
avg_rna$MAC2A <- (normcounts_rna$MAC2A)
avg_rna$MJ <- (normcounts_rna$MJ)
avg_rna$MOTN1 <- (normcounts_rna$MOTN1)
avg_rna$MTA <- (normcounts_rna$MTA)
avg_rna$MYLA <- (normcounts_rna$MYLA)
avg_rna$NKL <- (normcounts_rna$NKL)
avg_rna$OCILY12 <- (normcounts_rna$OCILY12)
avg_rna$SeAx <- (normcounts_rna$SeAx)
avg_rna$SMZ1 <- (normcounts_rna$SMZ1)
avg_rna$SR786 <- (normcounts_rna$SR786)
avg_rna$ST1 <- (normcounts_rna$ST1)
avg_rna$Su9T01 <- (normcounts_rna$Su9T01)
avg_rna$SUDHL1 <- (normcounts_rna$SUDHL1)
avg_rna$SUPM2 <- (normcounts_rna$SUPM2)
avg_rna$max <- apply(avg_rna[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")], 1, max, na.rm=TRUE)
rel_rna <- data.frame(EnsemblID=avg_rna$EnsemblID)
rel_rna$DERL2 <- avg_rna$DERL2 / avg_rna$max
rel_rna$DL40 <- avg_rna$DL40 / avg_rna$max
rel_rna$FEPD <- avg_rna$FEPD / avg_rna$max
rel_rna$HH <- avg_rna$HH / avg_rna$max
rel_rna$HUT78 <- avg_rna$HUT78 / avg_rna$max
rel_rna$HUT102 <- avg_rna$HUT102 / avg_rna$max
rel_rna$KARPAS299 <- avg_rna$KARPAS299 / avg_rna$max
rel_rna$KARPAS384 <- avg_rna$KARPAS384 / avg_rna$max
rel_rna$KHYG1 <- avg_rna$KHYG1 / avg_rna$max
rel_rna$KIJK <- avg_rna$KIJK / avg_rna$max
rel_rna$L82 <- avg_rna$L82 / avg_rna$max
rel_rna$MAC2A <- avg_rna$MAC2A / avg_rna$max
rel_rna$MJ <- avg_rna$MJ / avg_rna$max
rel_rna$MOTN1 <- avg_rna$MOTN1 / avg_rna$max
rel_rna$MTA <- avg_rna$MTA / avg_rna$max
rel_rna$MYLA <- avg_rna$MYLA / avg_rna$max
rel_rna$NKL <- avg_rna$NKL / avg_rna$max
rel_rna$OCILY12 <- avg_rna$OCILY12 / avg_rna$max
rel_rna$SeAx <- avg_rna$SeAx / avg_rna$max
rel_rna$SMZ1 <- avg_rna$SMZ1 / avg_rna$max
rel_rna$SR786 <- avg_rna$SR786 / avg_rna$max
rel_rna$ST1 <- avg_rna$ST1 / avg_rna$max
rel_rna$Su9T01 <- avg_rna$Su9T01 / avg_rna$max
rel_rna$SUDHL1 <- avg_rna$SUDHL1 / avg_rna$max
rel_rna$SUPM2 <- avg_rna$SUPM2 / avg_rna$max
Data transform for variance assessment and sub-clustering
rv1 <- rowVars(assay(rld_rna))
select1 <- order(rv1, decreasing=TRUE)
pc1 <- prcomp(t(assay(rld_rna)[select1,]))
loadings1 <- as.data.frame(pc1$rotation)
aload1 <- abs(loadings1)
pcRank <- data.frame("EnsemblID"=rownames(loadings1), "PCA.rank"=seq.int(nrow(loadings1)))
sweep(aload1, 2, colSums(aload1), "/")
peakVar <- data.frame("EnsemblID"=rownames(assay(rld_rna)))
peakVar$rlog.variance <- rowVars(as.matrix(assay(rld_rna)))
peakVar <- merge(pcRank, peakVar, by.x="EnsemblID", by.y="EnsemblID")
ggplot(peakVar, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("Variance per PC Contributor")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar <- peakVar[order(peakVar$PCA.rank),]
peakVar$cumulative.Variance <- cumsum(peakVar$rlog.variance)/sum(peakVar$rlog.variance)
ggplot(peakVar, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("Cumulative Variance per PC Rank")

Removing Alk+ALCL to determine if PCA separation improves
dds_sub <- dds_rna[,-c(7,10,11,21,24,25)] #Excluding sample numbers of the ALK+ALCL
estimateSizeFactors(dds_sub) #Must re-calculate size factors after removing samples
class: DESeqDataSet
dim: 60676 19
metadata(1): version
assays(1): counts
rownames(60676): ENSG00000223972 ENSG00000227232 ... ENSG00000277475 ENSG00000268674
rowData names(0):
colnames(19):
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.DERL2.chrM.removed.bam
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.DL40.chrM.removed.bam ...
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.STI.chrM.removed.bam
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.Su9T01.chrM.removed.bam
colData names(3): sample subtype sizeFactor
dds_sub
class: DESeqDataSet
dim: 60676 19
metadata(1): version
assays(1): counts
rownames(60676): ENSG00000223972 ENSG00000227232 ... ENSG00000277475 ENSG00000268674
rowData names(0):
colnames(19):
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.DERL2.chrM.removed.bam
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.DL40.chrM.removed.bam ...
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.STI.chrM.removed.bam
X.n.scratch3.users.m.mkj13.RNA.mapping.Dedup.Mitoremoved.Su9T01.chrM.removed.bam
colData names(3): sample subtype sizeFactor
rld_sub <- rlog(dds_sub, blind=TRUE)
colnames(rld_sub) <- paste0(dds_sub$sample)
sampleDists_sub <- dist(t(assay(rld_sub)))
sampleDistMtx_sub <- as.matrix(sampleDists_sub)
rownames(sampleDistMtx_sub) <- colnames(rld_sub)
colnames(sampleDistMtx_sub) <- NULL
hclust_sub <- hclust(sampleDists_sub)
plot(hclust_sub, labels=rownames(sampleDistMtx_sub), main="RNA-seq subset Gene Dendrogram")

PCAs
plotPCA(rld_sub, intgroup = "subtype") + ggtitle("RNA-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

PCA top contributors selection (5K to 20K)
rv <- rowVars(assay(rld_rna))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))]
#Changing the 10000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 10000 RNA PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top10000 <- rel_rna[(rel_rna$EnsemblID %in% rownames(aload)),]
write.table(top10000, file="Top10000.txt", sep='\t')
RNA_rel <- rel_rna[(rel_rna$EnsemblID %in% top10000$EnsemblID),]
saveRDS(RNA_rel, file="RNA_rel.rds")
colnames(RNA_rel) <- c("EnsemblID", "DERL2.gene", "DL40.gene", "FEPD.gene", "HH.gene", "HUT78.gene", "HUT102.gene", "KARPAS299.gene", "KARPAS384.gene", "KHYG1.gene", "KIJK.gene", "L82.gene", "MAC2A.gene", "MJ.gene", "MOTN1.gene", "MTA.gene", "MYLA.gene", "NKL.gene", "OCILY12.gene", "SeAx.gene", "SMZ1.gene", "SR786.gene", "ST1.gene", "Su9T01.gene", "SUDHL1.gene", "SUPM2.gene")
head(top10000)
head(RNA_rel)
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk20 <- clara(top10000[2:26],20,medoids.x=TRUE)
CLARAk26 <- clara(top10000[2:26],26,medoids.x=TRUE)
CLARAk30 <- clara(top10000[2:26],30,medoids.x=TRUE)
CLARAk36 <- clara(top10000[2:26],36,medoids.x=TRUE)
CLARAk15 <- clara(top10000[2:26],15,medoids.x=TRUE)
CLARAk25 <- clara(top10000[2:26],25,medoids.x=TRUE)
CLARAk22 <- clara(top10000[2:26],22,medoids.x=TRUE)
top10000$k20 <- CLARAk20$clustering
top10000$k26 <- CLARAk26$clustering
top10000$k30 <- CLARAk30$clustering
top10000$k36 <- CLARAk36$clustering
top10000$k15 <- CLARAk15$clustering
top10000$k15 <- CLARAk15$clustering
top10000$k25 <- CLARAk25$clustering
top10000$k22 <- CLARAk22$clustering
#Displaying Heatmaps
#k-means 20
pheatmap(top10000[order(top10000[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+1])

#k-means 26
pheatmap(top10000[order(top10000[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+2])

#k-means 30
pheatmap(top10000[order(top10000[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+3])

#k-means 36
pheatmap(top10000[order(top10000[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+4])

#k-means 15
pheatmap(top10000[order(top10000[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+5])

#k-means 25
pheatmap(top10000[order(top10000[, 26+6]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+6])

#k-means 22
pheatmap(top10000[order(top10000[, 26+7]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+7])

–RNA END–
PRO
#Prepare dREG peak saf for featureCounts
dREG <- read.table("TCLs_s0.5p0.025_filtered.bed")
colnames(dREG) <- c("chromosome", "start", "end", "dREGpeakID", "dREGscore", "strand", "centerStart",
"centerEnd")
write.table(dREG, file="dREG readtable.txt", sep="\t", quote=F, row.names=F)
dREGsaf <- data.frame("GeneID"=dREG$dREGpeakID, "Chr"=dREG$chromosome, "Start"=dREG$start, "End"=dREG$end, "Strand"="+")
write.table(dREGsaf, file="dREG.saf", sep="\t", quote=F, row.names=F)
#Load sample SAM file list
#profiles <- c("")
#Run feautreCounts
#fc_dREG <- featureCounts(profiles, annot.ext=dREGsaf, isGTFAnnotationFile=FALSE, minMQS=10, countChimericFragments=FALSE, isPairedEnd=FALSE, strandSpecific=0, nthreads=8)
fc_pro <- read.table("dREGCounts_Weinstock002_pro", header=T, row.names=1)
fc_pro <- as.data.frame(fc_pro)
fc_pro <- subset(fc_pro, select=-c(Chr, Start, End, Strand, Length))
colnames(fc_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")
#Load sample metadata (can add as much sample metadata as desired including cell type, drug treatment, etc)
colData_pro <- data.frame(row.names=colnames(fc_pro$counts), sample=c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1"), subtype=c("ATLL", "ATLL", "ATLL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "PTCL_NOS", "CTCL", "ALK-ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "SQGD_TCL", "ALK-ALCL", "PTCL_NOS", "CTCL", "CTCL", "CTCL", "NKT", "T_LGL", "NKT", "HS_TCL", "NKL", "CTCL", "ALK+ALCL"))
all((colData_pro$sample) %in% colnames(fc_pro))
[1] TRUE
#Create DESeq object and generate normalized counts
dds_pro <- DESeqDataSetFromMatrix(countData=fc_pro, colData=colData_pro, design=~sample)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
variable.names(dds_pro)
[1] "HUT102" "ST1" "Su9T01" "KARPAS299" "L82" "SUPM2" "SMZ1" "MYLA"
[9] "MAC2A" "KIJK" "SR786" "FEPD" "KARPAS384" "DL40" "OCILY12" "HUT78"
[17] "HH" "MJ" "MTA" "MOTN1" "NKL" "DERL2" "KHYG1" "SeAx"
[25] "SUDHL1"
keep <- rowMax(counts(dds_pro, normalize=TRUE)) >= 25
dds_pro <- dds_pro[keep,]
saveRDS(dds_pro, file="dREG_dds.rds")
dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds_pro),]
write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
quote=FALSE)
Generate norm counts files and save to local directory
rawcounts_pro <- as.data.frame(counts(dds_pro, normalized=FALSE))
normcounts_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(rawcounts_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")
colnames(normcounts_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")
write.table(rawcounts_pro, file="pro_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
Data transformation for plot generation Top 500 contributors is the default for PCA Set ‘ntop’ to number of observations in the dds object
rld_pro <- rlog(dds_pro, blind=TRUE)
sampleDistspro <- dist(t(assay(rld_pro)))
sampleDistMtxpro <- as.matrix(sampleDistspro)
rownames(sampleDistMtxpro) <- paste0(rld_pro$sample)
colnames(sampleDistMtxpro) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_pro, intgroup = "subtype", ntop=120888) + ggtitle("pro-seq PCA") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 500 PCA
plotPCA(rld_pro, intgroup = "subtype") + ggtitle("pro-seq PCA Top 500") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 1000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 5000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 10000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 15000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=15000) + ggtitle("pro-seq PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

#top 20000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=20000) + ggtitle("pro-seq PCA Top 20000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)

Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxpro, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="pro Sample Distance Clustering")

plot(hclust(sampleDistspro), labels=rownames(sampleDistMtxpro), main="pro Sample Distance Dendrogram")

Assemble counts results
avg_pro <- data.frame(peakID=rownames(normcounts_pro))
avg_pro$DERL2 <- (normcounts_pro$DERL2)
avg_pro$DL40 <- (normcounts_pro$DL40)
avg_pro$FEPD <- (normcounts_pro$FEPD)
avg_pro$HH <- (normcounts_pro$HH)
avg_pro$HUT78 <- (normcounts_pro$HUT78)
avg_pro$HUT102 <- (normcounts_pro$HUT102)
avg_pro$KARPAS299 <- (normcounts_pro$KARPAS299)
avg_pro$KARPAS384 <- (normcounts_pro$KARPAS384)
avg_pro$KHYG1 <- (normcounts_pro$KHYG1)
avg_pro$KIJK <- (normcounts_pro$KIJK)
avg_pro$L82 <- (normcounts_pro$L82)
avg_pro$MAC2A <- (normcounts_pro$MAC2A)
avg_pro$MJ <- (normcounts_pro$MJ)
avg_pro$MOTN1 <- (normcounts_pro$MOTN1)
avg_pro$MTA <- (normcounts_pro$MTA)
avg_pro$MYLA <- (normcounts_pro$MYLA)
avg_pro$NKL <- (normcounts_pro$NKL)
avg_pro$OCILY12 <- (normcounts_pro$OCILY12)
avg_pro$SeAx <- (normcounts_pro$SeAx)
avg_pro$SMZ1 <- (normcounts_pro$SMZ1)
avg_pro$SR786 <- (normcounts_pro$SR786)
avg_pro$ST1 <- (normcounts_pro$ST1)
avg_pro$Su9T01 <- (normcounts_pro$Su9T01)
avg_pro$SUDHL1 <- (normcounts_pro$SUDHL1)
avg_pro$SUPM2 <- (normcounts_pro$SUPM2)
avg_pro$max <- apply(avg_pro[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1", "Su9T01", "SUDHL1", "SUPM2")], 1, max, na.rm=TRUE)
rel_pro <- data.frame(peakID=avg_pro$peakID)
rel_pro$DERL2 <- avg_pro$DERL2 / avg_pro$max
rel_pro$DL40 <- avg_pro$DL40 / avg_pro$max
rel_pro$FEPD <- avg_pro$FEPD / avg_pro$max
rel_pro$HH <- avg_pro$HH / avg_pro$max
rel_pro$HUT78 <- avg_pro$HUT78 / avg_pro$max
rel_pro$HUT102 <- avg_pro$HUT102 / avg_pro$max
rel_pro$KARPAS299 <- avg_pro$KARPAS299 / avg_pro$max
rel_pro$KARPAS384 <- avg_pro$KARPAS384 / avg_pro$max
rel_pro$KHYG1 <- avg_pro$KHYG1 / avg_pro$max
rel_pro$KIJK <- avg_pro$KIJK / avg_pro$max
rel_pro$L82 <- avg_pro$L82 / avg_pro$max
rel_pro$MAC2A <- avg_pro$MAC2A / avg_pro$max
rel_pro$MJ <- avg_pro$MJ / avg_pro$max
rel_pro$MOTN1 <- avg_pro$MOTN1 / avg_pro$max
rel_pro$MTA <- avg_pro$MTA / avg_pro$max
rel_pro$MYLA <- avg_pro$MYLA / avg_pro$max
rel_pro$NKL <- avg_pro$NKL / avg_pro$max
rel_pro$OCILY12 <- avg_pro$OCILY12 / avg_pro$max
rel_pro$SeAx <- avg_pro$SeAx / avg_pro$max
rel_pro$SMZ1 <- avg_pro$SMZ1 / avg_pro$max
rel_pro$SR786 <- avg_pro$SR786 / avg_pro$max
rel_pro$ST1 <- avg_pro$ST1 / avg_pro$max
rel_pro$Su9T01 <- avg_pro$Su9T01 / avg_pro$max
rel_pro$SUDHL1 <- avg_pro$SUDHL1 / avg_pro$max
rel_pro$SUPM2 <- avg_pro$SUPM2 / avg_pro$max
PCA top contributors selection (5000 tried first)
dREG_annot <- read.table("002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
PRO_rel <- rel_pro
#check this
colnames(PRO_rel) <- c("dREGpeakID", "DERL2.enhancer", "DL40.enhancer", "FEPD.enhancer", "HH.enhancer", "HUT78.ehancer", "HUT102.enhancer", "KARPAS299.enhancer", "KARPAS384.enhancer", "KHYG1.enhancer", "KIJK.enhancer", "L82.enhancer", "MAC2A.ehancer", "MJ.enhancer", "MOTN1.enhancer", "MTA.enhancer", "MYLA.enhancer", "NKL.enhancer", "OCILY12.enhancer", "SeAx.ehancer", "SMZ1.enhancer", "SR786.enhancer", "ST1.enhancer", "Su9T01.enhancer", "SUDHL1.enhancer", "SUPM2.enhancer")
saveRDS(PRO_rel, file="pro_rel.rds")
head(PRO_rel)
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))]
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top5000, file="Top5000.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")

ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
pro_5000_rel <- rel_pro[(rel_pro$peakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_5000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk4 <- clara(pro_5000_rel[2:26],4,medoids.x=TRUE)
CLARAk5 <- clara(pro_5000_rel[2:26],5,medoids.x=TRUE)
CLARAk6 <- clara(pro_5000_rel[2:26],6,medoids.x=TRUE)
CLARAk7 <- clara(pro_5000_rel[2:26],7,medoids.x=TRUE)
CLARAk8 <- clara(pro_5000_rel[2:26],8,medoids.x=TRUE)
pro_5000_rel$k4 <- CLARAk4$clustering
pro_5000_rel$k5 <- CLARAk5$clustering
pro_5000_rel$k6 <- CLARAk6$clustering
pro_5000_rel$k7 <- CLARAk7$clustering
pro_5000_rel$k8 <- CLARAk8$clustering
#Displaying Heatmaps
#k-means 4
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+1])

#k-means 5
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+2])

#k-means 6
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+3])

#k-means 7
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+4])

#k-means 8
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+5])

PCA top contributors (10K)
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))]
#Changing the 10000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 10000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top10000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top10000, file="Top10000.txt", sep='\t')
ggplot(top10000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 10000 PCA Contributors dREGpeak Annotations")

ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_10000_rel <- rel_pro[(rel_pro$peakID %in% top10000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_10000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_10000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_10000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk5 <- clara(PRO_10000_rel[2:26],5,medoids.x=TRUE)
CLARAk8 <- clara(PRO_10000_rel[2:26],8,medoids.x=TRUE)
CLARAk10 <- clara(PRO_10000_rel[2:26],10,medoids.x=TRUE)
CLARAk13 <- clara(PRO_10000_rel[2:26],13,medoids.x=TRUE)
CLARAk15 <- clara(PRO_10000_rel[2:26],15,medoids.x=TRUE)
CLARAk6 <- clara(PRO_10000_rel[2:26],6,medoids.x=TRUE)
CLARAk7 <- clara(PRO_10000_rel[2:26],7,medoids.x=TRUE)
CLARAk4 <- clara(PRO_10000_rel[2:26],4,medoids.x=TRUE)
PRO_10000_rel$k5 <- CLARAk5$clustering
PRO_10000_rel$k8 <- CLARAk8$clustering
PRO_10000_rel$k10 <- CLARAk10$clustering
PRO_10000_rel$k13 <- CLARAk13$clustering
PRO_10000_rel$k15 <- CLARAk15$clustering
PRO_10000_rel$k6 <- CLARAk6$clustering
PRO_10000_rel$k7 <- CLARAk7$clustering
PRO_10000_rel$k4 <- CLARAk4$clustering
#Displaying Heatmaps
#k-means 5
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+1])

#k-means 8
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+2])

#k-means 10
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+3])

#k-means 13
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+4])

#k-means 15
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+5])

#k-means 6
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+6]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+6])

#k-means 7
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+7]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+7])

#k-means 4
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+8]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+8])

Separating by disease subset and re-clustering
dds_pro_sub <- dds_pro[,-c(7,13,15,19,20,21,22,23)] #Excluding sample numbers of entities that are not Alk+Alcl, Alk-Alcl, Atll, and Ctcl
estimateSizeFactors(dds_pro_sub) #Must re-calculate size factors after removing samples
class: DESeqDataSet
dim: 120888 17
metadata(1): version
assays(1): counts
rownames(120888): dREGpeak000003 dREGpeak000004 ... dREGpeak312454 dREGpeak312455
rowData names(0):
colnames(17): HUT102 ST1 ... SeAx SUDHL1
colData names(3): sample subtype sizeFactor
deseq_sizes_pro_sub <- as.data.frame(sizeFactors(dds_pro_sub))
deseq_sizes_pro_sub
rld_pro_sub <- rlog(dds_pro_sub, blind=TRUE)
colnames(rld_pro_sub) <- paste0(dds_pro_sub$sample)
sampleDists_pro_sub <- dist(t(assay(rld_pro_sub)))
sampleDistMtx_pro_sub <- as.matrix(sampleDists_pro_sub)
rownames(sampleDistMtx_pro_sub) <- colnames(rld_pro_sub)
colnames(sampleDistMtx_pro_sub) <- NULL
hclust_pro_sub <- hclust(sampleDists_pro_sub)
plot(hclust_pro_sub, labels=rownames(sampleDistMtx_pro_sub), main="Largest Entity dREG Dendrogram")

pro subset PCAs
#plotPCA(rld_sub, intgroup = "subtype") + ggtitle("RNA-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
plotPCA(rld_pro_sub, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

plotPCA(rld_pro_sub, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

plotPCA(rld_pro_sub, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

#plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
#plotPCA(rld_sub, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
Generate norm counts file
normcounts_pro_sub <- as.data.frame(counts(dds_pro_sub, normalized=TRUE))
colnames(normcounts_pro_sub) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "DL40", "HUT78", "HH", "MJ", "SeAx", "SUDHL1")
Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtx_pro_sub, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="pro subset Sample Distance Clustering")

plot(hclust(sampleDists_pro_sub), labels=rownames(sampleDistMtx_pro_sub), main="pro subset Sample Distance Dendrogram")

Assemble subset counts and average
avg_pro_sub <- data.frame(peakID=rownames(normcounts_pro_sub))
avg_pro_sub$HUT102 <- (normcounts_pro_sub$HUT102)
avg_pro_sub$ST1 <- (normcounts_pro_sub$ST1)
avg_pro_sub$Su9T01 <- (normcounts_pro_sub$Su9T01)
avg_pro_sub$KARPAS299 <- (normcounts_pro_sub$KARPAS299)
avg_pro_sub$L82 <- (normcounts_pro_sub$L82)
avg_pro_sub$SUPM2 <- (normcounts_pro_sub$SUPM2)
avg_pro_sub$MYLA <- (normcounts_pro_sub$MYLA)
avg_pro_sub$MAC2A <- (normcounts_pro_sub$MAC2A)
avg_pro_sub$KIJK <- (normcounts_pro_sub$KIJK)
avg_pro_sub$SR786 <- (normcounts_pro_sub$SR786)
avg_pro_sub$FEPD <- (normcounts_pro_sub$FEPD)
avg_pro_sub$DL40 <- (normcounts_pro_sub$DL40)
avg_pro_sub$HUT78 <- (normcounts_pro_sub$HUT78)
avg_pro_sub$HH <- (normcounts_pro_sub$HH)
avg_pro_sub$MJ <- (normcounts_pro_sub$MJ)
avg_pro_sub$SeAx <- (normcounts_pro_sub$SeAx)
avg_pro_sub$SUDHL1 <- (normcounts_pro_sub$SUDHL1)
avg_pro_sub$max <- apply(avg_pro_sub[c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "DL40", "HUT78", "HH", "MJ", "SeAx", "SUDHL1")], 1, max, na.rm=TRUE)
rel_pro_sub <- data.frame(peakID=avg_pro_sub$peakID)
rel_pro_sub$HUT102 <- avg_pro_sub$HUT102 / avg_pro_sub$max
rel_pro_sub$ST1 <- avg_pro_sub$ST1 / avg_pro_sub$max
rel_pro_sub$Su9T01 <- avg_pro_sub$Su9T01 / avg_pro_sub$max
rel_pro_sub$KARPAS299 <- avg_pro_sub$KARPAS299 / avg_pro_sub$max
rel_pro_sub$L82 <- avg_pro_sub$L82 / avg_pro_sub$max
rel_pro_sub$SUPM2 <- avg_pro_sub$SUPM2 / avg_pro_sub$max
rel_pro_sub$MYLA <- avg_pro_sub$MYLA / avg_pro_sub$max
rel_pro_sub$MAC2A <- avg_pro_sub$MAC2A / avg_pro_sub$max
rel_pro_sub$KIJK <- avg_pro_sub$KIJK / avg_pro_sub$max
rel_pro_sub$SR786 <- avg_pro_sub$SR786 / avg_pro_sub$max
rel_pro_sub$FEPD <- avg_pro_sub$FEPD / avg_pro_sub$max
rel_pro_sub$DL40 <- avg_pro_sub$DL40 / avg_pro_sub$max
rel_pro_sub$HUT78 <- avg_pro_sub$HUT78 / avg_pro_sub$max
rel_pro_sub$HH <- avg_pro_sub$HH / avg_pro_sub$max
rel_pro_sub$MJ <- avg_pro_sub$MJ / avg_pro_sub$max
rel_pro_sub$SeAx <- avg_pro_sub$SeAx / avg_pro_sub$max
rel_pro_sub$SUDHL1 <- avg_pro_sub$SUDHL1 / avg_pro_sub$max
PCA top contributors selection (5000)
#check this
colnames(rel_pro_sub) <- c("dREGpeakID", "HUT102.enhancer", "ST1.enhancer", "Su9T01.enhancer", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "MYLA.enhancer", "MAC2A.ehancer", "KIJK.enhancer", "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer", "HUT78.ehancer", "HH.enhancer", "MJ.enhancer", "SeAx.ehancer", "SUDHL1.enhancer")
rv2 <- rowVars(assay(rld_pro_sub))
select2 <- order(rv2, decreasing=TRUE)[seq_len(min(5000, length(rv2)))]
#Changing the 5000 here will select a Different number of top contributors if desired
pc2 <- prcomp(t(assay(rld_pro_sub)[select2,]))
loadings2 <- as.data.frame(pc2$rotation)
aload2 <- abs(loadings2)
write.table(sweep(aload, 2, colSums(aload2), "/"), file="Top 5000 pro sub PCA Contributors.txt")
##Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload2)),]
write.table(top5000, file="Top5000_pro_sub.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA subset Contributors dREGpeak Annotations")

ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_5000_rel <- rel_pro_sub[(rel_pro_sub$dREGpeakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk15 <- clara(PRO_5000_rel[2:18],15,medoids.x=TRUE)
CLARAk20 <- clara(PRO_5000_rel[2:18],20,medoids.x=TRUE)
CLARAk22 <- clara(PRO_5000_rel[2:18],22,medoids.x=TRUE)
CLARAk25 <- clara(PRO_5000_rel[2:18],25,medoids.x=TRUE)
CLARAk26 <- clara(PRO_5000_rel[2:18],26,medoids.x=TRUE)
CLARAk30 <- clara(PRO_5000_rel[2:18],30,medoids.x=TRUE)
CLARAk36 <- clara(PRO_5000_rel[2:18],36,medoids.x=TRUE)
CLARAk4 <- clara(PRO_5000_rel[2:18],4,medoids.x=TRUE)
CLARAk6 <- clara(PRO_5000_rel[2:18],6,medoids.x=TRUE)
CLARAk10 <- clara(PRO_5000_rel[2:18],10,medoids.x=TRUE)
CLARAk3 <- clara(PRO_5000_rel[2:18],3,medoids.x=TRUE)
CLARAk5 <- clara(PRO_5000_rel[2:18],5,medoids.x=TRUE)
PRO_5000_rel$k15 <- CLARAk15$clustering
PRO_5000_rel$k20 <- CLARAk20$clustering
PRO_5000_rel$k22 <- CLARAk22$clustering
PRO_5000_rel$k25 <- CLARAk25$clustering
PRO_5000_rel$k26 <- CLARAk26$clustering
PRO_5000_rel$k30 <- CLARAk30$clustering
PRO_5000_rel$k36 <- CLARAk36$clustering
PRO_5000_rel$k4 <- CLARAk4$clustering
PRO_5000_rel$k6 <- CLARAk6$clustering
PRO_5000_rel$k10 <- CLARAk10$clustering
PRO_5000_rel$k3 <- CLARAk3$clustering
PRO_5000_rel$k5 <- CLARAk5$clustering
#Displaying Heatmaps
#k-means 15
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+1]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+1])

#k-means 20
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+2]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+2])

#k-means 22
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+3]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+3])

#k-means 25
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+4]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+4])

#k-means 26
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+5]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+5])

#k-means 30
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+6]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+6])

#k-means 36
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+7]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+7])

#k-means 4
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+8]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+8])

#k-means 6
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+9]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+9])

#k-means 10
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+10]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+10])

#k-means 3
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+11]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+11])

#k-means 5
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+12]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+12])

PCA top contributors (1000)
#check this
colnames(rel_pro_sub) <- c("dREGpeakID", "HUT102.enhancer", "ST1.enhancer", "Su9T01.enhancer", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "MYLA.enhancer", "MAC2A.ehancer", "KIJK.enhancer", "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer", "HUT78.ehancer", "HH.enhancer", "MJ.enhancer", "SeAx.ehancer", "SUDHL1.enhancer")
rv3 <- rowVars(assay(rld_pro_sub))
select3 <- order(rv3, decreasing=TRUE)[seq_len(min(1000, length(rv3)))]
#Changing the 1000 here will select a Different number of top contributors if desired
pc3 <- prcomp(t(assay(rld_pro_sub)[select3,]))
loadings3 <- as.data.frame(pc3$rotation)
aload3 <- abs(loadings3)
write.table(sweep(aload3, 2, colSums(aload3), "/"), file="Top 1000 pro sub PCA Contributors.txt")
##Merge top 1000 elements with their annotations
top1000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload3)),]
write.table(top1000, file="Top1000_pro_sub.txt", sep='\t')
ggplot(top1000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA subset Contributors dREGpeak Annotations")

ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_1000_rel <- rel_pro_sub[(rel_pro_sub$dREGpeakID %in% top1000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk3 <- clara(PRO_1000_rel[2:18],3,medoids.x=TRUE)
CLARAk4 <- clara(PRO_1000_rel[2:18],4,medoids.x=TRUE)
CLARAk5 <- clara(PRO_1000_rel[2:18],5,medoids.x=TRUE)
CLARAk6 <- clara(PRO_1000_rel[2:18],6,medoids.x=TRUE)
CLARAk7 <- clara(PRO_1000_rel[2:18],7,medoids.x=TRUE)
CLARAk8 <- clara(PRO_1000_rel[2:18],8,medoids.x=TRUE)
PRO_1000_rel$k3 <- CLARAk3$clustering
PRO_1000_rel$k4 <- CLARAk4$clustering
PRO_1000_rel$k5 <- CLARAk5$clustering
PRO_1000_rel$k6 <- CLARAk6$clustering
PRO_1000_rel$k7 <- CLARAk7$clustering
PRO_1000_rel$k8 <- CLARAk8$clustering
#Displaying Heatmaps
#k-means 3
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+1]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+1])

#k-means 4
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+2]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+2])

#k-means 5
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+3]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+3])

#k-means 6
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+4]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+4])

#k-means 7
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+5]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+5])

#k-means 8
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+6]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+6])

Removing Alk+ALCL to determine if PCA separation improves
dds_sub <- dds_pro[,-c(4,5,6,10,11,25)] #Excluding sample numbers of the ALK+ALCL
estimateSizeFactors(dds_sub) #Must re-calculate size factors after removing samples
class: DESeqDataSet
dim: 120888 19
metadata(1): version
assays(1): counts
rownames(120888): dREGpeak000003 dREGpeak000004 ... dREGpeak312454 dREGpeak312455
rowData names(0):
colnames(19): HUT102 ST1 ... KHYG1 SeAx
colData names(3): sample subtype sizeFactor
dds_sub
class: DESeqDataSet
dim: 120888 19
metadata(1): version
assays(1): counts
rownames(120888): dREGpeak000003 dREGpeak000004 ... dREGpeak312454 dREGpeak312455
rowData names(0):
colnames(19): HUT102 ST1 ... KHYG1 SeAx
colData names(3): sample subtype sizeFactor
rld_sub <- rlog(dds_sub, blind=TRUE)
colnames(rld_sub) <- paste0(dds_sub$sample)
sampleDists_sub <- dist(t(assay(rld_sub)))
sampleDistMtx_sub <- as.matrix(sampleDists_sub)
rownames(sampleDistMtx_sub) <- colnames(rld_sub)
colnames(sampleDistMtx_sub) <- NULL
hclust_sub <- hclust(sampleDists_sub)
plot(hclust_sub, labels=rownames(sampleDistMtx_sub), main="PRO-seq subset Gene Dendrogram")

PCAs
plotPCA(rld_sub, intgroup = "subtype") + ggtitle("PRO-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=1000) + ggtitle("PRO-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=5000) + ggtitle("PRO-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=10000) + ggtitle("PRO-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("PRO-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=120888) + ggtitle("PRO-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

Additional dendrogram plots
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtx_sub, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
cluster_rows=TRUE, cluster_cols=TRUE, main="PRO subset Sample Distance Clustering")

plot(hclust(sampleDists_sub), labels=rownames(sampleDistMtx_sub), main="PRO subset Sample Distance Dendrogram")

Generate norm counts file
normcounts_sub <- as.data.frame(counts(dds_sub, normalized=TRUE))
colnames(normcounts_sub) <- c("HUT102", "ST1", "Su9T01", "SMZ1", "MYLA", "MAC2A", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx")
#write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
Assemble counts results
avg_sub <- data.frame(peakID=rownames(normcounts_sub))
avg_sub$DERL2 <- (normcounts_sub$DERL2)
avg_sub$DL40 <- (normcounts_sub$DL40)
avg_sub$FEPD <- (normcounts_sub$FEPD)
avg_sub$HH <- (normcounts_sub$HH)
avg_sub$HUT78 <- (normcounts_sub$HUT78)
avg_sub$HUT102 <- (normcounts_sub$HUT102)
avg_sub$KARPAS384 <- (normcounts_sub$KARPAS384)
avg_sub$KHYG1 <- (normcounts_sub$KHYG1)
avg_sub$MAC2A <- (normcounts_sub$MAC2A)
avg_sub$MJ <- (normcounts_sub$MJ)
avg_sub$MOTN1 <- (normcounts_sub$MOTN1)
avg_sub$MTA <- (normcounts_sub$MTA)
avg_sub$MYLA <- (normcounts_sub$MYLA)
avg_sub$NKL <- (normcounts_sub$NKL)
avg_sub$OCILY12 <- (normcounts_sub$OCILY12)
avg_sub$SeAx <- (normcounts_sub$SeAx)
avg_sub$SMZ1 <- (normcounts_sub$SMZ1)
avg_sub$ST1 <- (normcounts_sub$ST1)
avg_sub$Su9T01 <- (normcounts_sub$Su9T01)
avg_sub$max <- apply(avg_sub[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS384", "KHYG1", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "ST1", "Su9T01")], 1, max, na.rm=TRUE)
rel_sub <- data.frame(peakID=avg_sub$peakID)
rel_sub$DERL2 <- avg_sub$DERL2 / avg_sub$max
rel_sub$DL40 <- avg_sub$DL40 / avg_sub$max
rel_sub$FEPD <- avg_sub$FEPD / avg_sub$max
rel_sub$HH <- avg_sub$HH / avg_sub$max
rel_sub$HUT78 <- avg_sub$HUT78 / avg_sub$max
rel_sub$HUT102 <- avg_sub$HUT102 / avg_sub$max
rel_sub$KARPAS384 <- avg_sub$KARPAS384 / avg_sub$max
rel_sub$KHYG1 <- avg_sub$KHYG1 / avg_sub$max
rel_sub$MAC2A <- avg_sub$MAC2A / avg_sub$max
rel_sub$MJ <- avg_sub$MJ / avg_sub$max
rel_sub$MOTN1 <- avg_sub$MOTN1 / avg_sub$max
rel_sub$MTA <- avg_sub$MTA / avg_sub$max
rel_sub$MYLA <- avg_sub$MYLA / avg_sub$max
rel_sub$NKL <- avg_sub$NKL / avg_sub$max
rel_sub$OCILY12 <- avg_sub$OCILY12 / avg_sub$max
rel_sub$SeAx <- avg_sub$SeAx / avg_sub$max
rel_sub$SMZ1 <- avg_sub$SMZ1 / avg_sub$max
rel_sub$ST1 <- avg_sub$ST1 / avg_sub$max
rel_sub$Su9T01 <- avg_sub$Su9T01 / avg_sub$max
PCA top contributors selection (5000 tried first)
#check this
colnames(rel_sub) <- c("dREGpeakID", "DERL2.enhancer", "DL40.enhancer", "FEPD.enhancer", "HH.enhancer", "HUT78.ehancer", "HUT102.enhancer", "KARPAS384.enhancer", "KHYG1.enhancer", "MAC2A.ehancer", "MJ.enhancer", "MOTN1.enhancer", "MTA.enhancer", "MYLA.enhancer", "NKL.enhancer", "OCILY12.enhancer", "SeAx.ehancer", "SMZ1.enhancer", "ST1.enhancer", "Su9T01.enhancer")
rv <- rowVars(assay(rld_sub))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))]
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_sub)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro subset PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
#write.table(top5000, file="Top5000.txt", sep='\t')
#ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")
#ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")
#filter results down by top contributors
sub_5000_rel <- rel_sub[(rel_sub$dREGpeakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42)
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_5000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50)
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk4 <- clara(sub_5000_rel[2:20],4,medoids.x=TRUE)
CLARAk5 <- clara(sub_5000_rel[2:20],5,medoids.x=TRUE)
CLARAk6 <- clara(sub_5000_rel[2:20],6,medoids.x=TRUE)
CLARAk7 <- clara(sub_5000_rel[2:20],7,medoids.x=TRUE)
CLARAk8 <- clara(sub_5000_rel[2:20],8,medoids.x=TRUE)
sub_5000_rel$k4 <- CLARAk4$clustering
sub_5000_rel$k5 <- CLARAk5$clustering
sub_5000_rel$k6 <- CLARAk6$clustering
sub_5000_rel$k7 <- CLARAk7$clustering
sub_5000_rel$k8 <- CLARAk8$clustering
#Displaying Heatmaps
#k-means 4
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+1]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+1])

#k-means 5
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+2]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+2])

#k-means 6
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+3]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+3])

#k-means 7
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+4]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+4])

#k-means 8
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+5]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+5])

–PRO END–
Combined PRO and RNA analysis
Load DESeq objects and annotations
#dds_rna <- readRDS("RNA_dds.rds")
#dds_pro <- readRDS("dREG_dds.rds")
dREGann <- read.table("002_TCLs_p1000_m1000000_gene-centric_closest-dREGpeaks.txt", sep="\t", header=T)
write.table(deseq_sizes_pro, file="DEseq size factors.txt", sep="\t", quote=F, row.names=T)
Make prelim dataframes
norm_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
rld_rna2 <- rlog(dds_rna, blind=TRUE)
colnames(norm_rna) <- dds_rna$sample #Simplified names from dds metadata
rel_rna2 <- data.frame("geneID"=rownames(norm_rna))
rel_rna2$max.gene.counts <- apply(norm_rna[1:ncol(norm_rna)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_rna)){
rel_rna2[paste0(dds_rna$sample[i],".gene.rel")] <- norm_rna[,i]/rel_rna2$max.gene.counts
i=i+1
}
norm_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(norm_pro) <- dds_pro$sample #Simplified names from dds metadata
colnames(norm_pro)[2] <- "ST1" #Think there was a typo in this sample name
colnames(norm_rna)[7] <- "KARPAS299"
colnames(norm_rna)[8] <- "KARPAS384"
norm_pro <- norm_pro[names(norm_rna)] #reorder sample columns to match RNA-seq data
colnames(PRO_rel)[1] <- ("closest_peak")
#rel_pro2 <- data.frame("closest_peak"=rownames(norm_pro))
PRO_rel$max.dREG.counts <- apply(norm_pro[1:ncol(norm_pro)], 1, max, na.rm=TRUE)
PRO_rel <- PRO_rel[,c(1,27,2:26)] #reorder columns
#i=1
#while (i<=ncol(norm_pro)){
# rel_pro2[paste0(dds_pro$sample[i],".dREG.rel")] <- norm_pro[,i]/rel_pro2$max.dREG.counts
# i=i+1
#}
Selecting gene PCA contributors by variance
#Select top 10000 PCA contributors
rv <- rowVars(assay(rld_rna2))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))] #Changing the 500 here will select a different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna2)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
sweep(aload, 2, colSums(aload), "/")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 500 elements with their annotations
top10k_rel_rna <- rel_rna2[(rel_rna2$geneID %in% rownames(aload)),] #filter rel gene dataframe for top 10k
Determining clusters (k=30)
CLARA <- clara(top10k_rel_rna[3:27],30,medoids.x=TRUE)
top10k_rel_rna$k30 <- CLARA$clustering
pheatmap(top10k_rel_rna[order(top10k_rel_rna[,28]),][,3:27], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10k_rel_rna[28])

Data integration
gene_res <- rel_rna2[(!rel_rna2$geneID %in% top10k_rel_rna$geneID),] #filter out genes not in top 10k
gene_res$k30 <- NA #add value for genes not clustered
gene_res <- rbind(top10k_rel_rna, gene_res) #combine with top 10k genes with cluster designations
#gene_res$geneID <- sapply(strsplit(gene_res$geneID, split="[.]"), head, 1) #simplify geneIDs by removing version numbers
comb_res <- unique(merge(dREGann, gene_res, by.x="z", by.y="geneID", all.y=T))
colnames(comb_res)[1] <- "geneID"
comb_res <- merge(comb_res, PRO_rel, by.x="closest_peak", by.y="closest_peak", all.x=T) #combine with dREG peak data, keep genes without a closest dREG peak call
comb_res <- comb_res[,c(2:9,39,13:38,10:12,1,40:65)] #reorder columns
Gene-to-Enhancer concordance analysis – this generates a stat ranging from -1 (anti-correlated) to 0 (not correlated) to +1 (correlated) reported as Spearman rho
#note this will produce a lot of errors for rows that don't have a gene and a an enhancer, so I've silenced them here
spear <- c()
i <- 1
while(i<=nrow(comb_res)){
speari <- cor(as.numeric(factor(comb_res[i,11:35])),as.numeric(factor(comb_res[i,41:65])),method="spearman")
spear <- c(spear, speari)
i <- i+1
}
comb_res$Spearman.rho <- spear
Save output
write.table(comb_res, file="weinstock002.final.table.txt", sep="\t", quote=F, row.names=F)
dREGcountsann <- merge(PRO_rel, dREG_readFilt, by.x="closest_peak", by.y="dREGpeakID")
dREGcountsann <- dREGcountsann[,c(1,28:33,2:27)]
write.table(dREGcountsann, file="weinstock002.dREGs.txt", sep="\t", quote=F, row.names=F)
Correct output to include metadata for genes without nearest enhancer calls
library(dplyr)
#remove columns with annotation issue
comb_res_woEnh <- comb_res %>% filter(is.na(geneName))
comb_res_wEnh <- comb_res %>% filter(!is.na(geneName))
#Load original unfiltered annotations
comp_gtf <- read.table("Homo_sapiens.GRCh38.99.ucsc.gtf", sep="\t")
colnames(comp_gtf) <- c("chr", "source", "type", "start", "end", "score", "strand", "phase", "attributes")
#Function to extract metadata from gtf annotation
extract_attributes <- function(gtf_attributes, att_of_interest){
att <- strsplit(as.character(gtf_attributes), "; ")
att <- gsub("\"","",unlist(att))
att <- gsub(";","",unlist(att))
if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
}else{
return(NA)}
}
#Extract desired information from gtf (geneID, geneName, biotype)
comp_gene <- filter(comp_gtf, type == "gene")
comp_gene$geneID <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_id"))
comp_gene$geneName <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_name"))
comp_gene$biotype <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_biotype"))
#Remove offending columns
#geneName,biotype,attributes,chr,start,end,strand
comb_res_woEnh <- comb_res_woEnh[,c(1,9:66)]
#Add back information
geneInfo <- data.frame("geneID"=comp_gene$geneID, "geneName"=comp_gene$geneName, "biotype"=comp_gene$biotype, "attributes"=comp_gene$attributes, "chr"=comp_gene$chr, "start"=comp_gene$start, "end"=comp_gene$end, "strand"=comp_gene$strand)
comb_res_woEnh_corr <- merge(geneInfo, comb_res_woEnh, by.x="geneID", by.y="geneID", all.y=T)
#Modify geneName and peak_counts
comb_res_woEnh_corr$peak_count <- 0
comb_res_woEnh_corr$geneName <- paste0(comb_res_woEnh_corr$geneName)
#Correcting geneNames in original
comb_res_wEnh$geneName <- substring(comb_res_wEnh$geneName, 1)
comb_res_wEnh$geneName <- paste0(comb_res_wEnh$geneName)
#Combining it all together
comb_res_corr <- rbind(comb_res_wEnh, comb_res_woEnh_corr)
write.table(comb_res_corr, file="weinstock.combined.results.corrected.txt", sep="\t", quote=F, row.names=F)
Combining full annotation to plot based on genomic category
filtered_res <- merge(dREG_annot, comb_res_corr, by.x="peakID", by.y="closest_peak", all.y=F)
write.table(filtered_res, file="Overlaping contribtors.txt", sep='\t', quote=F, row.names=F)
ggplot(filtered_res, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Overlapping PCA Contributors for Gene Expression and dREGpeak Annotations")

Generating normed counts table by top 50K contributors
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(50000, length(rv)))]
#Changing the 50000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 50000 PRO Contributors.txt")
top50k_rel_pro <- PRO_rel[(PRO_rel$closest_peak %in% rownames(aload)),] #filter rel gene dataframe for top 50k
CLARA <- clara(top50k_rel_pro[3:27],3,medoids.x=TRUE)
top50k_rel_pro$k3 <- CLARA$clustering
enhan_res <- PRO_rel[(PRO_rel$closest_peak %in% top50k_rel_pro$closest_peak),] #filter out genes not in top 50k
enhan_res$k3 <- NA #add value for genes not clustered
enhan_res <- top50k_rel_pro #combine with top 50k genes with cluster designations
comb_res <- unique(merge(dREG_annot, enhan_res, by.x="peakID", by.y="closest_peak", all.y=T))
Combination completed, further investigation into clusters ongoing - 8DEC21
---
title: "Combined PRO_RNA web access"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Document creation
Libraries
```{r}


library(DESeq2)
library(Rsubread)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(cluster)
library(factoextra)
library(amap)
library(NbClust)
library(dplyr)
```

Set working directory to local workspace
Save R markup, featurecounts results and full gtf there
RNA featurecounts generated with 'Homo_sapiens.GRCh38.99.gtf.gz' file
```{r}
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/25 TCLs/MKJ R scripts")
getwd()
```

RNA

Load featurecounts results, configure as data frame object
Setup metadata information and check naming consistency
Remove non-counts columns in fc object
```{r}
fc <- read.table("WholeGeneCounts_Weinstock002_RNA", sep="\t", header=TRUE, row.names=1)
fc2 <- as.data.frame(fc)
orig_names <- colnames(fc2[6:30])
colData_rna <- data.frame(row.names=orig_names, sample=c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102",  "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2"), subtype=c("HS_TCL", "ALK-ALCL", "ALK-ALCL",  "CTCL",  "CTCL",  "ATLL", "ALK+ALCL",  "SQGD_TCL",  "NKL", "ALK+ALCL", "ALK+ALCL",  "ALK-ALCL", "CTCL",  "T_LGL", "NKT", "CTCL", "NKT",  "PTCL_NOS", "CTCL", "PTCL_NOS", "ALK+ALCL",  "ATLL", "ATLL", "ALK+ALCL", "ALK+ALCL"))
all(rownames(colData_rna) %in% colnames(fc2))
fc3 <- subset(fc2, select=-c(Chr, Start, End, Strand, Length))
colnames(fc3) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102",  "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")
```

Create DEseq object and save file 
Generate normalization factor with DEseq default
```{r}
dds_rna <- DESeqDataSetFromMatrix(countData=fc3, colData=colData_rna, design=~sample)
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
row.names(deseq_sizes_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299", "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")
deseq_sizes_rna
saveRDS(dds_rna, file="RNA_dds.rds")
```

Generate norm counts files and save to local directory
```{r}
rawcounts_rna <- as.data.frame(counts(dds_rna, normalized=FALSE))
normcounts_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
colnames(rawcounts_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102",  "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")
colnames(normcounts_rna) <- c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102",  "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")
rownames(rawcounts_rna) <- sapply(strsplit(rownames(rawcounts_rna), split="[.]"), head, 1)
rownames(normcounts_rna) <- sapply(strsplit(rownames(normcounts_rna), split="[.]"), head, 1)
write.table(rawcounts_rna, file="RNA_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_rna, file="RNA_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
```

Data transformation for plot generation
Top 500 contributors is the default for PCA 
Set 'ntop' to number of observations in the dds object
```{r}
rld_rna <- rlog(dds_rna, blind=TRUE)
sampleDistsrna <- dist(t(assay(rld_rna)))
sampleDistMtxrna <- as.matrix(sampleDistsrna)
rownames(sampleDistMtxrna) <- paste0(rld_rna$sample)
colnames(sampleDistMtxrna) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_rna, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq PCA") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 500 PCA
plotPCA(rld_rna, intgroup = "subtype") + ggtitle("RNA-seq PCA Top 500") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 1000
plotPCA(rld_rna, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 5000
plotPCA(rld_rna, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 10000
plotPCA(rld_rna, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
#top 15000
plotPCA(rld_rna, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxrna), max.overlaps=50)
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxrna, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="RNA Sample Distance Clustering")
plot(hclust(sampleDistsrna), labels=rownames(sampleDistMtxrna), main="RNA Sample Distance Dendrogram")
```

Assemble counts results
```{r}
avg_rna <- data.frame(EnsemblID=rownames(normcounts_rna))
avg_rna$DERL2 <- (normcounts_rna$DERL2)
avg_rna$DL40 <- (normcounts_rna$DL40)
avg_rna$FEPD <- (normcounts_rna$FEPD)
avg_rna$HH <- (normcounts_rna$HH)
avg_rna$HUT78 <- (normcounts_rna$HUT78)
avg_rna$HUT102 <- (normcounts_rna$HUT102)
avg_rna$KARPAS299 <- (normcounts_rna$KARPAS299)
avg_rna$KARPAS384 <- (normcounts_rna$KARPAS384)
avg_rna$KHYG1 <- (normcounts_rna$KHYG1)
avg_rna$KIJK <- (normcounts_rna$KIJK)
avg_rna$L82 <- (normcounts_rna$L82)
avg_rna$MAC2A <- (normcounts_rna$MAC2A)
avg_rna$MJ <- (normcounts_rna$MJ)
avg_rna$MOTN1 <- (normcounts_rna$MOTN1)
avg_rna$MTA <- (normcounts_rna$MTA)
avg_rna$MYLA <- (normcounts_rna$MYLA)
avg_rna$NKL <- (normcounts_rna$NKL)
avg_rna$OCILY12 <- (normcounts_rna$OCILY12)
avg_rna$SeAx <- (normcounts_rna$SeAx)
avg_rna$SMZ1 <- (normcounts_rna$SMZ1)
avg_rna$SR786 <- (normcounts_rna$SR786)
avg_rna$ST1 <- (normcounts_rna$ST1)
avg_rna$Su9T01 <- (normcounts_rna$Su9T01)
avg_rna$SUDHL1 <- (normcounts_rna$SUDHL1)
avg_rna$SUPM2 <- (normcounts_rna$SUPM2)

avg_rna$max <- apply(avg_rna[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")], 1, max, na.rm=TRUE)
rel_rna <- data.frame(EnsemblID=avg_rna$EnsemblID)
rel_rna$DERL2 <- avg_rna$DERL2 / avg_rna$max
rel_rna$DL40 <- avg_rna$DL40 / avg_rna$max
rel_rna$FEPD <- avg_rna$FEPD / avg_rna$max
rel_rna$HH <- avg_rna$HH / avg_rna$max
rel_rna$HUT78 <- avg_rna$HUT78 / avg_rna$max
rel_rna$HUT102 <- avg_rna$HUT102 / avg_rna$max
rel_rna$KARPAS299 <- avg_rna$KARPAS299 / avg_rna$max
rel_rna$KARPAS384 <- avg_rna$KARPAS384 / avg_rna$max
rel_rna$KHYG1 <- avg_rna$KHYG1 / avg_rna$max
rel_rna$KIJK <- avg_rna$KIJK / avg_rna$max
rel_rna$L82 <- avg_rna$L82 / avg_rna$max
rel_rna$MAC2A <- avg_rna$MAC2A / avg_rna$max
rel_rna$MJ <- avg_rna$MJ / avg_rna$max
rel_rna$MOTN1 <- avg_rna$MOTN1 / avg_rna$max
rel_rna$MTA <- avg_rna$MTA / avg_rna$max
rel_rna$MYLA <- avg_rna$MYLA / avg_rna$max
rel_rna$NKL <- avg_rna$NKL / avg_rna$max
rel_rna$OCILY12 <- avg_rna$OCILY12 / avg_rna$max
rel_rna$SeAx <- avg_rna$SeAx / avg_rna$max
rel_rna$SMZ1 <- avg_rna$SMZ1 / avg_rna$max
rel_rna$SR786 <- avg_rna$SR786 / avg_rna$max
rel_rna$ST1 <- avg_rna$ST1 / avg_rna$max
rel_rna$Su9T01 <- avg_rna$Su9T01 / avg_rna$max
rel_rna$SUDHL1 <- avg_rna$SUDHL1 / avg_rna$max
rel_rna$SUPM2 <- avg_rna$SUPM2 / avg_rna$max
```

Data transform for variance assessment and sub-clustering
```{r}
rv1 <- rowVars(assay(rld_rna))
select1 <- order(rv1, decreasing=TRUE)
pc1 <- prcomp(t(assay(rld_rna)[select1,]))
loadings1 <- as.data.frame(pc1$rotation)
aload1 <- abs(loadings1)
pcRank <- data.frame("EnsemblID"=rownames(loadings1), "PCA.rank"=seq.int(nrow(loadings1)))
sweep(aload1, 2, colSums(aload1), "/")

peakVar <- data.frame("EnsemblID"=rownames(assay(rld_rna)))
peakVar$rlog.variance <- rowVars(as.matrix(assay(rld_rna)))
peakVar <- merge(pcRank, peakVar, by.x="EnsemblID", by.y="EnsemblID")
ggplot(peakVar, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("Variance per PC Contributor")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar <- peakVar[order(peakVar$PCA.rank),]
peakVar$cumulative.Variance <- cumsum(peakVar$rlog.variance)/sum(peakVar$rlog.variance)
ggplot(peakVar, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000,10000,15000), linetype="dotted") + ggtitle("Cumulative Variance per PC Rank")
```

Removing Alk+ALCL to determine if PCA separation improves
```{r}
dds_sub <- dds_rna[,-c(7,10,11,21,24,25)] #Excluding sample numbers of the ALK+ALCL 
estimateSizeFactors(dds_sub) #Must re-calculate size factors after removing samples
dds_sub

rld_sub <- rlog(dds_sub, blind=TRUE)
colnames(rld_sub) <- paste0(dds_sub$sample)
sampleDists_sub <- dist(t(assay(rld_sub)))
sampleDistMtx_sub <- as.matrix(sampleDists_sub)
rownames(sampleDistMtx_sub) <- colnames(rld_sub)
colnames(sampleDistMtx_sub) <- NULL
hclust_sub <- hclust(sampleDists_sub)

plot(hclust_sub, labels=rownames(sampleDistMtx_sub), main="RNA-seq subset Gene Dendrogram")

```

PCAs
```{r}
plotPCA(rld_sub, intgroup = "subtype") + ggtitle("RNA-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=1000) + ggtitle("RNA-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=5000) + ggtitle("RNA-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=10000) + ggtitle("RNA-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
```

PCA top contributors selection (5K to 20K)
```{r message=FALSE, warning=FALSE}
rv <- rowVars(assay(rld_rna))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))] 
#Changing the 10000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 10000 RNA PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top10000 <- rel_rna[(rel_rna$EnsemblID %in% rownames(aload)),]
write.table(top10000, file="Top10000.txt", sep='\t')

RNA_rel <- rel_rna[(rel_rna$EnsemblID %in% top10000$EnsemblID),]
saveRDS(RNA_rel, file="RNA_rel.rds")
colnames(RNA_rel) <- c("EnsemblID", "DERL2.gene", "DL40.gene", "FEPD.gene", "HH.gene", "HUT78.gene", "HUT102.gene",  "KARPAS299.gene",  "KARPAS384.gene", "KHYG1.gene", "KIJK.gene", "L82.gene",  "MAC2A.gene",  "MJ.gene", "MOTN1.gene", "MTA.gene", "MYLA.gene", "NKL.gene", "OCILY12.gene", "SeAx.gene", "SMZ1.gene", "SR786.gene", "ST1.gene",  "Su9T01.gene", "SUDHL1.gene", "SUPM2.gene")
head(top10000)
head(RNA_rel)
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk20 <- clara(top10000[2:26],20,medoids.x=TRUE)
CLARAk26 <- clara(top10000[2:26],26,medoids.x=TRUE)
CLARAk30 <- clara(top10000[2:26],30,medoids.x=TRUE)
CLARAk36 <- clara(top10000[2:26],36,medoids.x=TRUE)
CLARAk15 <- clara(top10000[2:26],15,medoids.x=TRUE)
CLARAk25 <- clara(top10000[2:26],25,medoids.x=TRUE)
CLARAk22 <- clara(top10000[2:26],22,medoids.x=TRUE)

top10000$k20 <- CLARAk20$clustering
top10000$k26 <- CLARAk26$clustering
top10000$k30 <- CLARAk30$clustering
top10000$k36 <- CLARAk36$clustering
top10000$k15 <- CLARAk15$clustering
top10000$k15 <- CLARAk15$clustering
top10000$k25 <- CLARAk25$clustering
top10000$k22 <- CLARAk22$clustering


#Displaying Heatmaps
#k-means 20
pheatmap(top10000[order(top10000[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+1])
#k-means 26
pheatmap(top10000[order(top10000[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+2])
#k-means 30
pheatmap(top10000[order(top10000[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+3])
#k-means 36
pheatmap(top10000[order(top10000[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+4])
#k-means 15
pheatmap(top10000[order(top10000[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+5])
#k-means 25
pheatmap(top10000[order(top10000[, 26+6]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+6])
#k-means 22 
pheatmap(top10000[order(top10000[, 26+7]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10000[26+7])
```

--RNA END--

PRO

```{r}
#Prepare dREG peak saf for featureCounts
dREG <- read.table("TCLs_s0.5p0.025_filtered.bed")
colnames(dREG) <- c("chromosome", "start", "end", "dREGpeakID", "dREGscore", "strand", "centerStart",
                    "centerEnd")
write.table(dREG, file="dREG readtable.txt", sep="\t", quote=F, row.names=F)
dREGsaf <- data.frame("GeneID"=dREG$dREGpeakID, "Chr"=dREG$chromosome, "Start"=dREG$start, "End"=dREG$end, "Strand"="+")
write.table(dREGsaf, file="dREG.saf", sep="\t", quote=F, row.names=F)
#Load sample SAM file list
#profiles <- c("")

#Run feautreCounts
#fc_dREG <- featureCounts(profiles, annot.ext=dREGsaf, isGTFAnnotationFile=FALSE, minMQS=10, countChimericFragments=FALSE, isPairedEnd=FALSE, strandSpecific=0, nthreads=8)
fc_pro <- read.table("dREGCounts_Weinstock002_pro", header=T, row.names=1)
fc_pro <- as.data.frame(fc_pro)
fc_pro <- subset(fc_pro, select=-c(Chr, Start, End, Strand, Length))
colnames(fc_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")

#Load sample metadata (can add as much sample metadata as desired including cell type, drug treatment, etc)
colData_pro <- data.frame(row.names=colnames(fc_pro$counts), sample=c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1"), subtype=c("ATLL", "ATLL", "ATLL", "ALK+ALCL", "ALK+ALCL", "ALK+ALCL", "PTCL_NOS", "CTCL", "ALK-ALCL", "ALK+ALCL", "ALK+ALCL", "ALK-ALCL", "SQGD_TCL", "ALK-ALCL", "PTCL_NOS", "CTCL", "CTCL", "CTCL", "NKT", "T_LGL", "NKT", "HS_TCL", "NKL", "CTCL", "ALK+ALCL"))
all((colData_pro$sample) %in% colnames(fc_pro))

#Create DESeq object and generate normalized counts
dds_pro <- DESeqDataSetFromMatrix(countData=fc_pro, colData=colData_pro, design=~sample)
dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
variable.names(dds_pro)

keep <- rowMax(counts(dds_pro, normalize=TRUE)) >= 25
dds_pro <- dds_pro[keep,]
saveRDS(dds_pro, file="dREG_dds.rds")
dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds_pro),]
write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
            quote=FALSE)
```

Generate norm counts files and save to local directory
```{r}
rawcounts_pro <- as.data.frame(counts(dds_pro, normalized=FALSE))
normcounts_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))
colnames(rawcounts_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")
colnames(normcounts_pro) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "SMZ1", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx", "SUDHL1")
write.table(rawcounts_pro, file="pro_rawcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
```

Data transformation for plot generation
Top 500 contributors is the default for PCA 
Set 'ntop' to number of observations in the dds object
```{r}
rld_pro <- rlog(dds_pro, blind=TRUE)
sampleDistspro <- dist(t(assay(rld_pro)))
sampleDistMtxpro <- as.matrix(sampleDistspro)
rownames(sampleDistMtxpro) <- paste0(rld_pro$sample)
colnames(sampleDistMtxpro) <- NULL
#Generate PCA plot (intgroup set equal to desired metadata variable)
plotPCA(rld_pro, intgroup = "subtype", ntop=120888) + ggtitle("pro-seq PCA") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 500 PCA
plotPCA(rld_pro, intgroup = "subtype") + ggtitle("pro-seq PCA Top 500") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 1000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 5000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 10000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 15000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=15000) + ggtitle("pro-seq PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
#top 20000 PCA
plotPCA(rld_pro, intgroup = "subtype", ntop=20000) + ggtitle("pro-seq PCA Top 20000") + scale_color_manual(breaks = c("ATLL", "ALK+ALCL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtxpro), max.overlaps=50)
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtxpro, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="pro Sample Distance Clustering")
plot(hclust(sampleDistspro), labels=rownames(sampleDistMtxpro), main="pro Sample Distance Dendrogram")
```

Assemble counts results
```{r}
avg_pro <- data.frame(peakID=rownames(normcounts_pro))
avg_pro$DERL2 <- (normcounts_pro$DERL2)
avg_pro$DL40 <- (normcounts_pro$DL40)
avg_pro$FEPD <- (normcounts_pro$FEPD)
avg_pro$HH <- (normcounts_pro$HH)
avg_pro$HUT78 <- (normcounts_pro$HUT78)
avg_pro$HUT102 <- (normcounts_pro$HUT102)
avg_pro$KARPAS299 <- (normcounts_pro$KARPAS299)
avg_pro$KARPAS384 <- (normcounts_pro$KARPAS384)
avg_pro$KHYG1 <- (normcounts_pro$KHYG1)
avg_pro$KIJK <- (normcounts_pro$KIJK)
avg_pro$L82 <- (normcounts_pro$L82)
avg_pro$MAC2A <- (normcounts_pro$MAC2A)
avg_pro$MJ <- (normcounts_pro$MJ)
avg_pro$MOTN1 <- (normcounts_pro$MOTN1)
avg_pro$MTA <- (normcounts_pro$MTA)
avg_pro$MYLA <- (normcounts_pro$MYLA)
avg_pro$NKL <- (normcounts_pro$NKL)
avg_pro$OCILY12 <- (normcounts_pro$OCILY12)
avg_pro$SeAx <- (normcounts_pro$SeAx)
avg_pro$SMZ1 <- (normcounts_pro$SMZ1)
avg_pro$SR786 <- (normcounts_pro$SR786)
avg_pro$ST1 <- (normcounts_pro$ST1)
avg_pro$Su9T01 <- (normcounts_pro$Su9T01)
avg_pro$SUDHL1 <- (normcounts_pro$SUDHL1)
avg_pro$SUPM2 <- (normcounts_pro$SUPM2)

avg_pro$max <- apply(avg_pro[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102",  "KARPAS299",  "KARPAS384", "KHYG1", "KIJK", "L82",  "MAC2A",  "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "SR786", "ST1",  "Su9T01", "SUDHL1", "SUPM2")], 1, max, na.rm=TRUE)
rel_pro <- data.frame(peakID=avg_pro$peakID)
rel_pro$DERL2 <- avg_pro$DERL2 / avg_pro$max
rel_pro$DL40 <- avg_pro$DL40 / avg_pro$max
rel_pro$FEPD <- avg_pro$FEPD / avg_pro$max
rel_pro$HH <- avg_pro$HH / avg_pro$max
rel_pro$HUT78 <- avg_pro$HUT78 / avg_pro$max
rel_pro$HUT102 <- avg_pro$HUT102 / avg_pro$max
rel_pro$KARPAS299 <- avg_pro$KARPAS299 / avg_pro$max
rel_pro$KARPAS384 <- avg_pro$KARPAS384 / avg_pro$max
rel_pro$KHYG1 <- avg_pro$KHYG1 / avg_pro$max
rel_pro$KIJK <- avg_pro$KIJK / avg_pro$max
rel_pro$L82 <- avg_pro$L82 / avg_pro$max
rel_pro$MAC2A <- avg_pro$MAC2A / avg_pro$max
rel_pro$MJ <- avg_pro$MJ / avg_pro$max
rel_pro$MOTN1 <- avg_pro$MOTN1 / avg_pro$max
rel_pro$MTA <- avg_pro$MTA / avg_pro$max
rel_pro$MYLA <- avg_pro$MYLA / avg_pro$max
rel_pro$NKL <- avg_pro$NKL / avg_pro$max
rel_pro$OCILY12 <- avg_pro$OCILY12 / avg_pro$max
rel_pro$SeAx <- avg_pro$SeAx / avg_pro$max
rel_pro$SMZ1 <- avg_pro$SMZ1 / avg_pro$max
rel_pro$SR786 <- avg_pro$SR786 / avg_pro$max
rel_pro$ST1 <- avg_pro$ST1 / avg_pro$max
rel_pro$Su9T01 <- avg_pro$Su9T01 / avg_pro$max
rel_pro$SUDHL1 <- avg_pro$SUDHL1 / avg_pro$max
rel_pro$SUPM2 <- avg_pro$SUPM2 / avg_pro$max
```

PCA top contributors selection (5000 tried first)
```{r message=FALSE, warning=FALSE}
dREG_annot <- read.table("002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)

PRO_rel <- rel_pro
#check this
colnames(PRO_rel) <- c("dREGpeakID", "DERL2.enhancer", "DL40.enhancer", "FEPD.enhancer", "HH.enhancer", "HUT78.ehancer", "HUT102.enhancer",  "KARPAS299.enhancer",  "KARPAS384.enhancer", "KHYG1.enhancer", "KIJK.enhancer", "L82.enhancer",  "MAC2A.ehancer",  "MJ.enhancer", "MOTN1.enhancer", "MTA.enhancer", "MYLA.enhancer", "NKL.enhancer", "OCILY12.enhancer", "SeAx.ehancer", "SMZ1.enhancer", "SR786.enhancer", "ST1.enhancer",  "Su9T01.enhancer", "SUDHL1.enhancer", "SUPM2.enhancer")
saveRDS(PRO_rel, file="pro_rel.rds") 
head(PRO_rel)

rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))] 
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top5000, file="Top5000.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")
ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
pro_5000_rel <- rel_pro[(rel_pro$peakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_5000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk4 <- clara(pro_5000_rel[2:26],4,medoids.x=TRUE)
CLARAk5 <- clara(pro_5000_rel[2:26],5,medoids.x=TRUE)
CLARAk6 <- clara(pro_5000_rel[2:26],6,medoids.x=TRUE)
CLARAk7 <- clara(pro_5000_rel[2:26],7,medoids.x=TRUE)
CLARAk8 <- clara(pro_5000_rel[2:26],8,medoids.x=TRUE)

pro_5000_rel$k4 <- CLARAk4$clustering
pro_5000_rel$k5 <- CLARAk5$clustering
pro_5000_rel$k6 <- CLARAk6$clustering
pro_5000_rel$k7 <- CLARAk7$clustering
pro_5000_rel$k8 <- CLARAk8$clustering

#Displaying Heatmaps
#k-means 4
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+1])
#k-means 5
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+2])
#k-means 6
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+3])
#k-means 7
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+4])
#k-means 8
pheatmap(pro_5000_rel[order(pro_5000_rel[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=pro_5000_rel[26+5])
```

PCA top contributors (10K)
```{r}
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))] 
#Changing the 10000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 10000 pro PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 10000 elements with their annotations
top10000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
write.table(top10000, file="Top10000.txt", sep='\t')
ggplot(top10000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 10000 PCA Contributors dREGpeak Annotations")
ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_10000_rel <- rel_pro[(rel_pro$peakID %in% top10000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_10000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_10000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_10000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk5 <- clara(PRO_10000_rel[2:26],5,medoids.x=TRUE)
CLARAk8 <- clara(PRO_10000_rel[2:26],8,medoids.x=TRUE)
CLARAk10 <- clara(PRO_10000_rel[2:26],10,medoids.x=TRUE)
CLARAk13 <- clara(PRO_10000_rel[2:26],13,medoids.x=TRUE)
CLARAk15 <- clara(PRO_10000_rel[2:26],15,medoids.x=TRUE)
CLARAk6 <- clara(PRO_10000_rel[2:26],6,medoids.x=TRUE)
CLARAk7 <- clara(PRO_10000_rel[2:26],7,medoids.x=TRUE)
CLARAk4 <- clara(PRO_10000_rel[2:26],4,medoids.x=TRUE)


PRO_10000_rel$k5 <- CLARAk5$clustering
PRO_10000_rel$k8 <- CLARAk8$clustering
PRO_10000_rel$k10 <- CLARAk10$clustering
PRO_10000_rel$k13 <- CLARAk13$clustering
PRO_10000_rel$k15 <- CLARAk15$clustering
PRO_10000_rel$k6 <- CLARAk6$clustering
PRO_10000_rel$k7 <- CLARAk7$clustering
PRO_10000_rel$k4 <- CLARAk4$clustering



#Displaying Heatmaps
#k-means 5
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+1]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+1])
#k-means 8
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+2]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+2])
#k-means 10
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+3]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+3])
#k-means 13
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+4]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+4])
#k-means 15
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+5]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+5])
#k-means 6
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+6]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+6])
#k-means 7
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+7]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+7])
#k-means 4
pheatmap(PRO_10000_rel[order(PRO_10000_rel[, 26+8]),][,2:26], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_10000_rel[26+8])
```


Separating by disease subset and re-clustering
```{r}
dds_pro_sub <- dds_pro[,-c(7,13,15,19,20,21,22,23)] #Excluding sample numbers of entities that are not Alk+Alcl, Alk-Alcl, Atll, and Ctcl 
estimateSizeFactors(dds_pro_sub) #Must re-calculate size factors after removing samples
deseq_sizes_pro_sub <- as.data.frame(sizeFactors(dds_pro_sub))
deseq_sizes_pro_sub 
  
rld_pro_sub <- rlog(dds_pro_sub, blind=TRUE)
colnames(rld_pro_sub) <- paste0(dds_pro_sub$sample)
sampleDists_pro_sub <- dist(t(assay(rld_pro_sub)))
sampleDistMtx_pro_sub <- as.matrix(sampleDists_pro_sub)
rownames(sampleDistMtx_pro_sub) <- colnames(rld_pro_sub)
colnames(sampleDistMtx_pro_sub) <- NULL
hclust_pro_sub <- hclust(sampleDists_pro_sub)

plot(hclust_pro_sub, labels=rownames(sampleDistMtx_pro_sub), main="Largest Entity dREG Dendrogram")
```

pro subset PCAs
```{r}
#plotPCA(rld_sub, intgroup = "subtype") + ggtitle("RNA-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_pro_sub, intgroup = "subtype", ntop=1000) + ggtitle("pro-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

plotPCA(rld_pro_sub, intgroup = "subtype", ntop=5000) + ggtitle("pro-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

plotPCA(rld_pro_sub, intgroup = "subtype", ntop=10000) + ggtitle("pro-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL","ALK+ALCL", "CTCL", "ALK-ALCL"), values=c("darkgray", "chocolate4", "chartreuse3", "cadetblue")) + geom_text_repel(label=rownames(sampleDistMtx_pro_sub), max.overlaps=50)

#plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("RNA-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

#plotPCA(rld_sub, intgroup = "subtype", ntop=60676) + ggtitle("RNA-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
```

Generate norm counts file
```{r}
normcounts_pro_sub <- as.data.frame(counts(dds_pro_sub, normalized=TRUE))
colnames(normcounts_pro_sub) <- c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "DL40", "HUT78", "HH", "MJ", "SeAx", "SUDHL1")
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtx_pro_sub, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="pro subset Sample Distance Clustering")
plot(hclust(sampleDists_pro_sub), labels=rownames(sampleDistMtx_pro_sub), main="pro subset Sample Distance Dendrogram")
```

Assemble subset counts and average
```{r}
avg_pro_sub <- data.frame(peakID=rownames(normcounts_pro_sub))
avg_pro_sub$HUT102 <- (normcounts_pro_sub$HUT102)
avg_pro_sub$ST1 <- (normcounts_pro_sub$ST1)
avg_pro_sub$Su9T01 <- (normcounts_pro_sub$Su9T01)
avg_pro_sub$KARPAS299 <- (normcounts_pro_sub$KARPAS299)
avg_pro_sub$L82 <- (normcounts_pro_sub$L82)
avg_pro_sub$SUPM2 <- (normcounts_pro_sub$SUPM2)
avg_pro_sub$MYLA <- (normcounts_pro_sub$MYLA)
avg_pro_sub$MAC2A <- (normcounts_pro_sub$MAC2A)
avg_pro_sub$KIJK <- (normcounts_pro_sub$KIJK)
avg_pro_sub$SR786 <- (normcounts_pro_sub$SR786)
avg_pro_sub$FEPD <- (normcounts_pro_sub$FEPD)
avg_pro_sub$DL40 <- (normcounts_pro_sub$DL40)
avg_pro_sub$HUT78 <- (normcounts_pro_sub$HUT78)
avg_pro_sub$HH <- (normcounts_pro_sub$HH)
avg_pro_sub$MJ <- (normcounts_pro_sub$MJ)
avg_pro_sub$SeAx <- (normcounts_pro_sub$SeAx)
avg_pro_sub$SUDHL1 <- (normcounts_pro_sub$SUDHL1)

avg_pro_sub$max <- apply(avg_pro_sub[c("HUT102", "ST1", "Su9T01", "KARPAS299", "L82", "SUPM2", "MYLA", "MAC2A", "KIJK", "SR786", "FEPD", "DL40", "HUT78", "HH", "MJ", "SeAx", "SUDHL1")], 1, max, na.rm=TRUE)
rel_pro_sub <- data.frame(peakID=avg_pro_sub$peakID)
rel_pro_sub$HUT102 <- avg_pro_sub$HUT102 / avg_pro_sub$max
rel_pro_sub$ST1 <- avg_pro_sub$ST1 / avg_pro_sub$max
rel_pro_sub$Su9T01 <- avg_pro_sub$Su9T01 / avg_pro_sub$max
rel_pro_sub$KARPAS299 <- avg_pro_sub$KARPAS299 / avg_pro_sub$max
rel_pro_sub$L82 <- avg_pro_sub$L82 / avg_pro_sub$max
rel_pro_sub$SUPM2 <- avg_pro_sub$SUPM2 / avg_pro_sub$max
rel_pro_sub$MYLA <- avg_pro_sub$MYLA / avg_pro_sub$max
rel_pro_sub$MAC2A <- avg_pro_sub$MAC2A / avg_pro_sub$max
rel_pro_sub$KIJK <- avg_pro_sub$KIJK / avg_pro_sub$max
rel_pro_sub$SR786 <- avg_pro_sub$SR786 / avg_pro_sub$max
rel_pro_sub$FEPD <- avg_pro_sub$FEPD / avg_pro_sub$max
rel_pro_sub$DL40 <- avg_pro_sub$DL40 / avg_pro_sub$max
rel_pro_sub$HUT78 <- avg_pro_sub$HUT78 / avg_pro_sub$max
rel_pro_sub$HH <- avg_pro_sub$HH / avg_pro_sub$max
rel_pro_sub$MJ <- avg_pro_sub$MJ / avg_pro_sub$max
rel_pro_sub$SeAx <- avg_pro_sub$SeAx / avg_pro_sub$max
rel_pro_sub$SUDHL1 <- avg_pro_sub$SUDHL1 / avg_pro_sub$max
```

PCA top contributors selection (5000)
```{r message=FALSE, warning=FALSE}

#check this
colnames(rel_pro_sub) <- c("dREGpeakID", "HUT102.enhancer", "ST1.enhancer", "Su9T01.enhancer", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "MYLA.enhancer", "MAC2A.ehancer", "KIJK.enhancer", "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer", "HUT78.ehancer", "HH.enhancer", "MJ.enhancer",  "SeAx.ehancer", "SUDHL1.enhancer")

rv2 <- rowVars(assay(rld_pro_sub))
select2 <- order(rv2, decreasing=TRUE)[seq_len(min(5000, length(rv2)))] 
#Changing the 5000 here will select a Different number of top contributors if desired
pc2 <- prcomp(t(assay(rld_pro_sub)[select2,]))
loadings2 <- as.data.frame(pc2$rotation)
aload2 <- abs(loadings2)
write.table(sweep(aload, 2, colSums(aload2), "/"), file="Top 5000 pro sub PCA Contributors.txt")

##Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload2)),]
write.table(top5000, file="Top5000_pro_sub.txt", sep='\t')
ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA subset Contributors dREGpeak Annotations")
ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_5000_rel <- rel_pro_sub[(rel_pro_sub$dREGpeakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk15 <- clara(PRO_5000_rel[2:18],15,medoids.x=TRUE)
CLARAk20 <- clara(PRO_5000_rel[2:18],20,medoids.x=TRUE)
CLARAk22 <- clara(PRO_5000_rel[2:18],22,medoids.x=TRUE)
CLARAk25 <- clara(PRO_5000_rel[2:18],25,medoids.x=TRUE)
CLARAk26 <- clara(PRO_5000_rel[2:18],26,medoids.x=TRUE)
CLARAk30 <- clara(PRO_5000_rel[2:18],30,medoids.x=TRUE)
CLARAk36 <- clara(PRO_5000_rel[2:18],36,medoids.x=TRUE)
CLARAk4 <- clara(PRO_5000_rel[2:18],4,medoids.x=TRUE)
CLARAk6 <- clara(PRO_5000_rel[2:18],6,medoids.x=TRUE)
CLARAk10 <- clara(PRO_5000_rel[2:18],10,medoids.x=TRUE)
CLARAk3 <- clara(PRO_5000_rel[2:18],3,medoids.x=TRUE)
CLARAk5 <- clara(PRO_5000_rel[2:18],5,medoids.x=TRUE)


PRO_5000_rel$k15 <- CLARAk15$clustering
PRO_5000_rel$k20 <- CLARAk20$clustering
PRO_5000_rel$k22 <- CLARAk22$clustering
PRO_5000_rel$k25 <- CLARAk25$clustering
PRO_5000_rel$k26 <- CLARAk26$clustering
PRO_5000_rel$k30 <- CLARAk30$clustering
PRO_5000_rel$k36 <- CLARAk36$clustering
PRO_5000_rel$k4 <- CLARAk4$clustering
PRO_5000_rel$k6 <- CLARAk6$clustering
PRO_5000_rel$k10 <- CLARAk10$clustering
PRO_5000_rel$k3 <- CLARAk3$clustering
PRO_5000_rel$k5 <- CLARAk5$clustering


#Displaying Heatmaps
#k-means 15
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+1]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+1])
#k-means 20
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+2]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+2])
#k-means 22 
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+3]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+3])
#k-means 25
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+4]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+4])
#k-means 26
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+5]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+5])
#k-means 30
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+6]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+6])
#k-means 36
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+7]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+7])
#k-means 4
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+8]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+8])
#k-means 6
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+9]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+9])
#k-means 10
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+10]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+10])
#k-means 3
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+11]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+11])
#k-means 5
pheatmap(PRO_5000_rel[order(PRO_5000_rel[, 18+12]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_5000_rel[18+12])
```

PCA top contributors (1000)
```{r message=FALSE, warning=FALSE}

#check this
colnames(rel_pro_sub) <- c("dREGpeakID", "HUT102.enhancer", "ST1.enhancer", "Su9T01.enhancer", "KARPAS299.enhancer", "L82.enhancer", "SUPM2.enhancer", "MYLA.enhancer", "MAC2A.ehancer", "KIJK.enhancer", "SR786.enhancer", "FEPD.enhancer", "DL40.enhancer", "HUT78.ehancer", "HH.enhancer", "MJ.enhancer",  "SeAx.ehancer", "SUDHL1.enhancer")

rv3 <- rowVars(assay(rld_pro_sub))
select3 <- order(rv3, decreasing=TRUE)[seq_len(min(1000, length(rv3)))] 
#Changing the 1000 here will select a Different number of top contributors if desired
pc3 <- prcomp(t(assay(rld_pro_sub)[select3,]))
loadings3 <- as.data.frame(pc3$rotation)
aload3 <- abs(loadings3)
write.table(sweep(aload3, 2, colSums(aload3), "/"), file="Top 1000 pro sub PCA Contributors.txt")

##Merge top 1000 elements with their annotations
top1000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload3)),]
write.table(top1000, file="Top1000_pro_sub.txt", sep='\t')
ggplot(top1000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA subset Contributors dREGpeak Annotations")
ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
PRO_1000_rel <- rel_pro_sub[(rel_pro_sub$dREGpeakID %in% top1000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(RNA_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(top10000[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(top10000[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk3 <- clara(PRO_1000_rel[2:18],3,medoids.x=TRUE)
CLARAk4 <- clara(PRO_1000_rel[2:18],4,medoids.x=TRUE)
CLARAk5 <- clara(PRO_1000_rel[2:18],5,medoids.x=TRUE)
CLARAk6 <- clara(PRO_1000_rel[2:18],6,medoids.x=TRUE)
CLARAk7 <- clara(PRO_1000_rel[2:18],7,medoids.x=TRUE)
CLARAk8 <- clara(PRO_1000_rel[2:18],8,medoids.x=TRUE)


PRO_1000_rel$k3 <- CLARAk3$clustering
PRO_1000_rel$k4 <- CLARAk4$clustering
PRO_1000_rel$k5 <- CLARAk5$clustering
PRO_1000_rel$k6 <- CLARAk6$clustering
PRO_1000_rel$k7 <- CLARAk7$clustering
PRO_1000_rel$k8 <- CLARAk8$clustering


#Displaying Heatmaps
#k-means 3
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+1]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+1])
#k-means 4
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+2]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+2])
#k-means 5 
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+3]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+3])
#k-means 6
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+4]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+4])
#k-means 7
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+5]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+5])
#k-means 8
pheatmap(PRO_1000_rel[order(PRO_1000_rel[, 18+6]),][,2:18], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=PRO_1000_rel[18+6])
```

Removing Alk+ALCL to determine if PCA separation improves
```{r}
dds_sub <- dds_pro[,-c(4,5,6,10,11,25)] #Excluding sample numbers of the ALK+ALCL 
estimateSizeFactors(dds_sub) #Must re-calculate size factors after removing samples
dds_sub

rld_sub <- rlog(dds_sub, blind=TRUE)
colnames(rld_sub) <- paste0(dds_sub$sample)
sampleDists_sub <- dist(t(assay(rld_sub)))
sampleDistMtx_sub <- as.matrix(sampleDists_sub)
rownames(sampleDistMtx_sub) <- colnames(rld_sub)
colnames(sampleDistMtx_sub) <- NULL
hclust_sub <- hclust(sampleDists_sub)

plot(hclust_sub, labels=rownames(sampleDistMtx_sub), main="PRO-seq subset Gene Dendrogram")

```

PCAs
```{r}
plotPCA(rld_sub, intgroup = "subtype") + ggtitle("PRO-seq subset PCA Top 500") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=1000) + ggtitle("PRO-seq subset PCA Top 1000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=5000) + ggtitle("PRO-seq subset PCA Top 5000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=10000) + ggtitle("PRO-seq subset PCA Top 10000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=15000) + ggtitle("PRO-seq subset PCA Top 15000") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)

plotPCA(rld_sub, intgroup = "subtype", ntop=120888) + ggtitle("PRO-seq subset PCA All Contributors") + scale_color_manual(breaks = c("ATLL", "CTCL", "ALK-ALCL", "SQGD_TCL", "T_LGL", "NKT", "HS_TCL", "PTCL_NOS", "NKL"), values=c("darkgray", "chartreuse3", "cadetblue", "cyan3", "red", "lightpink", "black", "darkgoldenrod1", "blueviolet")) + geom_text_repel(label=rownames(sampleDistMtx_sub), max.overlaps=50)
```

Additional dendrogram plots
```{r}
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMtx_sub, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE,
         cluster_rows=TRUE, cluster_cols=TRUE, main="PRO subset Sample Distance Clustering")
plot(hclust(sampleDists_sub), labels=rownames(sampleDistMtx_sub), main="PRO subset Sample Distance Dendrogram")
```

Generate norm counts file
```{r}
normcounts_sub <- as.data.frame(counts(dds_sub, normalized=TRUE))
colnames(normcounts_sub) <- c("HUT102", "ST1", "Su9T01", "SMZ1", "MYLA", "MAC2A", "FEPD", "KARPAS384", "DL40", "OCILY12", "HUT78", "HH", "MJ", "MTA", "MOTN1", "NKL", "DERL2", "KHYG1", "SeAx")
#write.table(normcounts_pro, file="pro_normcounts.txt", sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE)
```

Assemble counts results
```{r}
avg_sub <- data.frame(peakID=rownames(normcounts_sub))
avg_sub$DERL2 <- (normcounts_sub$DERL2)
avg_sub$DL40 <- (normcounts_sub$DL40)
avg_sub$FEPD <- (normcounts_sub$FEPD)
avg_sub$HH <- (normcounts_sub$HH)
avg_sub$HUT78 <- (normcounts_sub$HUT78)
avg_sub$HUT102 <- (normcounts_sub$HUT102)
avg_sub$KARPAS384 <- (normcounts_sub$KARPAS384)
avg_sub$KHYG1 <- (normcounts_sub$KHYG1)
avg_sub$MAC2A <- (normcounts_sub$MAC2A)
avg_sub$MJ <- (normcounts_sub$MJ)
avg_sub$MOTN1 <- (normcounts_sub$MOTN1)
avg_sub$MTA <- (normcounts_sub$MTA)
avg_sub$MYLA <- (normcounts_sub$MYLA)
avg_sub$NKL <- (normcounts_sub$NKL)
avg_sub$OCILY12 <- (normcounts_sub$OCILY12)
avg_sub$SeAx <- (normcounts_sub$SeAx)
avg_sub$SMZ1 <- (normcounts_sub$SMZ1)
avg_sub$ST1 <- (normcounts_sub$ST1)
avg_sub$Su9T01 <- (normcounts_sub$Su9T01)

avg_sub$max <- apply(avg_sub[c("DERL2", "DL40", "FEPD", "HH", "HUT78", "HUT102", "KARPAS384", "KHYG1", "MAC2A", "MJ", "MOTN1", "MTA", "MYLA", "NKL", "OCILY12", "SeAx", "SMZ1", "ST1", "Su9T01")], 1, max, na.rm=TRUE)
rel_sub <- data.frame(peakID=avg_sub$peakID)
rel_sub$DERL2 <- avg_sub$DERL2 / avg_sub$max
rel_sub$DL40 <- avg_sub$DL40 / avg_sub$max
rel_sub$FEPD <- avg_sub$FEPD / avg_sub$max
rel_sub$HH <- avg_sub$HH / avg_sub$max
rel_sub$HUT78 <- avg_sub$HUT78 / avg_sub$max
rel_sub$HUT102 <- avg_sub$HUT102 / avg_sub$max
rel_sub$KARPAS384 <- avg_sub$KARPAS384 / avg_sub$max
rel_sub$KHYG1 <- avg_sub$KHYG1 / avg_sub$max
rel_sub$MAC2A <- avg_sub$MAC2A / avg_sub$max
rel_sub$MJ <- avg_sub$MJ / avg_sub$max
rel_sub$MOTN1 <- avg_sub$MOTN1 / avg_sub$max
rel_sub$MTA <- avg_sub$MTA / avg_sub$max
rel_sub$MYLA <- avg_sub$MYLA / avg_sub$max
rel_sub$NKL <- avg_sub$NKL / avg_sub$max
rel_sub$OCILY12 <- avg_sub$OCILY12 / avg_sub$max
rel_sub$SeAx <- avg_sub$SeAx / avg_sub$max
rel_sub$SMZ1 <- avg_sub$SMZ1 / avg_sub$max
rel_sub$ST1 <- avg_sub$ST1 / avg_sub$max
rel_sub$Su9T01 <- avg_sub$Su9T01 / avg_sub$max
```

PCA top contributors selection (5000 tried first)
```{r message=FALSE, warning=FALSE}

#check this
colnames(rel_sub) <- c("dREGpeakID", "DERL2.enhancer", "DL40.enhancer", "FEPD.enhancer", "HH.enhancer", "HUT78.ehancer", "HUT102.enhancer", "KARPAS384.enhancer", "KHYG1.enhancer", "MAC2A.ehancer",  "MJ.enhancer", "MOTN1.enhancer", "MTA.enhancer", "MYLA.enhancer", "NKL.enhancer", "OCILY12.enhancer", "SeAx.ehancer", "SMZ1.enhancer", "ST1.enhancer", "Su9T01.enhancer")

rv <- rowVars(assay(rld_sub))
select <- order(rv, decreasing=TRUE)[seq_len(min(5000, length(rv)))] 
#Changing the 5000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_sub)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 5000 pro subset PCA Contributors.txt")
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Merge top 5000 elements with their annotations
top5000 <- dREG_annot[(dREG_annot$peakID %in% rownames(aload)),]
#write.table(top5000, file="Top5000.txt", sep='\t')
#ggplot(top5000, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Top 5000 PCA Contributors dREGpeak Annotations")
#ggplot(dREG_annot, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("All dREGpeak Annotations")

#filter results down by top contributors
sub_5000_rel <- rel_sub[(rel_sub$dREGpeakID %in% top5000$peakID),]
#Gap Stat Calculation
#set.seed(42) 
#This sets the random number generator seed so it will always produce the same result. You can pick whatever number you'd like.
#gap_stat <- clusGap(pro_5000_rel[2:26], FUN = kmeans, nstart = 20, K.max = 24, B = 50) 
#... = number of samples + 1
#Cluster Stat Plots
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="wss", k.max=24) + theme_minimal() + ggtitle("Elbow Plot")
#fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("Gap Statistic")
#fviz_nbclust(pro_5000_rel[2:26], kmeans, method="silhouette", k.max=24) + theme_minimal() + ggtitle("Silhouette Plot")
#k-means Clustering
CLARAk4 <- clara(sub_5000_rel[2:20],4,medoids.x=TRUE)
CLARAk5 <- clara(sub_5000_rel[2:20],5,medoids.x=TRUE)
CLARAk6 <- clara(sub_5000_rel[2:20],6,medoids.x=TRUE)
CLARAk7 <- clara(sub_5000_rel[2:20],7,medoids.x=TRUE)
CLARAk8 <- clara(sub_5000_rel[2:20],8,medoids.x=TRUE)

sub_5000_rel$k4 <- CLARAk4$clustering
sub_5000_rel$k5 <- CLARAk5$clustering
sub_5000_rel$k6 <- CLARAk6$clustering
sub_5000_rel$k7 <- CLARAk7$clustering
sub_5000_rel$k8 <- CLARAk8$clustering

#Displaying Heatmaps
#k-means 4
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+1]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+1])
#k-means 5
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+2]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+2])
#k-means 6
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+3]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+3])
#k-means 7
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+4]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+4])
#k-means 8
pheatmap(sub_5000_rel[order(sub_5000_rel[, 20+5]),][,2:20], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=sub_5000_rel[20+5])
```

--PRO END--

Combined PRO and RNA analysis

Load DESeq objects and annotations
```{r}
#dds_rna <- readRDS("RNA_dds.rds")
#dds_pro <- readRDS("dREG_dds.rds")
dREGann <- read.table("002_TCLs_p1000_m1000000_gene-centric_closest-dREGpeaks.txt", sep="\t", header=T)
write.table(deseq_sizes_pro, file="DEseq size factors.txt", sep="\t", quote=F, row.names=T)
```

Make prelim dataframes
```{r}
norm_rna <- as.data.frame(counts(dds_rna, normalized=TRUE))
rld_rna2 <- rlog(dds_rna, blind=TRUE)

colnames(norm_rna) <- dds_rna$sample #Simplified names from dds metadata

rel_rna2 <- data.frame("geneID"=rownames(norm_rna))
rel_rna2$max.gene.counts <- apply(norm_rna[1:ncol(norm_rna)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_rna)){
  rel_rna2[paste0(dds_rna$sample[i],".gene.rel")] <- norm_rna[,i]/rel_rna2$max.gene.counts
  i=i+1
}

norm_pro <- as.data.frame(counts(dds_pro, normalized=TRUE))

colnames(norm_pro) <- dds_pro$sample #Simplified names from dds metadata
colnames(norm_pro)[2] <- "ST1" #Think there was a typo in this sample name
colnames(norm_rna)[7] <- "KARPAS299"
colnames(norm_rna)[8] <- "KARPAS384"
norm_pro <- norm_pro[names(norm_rna)] #reorder sample columns to match RNA-seq data

colnames(PRO_rel)[1] <- ("closest_peak")
#rel_pro2 <- data.frame("closest_peak"=rownames(norm_pro))
PRO_rel$max.dREG.counts <- apply(norm_pro[1:ncol(norm_pro)], 1, max, na.rm=TRUE)
PRO_rel <- PRO_rel[,c(1,27,2:26)] #reorder columns
#i=1
#while (i<=ncol(norm_pro)){
#  rel_pro2[paste0(dds_pro$sample[i],".dREG.rel")] <- norm_pro[,i]/rel_pro2$max.dREG.counts
#  i=i+1
#}
```

Selecting gene PCA contributors by variance
```{r}
#Select top 10000 PCA contributors
rv <- rowVars(assay(rld_rna2))
select <- order(rv, decreasing=TRUE)[seq_len(min(10000, length(rv)))] #Changing the 500 here will select a different number of top contributors if desired
pc <- prcomp(t(assay(rld_rna2)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
sweep(aload, 2, colSums(aload), "/")
 
#Clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
 
#Merge top 500 elements with their annotations
top10k_rel_rna <- rel_rna2[(rel_rna2$geneID %in% rownames(aload)),] #filter rel gene dataframe for top 10k
```

Determining clusters (k=30)
```{r}
CLARA <- clara(top10k_rel_rna[3:27],30,medoids.x=TRUE)

top10k_rel_rna$k30 <- CLARA$clustering

pheatmap(top10k_rel_rna[order(top10k_rel_rna[,28]),][,3:27], cluster_cols=TRUE, cluster_rows=FALSE, border_color=NA, show_rownames=FALSE, annotation_row=top10k_rel_rna[28])
```

Data integration
```{r}
gene_res <- rel_rna2[(!rel_rna2$geneID %in% top10k_rel_rna$geneID),] #filter out genes not in top 10k
gene_res$k30 <- NA #add value for genes not clustered
gene_res <- rbind(top10k_rel_rna, gene_res) #combine with top 10k genes with cluster designations
#gene_res$geneID <- sapply(strsplit(gene_res$geneID, split="[.]"), head, 1) #simplify geneIDs by removing version numbers

comb_res <- unique(merge(dREGann, gene_res, by.x="z", by.y="geneID", all.y=T))
colnames(comb_res)[1] <- "geneID" 
comb_res <- merge(comb_res, PRO_rel, by.x="closest_peak", by.y="closest_peak", all.x=T) #combine with dREG peak data, keep genes without a closest dREG peak call

comb_res <- comb_res[,c(2:9,39,13:38,10:12,1,40:65)] #reorder columns
```

Gene-to-Enhancer concordance analysis -- this generates a stat ranging from -1 (anti-correlated) to 0 (not correlated) to +1 (correlated) reported as Spearman rho
```{r message=F, warning=F}
#note this will produce a lot of errors for rows that don't have a gene and a an enhancer, so I've silenced them here
spear <- c()
i <- 1
while(i<=nrow(comb_res)){
  speari <- cor(as.numeric(factor(comb_res[i,11:35])),as.numeric(factor(comb_res[i,41:65])),method="spearman")
  spear <- c(spear, speari)
  i <- i+1
}

comb_res$Spearman.rho <- spear
```

Save output
```{r}
write.table(comb_res, file="weinstock002.final.table.txt", sep="\t", quote=F, row.names=F)
dREGcountsann <- merge(PRO_rel, dREG_readFilt, by.x="closest_peak", by.y="dREGpeakID")
dREGcountsann <- dREGcountsann[,c(1,28:33,2:27)]
write.table(dREGcountsann, file="weinstock002.dREGs.txt", sep="\t", quote=F, row.names=F)
```

Correct output to include metadata for genes without nearest enhancer calls
```{r}
library(dplyr)

#remove columns with annotation issue
comb_res_woEnh <- comb_res %>% filter(is.na(geneName))
comb_res_wEnh <- comb_res %>% filter(!is.na(geneName))

#Load original unfiltered annotations
comp_gtf <- read.table("Homo_sapiens.GRCh38.99.ucsc.gtf", sep="\t")
colnames(comp_gtf) <- c("chr", "source", "type", "start", "end", "score", "strand", "phase", "attributes")

#Function to extract metadata from gtf annotation
extract_attributes <- function(gtf_attributes, att_of_interest){
  att <- strsplit(as.character(gtf_attributes), "; ")
  att <- gsub("\"","",unlist(att))
  att <- gsub(";","",unlist(att))
  if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
    return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
  }else{
    return(NA)}
}

#Extract desired information from gtf (geneID, geneName, biotype)
comp_gene <- filter(comp_gtf, type == "gene")
comp_gene$geneID <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_id"))
comp_gene$geneName <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_name"))
comp_gene$biotype <- unlist(lapply(comp_gene$attributes, extract_attributes, "gene_biotype"))

#Remove offending columns
#geneName,biotype,attributes,chr,start,end,strand
comb_res_woEnh <- comb_res_woEnh[,c(1,9:66)]

#Add back information
geneInfo <- data.frame("geneID"=comp_gene$geneID, "geneName"=comp_gene$geneName, "biotype"=comp_gene$biotype, "attributes"=comp_gene$attributes, "chr"=comp_gene$chr, "start"=comp_gene$start, "end"=comp_gene$end, "strand"=comp_gene$strand)
comb_res_woEnh_corr <- merge(geneInfo, comb_res_woEnh, by.x="geneID", by.y="geneID", all.y=T)

#Modify geneName and peak_counts
comb_res_woEnh_corr$peak_count <- 0
comb_res_woEnh_corr$geneName <- paste0(comb_res_woEnh_corr$geneName)

#Correcting geneNames in original
comb_res_wEnh$geneName <- substring(comb_res_wEnh$geneName, 1)
comb_res_wEnh$geneName <- paste0(comb_res_wEnh$geneName)

#Combining it all together
comb_res_corr <- rbind(comb_res_wEnh, comb_res_woEnh_corr)

write.table(comb_res_corr, file="weinstock.combined.results.corrected.txt", sep="\t", quote=F, row.names=F)
```

Combining full annotation to plot based on genomic category
```{r}
filtered_res <- merge(dREG_annot, comb_res_corr, by.x="peakID", by.y="closest_peak", all.y=F)
write.table(filtered_res, file="Overlaping contribtors.txt", sep='\t', quote=F, row.names=F)
ggplot(filtered_res, aes(x=factor(1), fill=genomicCategory))+geom_bar(width=1)+coord_polar("y") + ggtitle("Overlapping PCA Contributors for Gene Expression and dREGpeak Annotations")
```

Generating normed counts table by top 50K contributors
```{r}
rv <- rowVars(assay(rld_pro))
select <- order(rv, decreasing=TRUE)[seq_len(min(50000, length(rv)))] 
#Changing the 50000 here will select a Different number of top contributors if desired
pc <- prcomp(t(assay(rld_pro)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
write.table(sweep(aload, 2, colSums(aload), "/"), file="Top 50000 PRO Contributors.txt")

top50k_rel_pro <- PRO_rel[(PRO_rel$closest_peak %in% rownames(aload)),] #filter rel gene dataframe for top 50k
CLARA <- clara(top50k_rel_pro[3:27],3,medoids.x=TRUE)

top50k_rel_pro$k3 <- CLARA$clustering

enhan_res <- PRO_rel[(PRO_rel$closest_peak %in% top50k_rel_pro$closest_peak),] #filter out genes not in top 50k
enhan_res$k3 <- NA #add value for genes not clustered
enhan_res <- top50k_rel_pro #combine with top 50k genes with cluster designations
comb_res <- unique(merge(dREG_annot, enhan_res, by.x="peakID", by.y="closest_peak", all.y=T))


```


Combination completed, further investigation into clusters ongoing - 8DEC21