Cluster testing from Weinstock002 ALK+ALCL and ALK-ALCL data 14DEC21
Libraries
library(dendextend)
library(DESeq2)
library(colorspace)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(dplyr)
library(reshape2)
library(plotly)
library(gridExtra)
library(gplots)
library(pals)
library(pheatmap)
library(colorspace)
library(scico)
Full PROseq data set analysis for TCL cell lines
https://talgalili.github.io/dendextend/articles/Cluster_Analysis.html
dds_alcl <- dds_pro[,-c(1,2,3,7,8,13,15,16,17,18,19,20,21,22,23,24)] #Excluding sample numbers of entities that are not Alk+Alcl or Alk-Alcl
estimateSizeFactors(dds_alcl) #Must re-calculate size factors after removing samples
class: DESeqDataSet
dim: 120888 9
metadata(1): version
assays(1): counts
rownames(120888): dREGpeak000003 dREGpeak000004 ... dREGpeak312454 dREGpeak312455
rowData names(0):
colnames(9): KARPAS299 L82 ... DL40 SUDHL1
colData names(3): sample subtype sizeFactor
deseq_sizes_alcl <- as.data.frame(sizeFactors(dds_alcl))
deseq_sizes_alcl
rld_alcl <- rlog(dds_alcl, blind=TRUE)
#colnames(rld_alcl) <- paste0(sampleNames_alcl,"(",subTypes_alcl,")")
sampleDists_alcl <- dist(t(assay(rld_alcl)))
sampleDistMtx_alcl <- as.matrix(sampleDists_alcl)
rownames(sampleDistMtx_alcl) <- paste0(rld_alcl$sample,"_",rld_alcl$subtype)
colnames(sampleDistMtx_alcl) <- NULL
hclust_alcl <- hclust(sampleDists_alcl)
norm_alcl <- as.data.frame(counts(dds_alcl, normalized=TRUE)) #think this is redundant with normcounts_alcl but keeping it for now
colnames(norm_alcl) <- dds_alcl$sample
Do initial dendrograms and determine PCA contributors
plotPCA(rld_alcl, intgroup="subtype")

plotPCA(rld_alcl, intgroup="sample")

plot(hclust_alcl, labels=rownames(sampleDistMtx_alcl), main="PRO-seq Alcl Samples dREG Dendrogram")
pdf("dREG_hclust.sample.pdf")
plot(hclust_alcl, labels=rownames(sampleDistMtx_alcl), main="PRO-seq Alcl Samples dREG Dendrogram")
dev.off()
quartz_off_screen
2

Determine the number of PCA contributors necessary to cleanly separate subtypes
plotPCA(rld_alcl, intgroup="subtype", ntop=100) + ggtitle("Top 100")

plotPCA(rld_alcl, intgroup="subtype", ntop=200) + ggtitle("Top 200")

plotPCA(rld_alcl, intgroup="subtype", ntop=500) + ggtitle("Top 500")

plotPCA(rld_alcl, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")

plotPCA(rld_alcl, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")

plotPCA(rld_alcl, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")

plotPCA(rld_alcl, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")

plotPCA(rld_alcl, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")

plotPCA(rld_alcl, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")

plotPCA(rld_alcl, intgroup="subtype", ntop=120888) + ggtitle("All")
pdf("dREG_pca.all.pdf")
plotPCA(rld_alcl, intgroup="subtype", ntop=100) + ggtitle("Top 100")
plotPCA(rld_alcl, intgroup="subtype", ntop=200) + ggtitle("Top 200")
plotPCA(rld_alcl, intgroup="subtype", ntop=500) + ggtitle("Top 500")
plotPCA(rld_alcl, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
plotPCA(rld_alcl, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
plotPCA(rld_alcl, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
plotPCA(rld_alcl, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
plotPCA(rld_alcl, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
plotPCA(rld_alcl, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
plotPCA(rld_alcl, intgroup="subtype", ntop=77061) + ggtitle("All")
dev.off()
quartz_off_screen
2

#Note: can make this a nicer grid with shared axes?
Rank dREG peaks by PCA contribution and measure variance and cumulative variance
#Load dREG peak annotations
dREG_annot2 <- read.table("../002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
#Define and rank top 1k PCA contributors
rv <- rowVars(assay(rld_alcl))
select <- order(rv, decreasing=TRUE)#[seq_len(min(1000, length(rv)))]
pc <- prcomp(t(assay(rld_alcl)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank_alcl <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Generate list of dREG peak annotations
peakInfo_alcl <- merge(dREG_annot2, pcRank_alcl, by.x="peakID", by.y="peakID")
peakVar_alcl <- data.frame("peakID"=rownames(assay(rld_alcl)))
peakVar_alcl$rlog.variance <- rowVars(as.matrix(assay(rld_alcl)))
peakVar_alcl <- merge(peakInfo_alcl, peakVar_alcl, by.x="peakID", by.y="peakID")
pdf("variance.pdf")
ggplot(peakVar_alcl, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
null device
1
ggplot(peakVar_alcl, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
#add plot of fraction of explained variance.. running sum / total vs rank
peakVar_alcl <- peakVar_alcl[order(peakVar_alcl$PCA.rank),]
peakVar_alcl$cumulative.Variance <- cumsum(peakVar_alcl$rlog.variance)/sum(peakVar_alcl$rlog.variance)
pdf("cumulative_variance.pdf")
ggplot(peakVar_alcl, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
quartz_off_screen
2

ggplot(peakVar_alcl, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")

Filter rld objects by samples… separate “positive” set from “negative” set
#Separate positiveing and negative set
filt <- which(!colnames(rld_alcl) %in% c("MAC2A", "FEPD", "DL40"))
rld_positive <- rld_alcl[,filt]
filt <- which(colnames(rld_alcl) %in% c("MAC2A", "FEPD", "DL40"))
rld_negative <- rld_alcl[,filt]
filt <- which(!colnames(norm_alcl) %in% c("MAC2A", "FEPD", "DL40"))
norm_positive <- norm_alcl[,filt]
filt <- which(colnames(norm_alcl) %in% c("MAC2A", "FEPD", "DL40"))
norm_negative <- norm_alcl[,filt]
Determine the number of PCA contributors necessary to cleanly separate subtypes
Sample grouping looks pretty good between 5k and 10k top contributors. Will look at these cutoffs in more detail.
Rank dREG peaks by PCA contribution and measure variance and cumulative variance
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_positive))
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_positive)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank2 <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Generate list of dREG peak annotations
peakInfo2 <- merge(dREG_annot2, pcRank2, by.x="peakID", by.y="peakID")
peakVar2 <- data.frame("peakID"=rownames(assay(rld_positive)))
peakVar2$rlog.variance <- rowVars(as.matrix(assay(rld_positive)))
peakVar2 <- merge(peakInfo2, peakVar2, by.x="peakID", by.y="peakID")
ggplot(peakVar2, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar2 <- peakVar2[order(peakVar2$PCA.rank),]
peakVar2$cumulative.Variance <- cumsum(peakVar2$rlog.variance)/sum(peakVar2$rlog.variance)
ggplot(peakVar2, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

1k seems reasonable
Filter for top 1k PCA contributors
#filter normalized counts for top 5k PCA contributors
top1k <- filter(peakVar2, PCA.rank<=1000)$peakID
norm_top1k <- norm_positive[which((rownames(norm_positive) %in% top1k)==TRUE),]
rld_top1k_positive <- subset(assay(rld_positive), rownames(assay(rld_positive)) %in% top1k)
rld_top1k_negative <- subset(assay(rld_negative), rownames(assay(rld_negative)) %in% top1k)
rld_top1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% top1k)
Calculate sample distances with new top1k training and matched test sets, then calculate the correlation between the branchpoints of the two
d_positive <- rld_top1k_positive %>% dist %>% hclust %>% as.dendrogram
d_negative <- rld_top1k_negative %>% dist %>% hclust %>% as.dendrogram
d_alcl <- rld_top1k_alcl %>% dist %>% hclust %>% as.dendrogram
#compare clusters excluding unclustered to unclustered alone
d_positive_negative <- dendlist(positive = d_positive, negative = d_negative)
d_positive_negative %>% cor.dendlist
positive negative
positive 1.0000000 0.3108639
negative 0.3108639 1.0000000
d_positive_negative %>% cor.dendlist(method_coef = "spearman")
positive negative
positive 1.0000000 0.3309033
negative 0.3309033 1.0000000
Bk_plot(d_positive, d_negative, k = 2:30, xlim = c(2,30), main = "ALK+ALCL Set vs ALK-ALCL Bk plot")

#compare clusters excluding unclustered to clusters using all samples
d_positive_alcl <- dendlist(positive = d_positive, all = d_alcl)
d_positive_alcl %>% cor.dendlist
positive all
positive 1.0000000 0.7879944
all 0.7879944 1.0000000
d_positive_alcl %>% cor.dendlist(method_coef = "spearman")
positive all
positive 1.000000 0.649814
all 0.649814 1.000000
Bk_plot(d_positive, d_alcl, k = 2:30, xlim = c(2,30), main = "ALK_ALCL Set vs All ALCL Samples Bk plot")

#compare clusters all sample clustering to unclustered alone
d_alcl_negative <- dendlist(all = d_alcl, negative = d_negative)
d_alcl_negative %>% cor.dendlist
all negative
all 1.0000000 0.4397905
negative 0.4397905 1.0000000
d_alcl_negative %>% cor.dendlist(method_coef = "spearman")
all negative
all 1.0000000 0.4374482
negative 0.4374482 1.0000000
Bk_plot(d_alcl, d_negative, k = 2:30, xlim = c(2,30), main = "All ALCL Samples vs. ALK-ALCL Bk plot")

3k clusters?
Now some fun with Tanglegrams…
pre_tang_d_positive_negative <- d_positive_negative %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_positive_negative$positive)
pre_tang_d_positive_negative %>% tanglegram(fast = T, color_lines = positive_branches_colors)

pre_tang_d_positive_alcl <- d_positive_alcl %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_positive_alcl$positive)
pre_tang_d_positive_alcl %>% tanglegram(fast = T, color_lines = positive_branches_colors)

pre_tang_d_alcl_negative <- d_alcl_negative %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_alcl_negative$all)
pre_tang_d_alcl_negative %>% tanglegram(fast = T, color_lines = positive_branches_colors)

The three different sets lead to drastically different cluster designations. Will attempt to look at overlap between cluster sets though it is somewhat arbitrary which samples end up being the hold-outs.
Only highlight genes clustered in common
#d_positive_negative_common <- d_positive_negative %>% prune_common_subtrees.dendlist
#d_positive_negative_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
#d_positive_alcl_common <- d_positive_alcl %>% prune_common_subtrees.dendlist
#d_positive_alcl_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
#d_alcl_negative_common <- d_alcl_negative %>% prune_common_subtrees.dendlist
#d_alcl_negative_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
Quite a few shared leaves between the positiveing set and all. Let’s see what they are…
#d_positive_negative %>% nleaves
#d_positive_negative_common %>% nleaves
#d_positive_alcl %>% nleaves
#d_positive_alcl_common %>% nleaves
#d_alcl_negative %>% nleaves
#d_alcl_negative_common %>% nleaves
n leaves preserved between the subset and all samples.
Next is to figure a way to define expression/activity groups: - k-means as before - or - dynamic branch cutting -
using top 1k PCA contributors (as determined a reasonable cutoff in the variance plots)
#filter normalized counts for top 1k PCA contributors
top1k_alcl <- filter(peakVar_alcl, PCA.rank<=1000)$peakID
bot1k_alcl <- filter(peakVar_alcl, PCA.rank>119888)$peakID
norm_top1k_alcl <- norm_alcl[which((rownames(norm_alcl) %in% top1k_alcl)==TRUE),]
rld_top1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% top1k_alcl)
colnames(norm_top1k_alcl) <- colnames(rld_top1k_alcl)
rld_bot1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% bot1k_alcl)
(row.names(rld_top1k_alcl) %in% row.names(rld_bot1k_alcl))==FALSE #check dREGs pulled into top and bottom do not match- should run T for all
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[58] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[77] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[96] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[115] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[134] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[153] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[172] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[191] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[210] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[229] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[248] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[286] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[305] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[324] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[343] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[362] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[381] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[400] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[419] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[438] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[457] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[476] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[495] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[514] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[533] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[552] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[571] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[590] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[609] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[628] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[647] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[666] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[685] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[704] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[723] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[742] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[761] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[780] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[799] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[818] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[837] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[856] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[875] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[894] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[913] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[932] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[951] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[970] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[989] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#scale() function centers values +/- the mean of each column for normalized counts or rlog normalized counts
scaledata_count_top <- t(scale(t(norm_top1k_alcl))) # Centers and scales data.
scaledata_count_top <- scaledata_count_top[complete.cases(scaledata_count_top),] # Removes rows with nulls
scaledata_rld_top <- t(scale(t(rld_top1k_alcl))) # Centers and scales data.
scaledata_rld_top <- scaledata_rld_top[complete.cases(scaledata_rld_top),] # Removes rows with nulls
scaledata_rld_alcl <- t(scale(t(assay(rld_alcl))))
scaledata_rld_alcl <- scaledata_rld_alcl[complete.cases(scaledata_rld_alcl),]
scaledata_rld_bot <- t(scale(t(rld_bot1k_alcl)))
scaledata_rld_bot <- scaledata_rld_bot[complete.cases(scaledata_rld_bot),]
#also try with relative activity scaling
scaledata_rel_top <- data.frame(row.names=rownames(norm_top1k_alcl))
scaledata_rel_top$max.dREG.counts <- apply(norm_top1k_alcl[1:ncol(norm_top1k_alcl)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_top1k_alcl)){
scaledata_rel_top[paste0(names(norm_top1k_alcl[i]))] <- norm_top1k_alcl[,i]/scaledata_rel_top$max.dREG.counts
i=i+1
}
#Generate column and row hierarchical clustering for each normalization method (all spearman for now)
hr_count <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_count <- hclust(as.dist(1-cor(scaledata_count_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.
hr_rld <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rld <- hclust(as.dist(1-cor(scaledata_rld_alcl, method="spearman")), method="complete") # Clusters columns by Spearman correlation.
#hr_bot <- hclust(as.dist(1-cor(t(scaledata_rld_bot), method="spearman")), method="complete")
hr_rel <- hclust(as.dist(1-cor(t(scaledata_rel_top[,c(2:10)]), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rel <- hclust(as.dist(1-cor(scaledata_rel_top[,c(2:10)], method="spearman")), method="complete") # Clusters columns by Spearman correlation.
Dynamic Branch Cutting negative (3-5)
library(dendextend)
cnt_cutk3 <- cutree(hr_count,k=3) #cut at specific height h or cut for a specific number of clusters k
cnt_cutk4 <- cutree(hr_count,k=4)
cnt_cutk5 <- cutree(hr_count,k=5)
rld_cutk2 <- cutree(hr_rld,k=2)
rld_cutk3 <- cutree(hr_rld,k=3)
rld_cutk4 <- cutree(hr_rld,k=4)
rld_cutk5 <- cutree(hr_rld,k=5)
rld_cutk6 <- cutree(hr_rld,k=6)
rld_cutk7 <- cutree(hr_rld,k=7)
rld_cutk8 <- cutree(hr_rld,k=8)
rld_cutk9 <- cutree(hr_rld,k=9)
rld_bot1k_alcl <- as.integer(rep(0, 5000))
names(rld_bot1k_alcl) <- bot1k_alcl
rel_cutk3 <- cutree(hr_rel,k=3)
rel_cutk4 <- cutree(hr_rel,k=4)
rel_cutk5 <- cutree(hr_rel,k=5)
#plot the tree
TreeR_cnt = as.dendrogram(hr_count, method="average")
TreeR_rld = as.dendrogram(hr_rld, method="average")
TreeR_rel = as.dendrogram(hr_rel, method="average")
#define annotation bars
the_bars_k3 <- cbind(cnt_cutk3, rld_cutk3, rel_cutk3)
the_bars_k4 <- cbind(cnt_cutk4, rld_cutk4, rel_cutk4)
the_bars_k5 <- cbind(cnt_cutk5, rld_cutk5, rel_cutk5)
the_bars_cnt <- cbind(cnt_cutk3, cnt_cutk4, cnt_cutk5)
the_bars_rld <- cbind(rld_cutk3, rld_cutk4, rld_cutk5)
the_bars_rel <- cbind(rel_cutk3, rel_cutk4, rel_cutk5)
Compare clusters
plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=3",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k3, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=4",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k4, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=5",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k5, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

Clustering is exactly the same regardless of method now!! This is great!
Compare clusters
plot(TreeR_cnt,
leaflab = "none",
main = "dREG Peak Clustering (counts)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_cnt, TreeR_cnt, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_rld, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rel,
leaflab = "none",
main = "dREG Peak Clustering (relative activity)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_rel, TreeR_rel, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

Let’s see how well these clusters separate expression groups associated with different cell line subtypes…
data_alcl <- c( "KARPAS299_ALK+ALCL",
"L82_ALK+ALCL",
"SUPM2_ALK+ALCL",
"MAC2A_ALK-ALCL",
"KIJK_ALK+ALCL",
"SR786_ALK+ALCL",
"FEPD_ALK-ALCL",
"DL40_ALK-ALCL",
"SUDHL1_ALK+ALCL")
#Defining box plot functions
clust.core = function(i, dat, clusters) {
ind = (clusters == i)
colMeans(dat[ind,])
}
box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)
#make a dataframe of cores by subtype
d <- data.frame(cbind(data_alcl, cores)) #subType list from beginning!
colnames(d) <-c("sample", paste0("clust_",1:ncol(cores)))
#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "sample")
#order by subtype
dmolten <- dmolten[order(dmolten$sample),]
dmolten$sample <- ordered(dmolten$sample, levels=unique(dmolten$sample))
# make the plot
p1 <- ggplot(dmolten, aes(x=sample, y=as.numeric(value), col=variable)) +
geom_boxplot() +
# geom_point() +
# geom_line() +
# scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
# scale_y_continuous() +
xlab("Sample") +
ylab("Activity") +
labs(title= paste0("Cluster Core Expression by Sample (",title,")"),color = "Cluster")
return(p1)
}
Make boxplots
box(rld_cutk2, scaledata_rld_alcl, "cutk2 rlog")

box(rld_cutk3, scaledata_rld_alcl, "cutk3 rlog")

box(rld_cutk4, scaledata_rld_alcl, "cutk4 rlog")

box(rld_cutk5, scaledata_rld_alcl, "cutk5 rlog")

box(rld_cutk6, scaledata_rld_alcl, "cutk6 rlog")

box(rld_cutk7, scaledata_rld_alcl, "cutk7 rlog")

box(rld_cutk8, scaledata_rld_alcl, "cutk8 rlog")

box(rld_cutk9, scaledata_rld_alcl, "cutk9 rlog")

box(c(rld_cutk3,rld_bot1k_alcl), rbind(scaledata_rld_alcl,scaledata_rld_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:10], "cutk3 rel")

box(rel_cutk4, scaledata_rel_top[,2:10], "cutk4 rel")

box(rel_cutk5, scaledata_rel_top[,2:10], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")

box(rel_cutk4, scaledata_count_top, "cutk4 count")

box(rel_cutk5, scaledata_count_top, "cutk5 count")

Properly by subtype instead of individual samples
data_alcl <- c( "ALK+ALCL",
"ALK+ALCL",
"ALK+ALCL",
"ALK-ALCL",
"ALK+ALCL",
"ALK+ALCL",
"ALK-ALCL",
"ALK-ALCL",
"ALK+ALCL")
#Defining box plot functions
clust.core = function(i, dat, clusters) {
ind = (clusters == i)
colMeans(dat[ind,])
}
box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)
#make a dataframe of cores by subtype
d <- data.frame(cbind(data_alcl, cores)) #subType list from beginning!
colnames(d) <-c("subtype", paste0("clust_",1:ncol(cores)))
#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "subtype")
#order by subtype
dmolten <- dmolten[order(dmolten$subtype),]
dmolten$subtype <- ordered(dmolten$subtype, levels=unique(dmolten$subtype))
# make the plot
p1 <- ggplot(dmolten, aes(x=subtype, y=as.numeric(value), col=variable)) +
geom_boxplot() +
# geom_point() +
# geom_line() +
# scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
# scale_y_continuous() +
xlab("Subtype") +
ylab("Activity") +
labs(title= paste0("Cluster Core Expression by Subtype (",title,")"),color = "Cluster")
return(p1)
}
Make boxplots
box(rld_cutk2, scaledata_rld_alcl, "cutk2 rlog")

box(rld_cutk3, scaledata_rld_alcl, "cutk3 rlog")

box(rld_cutk4, scaledata_rld_alcl, "cutk4 rlog")

box(rld_cutk5, scaledata_rld_alcl, "cutk5 rlog")

box(rld_cutk6, scaledata_rld_alcl, "cutk6 rlog")

box(rld_cutk7, scaledata_rld_alcl, "cutk7 rlog")

box(rld_cutk8, scaledata_rld_alcl, "cutk8 rlog")

box(rld_cutk9, scaledata_rld_alcl, "cutk9 rlog")

box(c(rld_cutk3,rld_bot1k_alcl), rbind(scaledata_rld_alcl,scaledata_rld_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:10], "cutk3 rel")

box(rel_cutk4, scaledata_rel_top[,2:10], "cutk4 rel")

box(rel_cutk5, scaledata_rel_top[,2:10], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")

box(rel_cutk4, scaledata_count_top, "cutk4 count")

box(rel_cutk5, scaledata_count_top, "cutk5 count")

Can’t make much sense of unique clustering between the samples or their subtypes. Considering subsetting out original data into clustering and nonclustering samples and moving on with individual cluster treatments for each from there.
Want to make a heatmap with the above clusters and a cluster-trait correlation heatmap…
Trait correlation (edit for this dataset)
traits <- data.frame(row.names=colnames(scaledata_rld_alcl))
#here I'm adding vectors of subtype traits to the dataframe
traits$'ALK+ALCL' <- c(1,1,1,0,1,1,0,0,1) #corrected
traits$'ALK-ALCL' <- c(0,0,0,1,0,0,1,1,0) #corrected
#Extract the gene/sample numbers
nPeaks = nrow(scaledata_rld_alcl)
nSamples = ncol(scaledata_rld_alcl)
#p value calculation from WGCNA
corPvalueStudent = function(cor, nSamples) {
T=sqrt(nSamples-2) * cor/sqrt(1-cor^2)
2*pt(abs(T),nSamples-2, lower.tail = FALSE)
}
Make correlation heatmaps
nSamples=9
#Finalize clustering and branch cuts
clusters <- rld_cutk2
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=2)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk3
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=3)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk4
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=4)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk5
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=5)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk6
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=6)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk7
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=7)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk8
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=8)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk9
cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=9)",
xlab = "Traits",
ylab = "Clusters")

#clusters <- c(rld_cutk4,rld_bop5k_alcl)
#cores <- sapply(unique(clusters), clust.core, rbind(scaledata_rld_top, scaledata_bot), clusters)
#correlate the cores to traits:
#moduleTraitCor = cor(cores, traits, use= "p")
#moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
#textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
#signif(moduleTraitPvalue, 1), ")", sep= "")
#dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
#heatmap.2(moduleTraitCor,
#dendrogram = "none",
#Colv =FALSE,
#scale = c("none"),
#col="heat.colors",
#na.rm=TRUE,
#cellnote = textMatrix,
#notecol="grey30",
#notecex=1,
#trace=c("none"),
#cexRow = 0.8,
#cexCol = 0.8,
#main = "Cluster-Trait Correlation (k=4 + Static [5k])",
#xlab = "Traits",
#ylab = "Clusters")
Make annotated scaled rlog heatmap
rld_df <- as.data.frame(scaledata_rld_top)
rld_df$k3 <- as.data.frame(rld_cutk3)$rld_cutk3
rld_df$k4 <- as.data.frame(rld_cutk4)$rld_cutk4
colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/
pheatmap(rld_top1k_alcl,
#rld_df[order(rld_df[,12]),][,1:11],
#gaps_row = c(879,2298,4213),
cluster_cols=T,
cluster_rows=hr_rld,
#clustering_distance_rows = "euclidean",
#clustering_method = "complete",
scale="row",
cutree_rows=4,
cutree_cols=3,
border_color=NA,
show_rownames=F,
main="Top 1k dREG Peak PCA Contributors Hierarchical",
breaks=seq(-2,2,by=0.01),
color=ocean.balance(401),
#color=scico(401, palette = 'berlin'),
#color=colorRampPalette(c("blue", "white", "red"))(41),
annotation_row=rld_df[10:11],
annotation_col=data.frame(row.names=colnames(rld_df[,1:9]), Subtype = data_alcl),
annotation_colors=list(k3=c("blue", "green", "red"),
k4=c("purple", "blue", "green", "goldenrod1"),
Subtype=c("ALK+ALCL"= 'chocolate4', "ALK-ALCL"= 'cadetblue'))
)

---
title: "ALCL CLustering"
output: html_notebook
---

Cluster testing from Weinstock002 ALK+ALCL and ALK-ALCL data 14DEC21

Libraries
```{r}
library(dendextend)
library(DESeq2)
library(colorspace)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(dplyr)
library(reshape2)
library(plotly)
library(gridExtra)
library(gplots)
library(pals)
library(pheatmap)
library(colorspace)
library(scico)
```

Full PROseq data set analysis for TCL cell lines

https://talgalili.github.io/dendextend/articles/Cluster_Analysis.html
```{r}
dds_alcl <- dds_pro[,-c(1,2,3,7,8,13,15,16,17,18,19,20,21,22,23,24)] #Excluding sample numbers of entities that are not Alk+Alcl or Alk-Alcl
estimateSizeFactors(dds_alcl) #Must re-calculate size factors after removing samples
deseq_sizes_alcl <- as.data.frame(sizeFactors(dds_alcl))
deseq_sizes_alcl 
rld_alcl <- rlog(dds_alcl, blind=TRUE)
#colnames(rld_alcl) <- paste0(sampleNames_alcl,"(",subTypes_alcl,")")
sampleDists_alcl <- dist(t(assay(rld_alcl)))
sampleDistMtx_alcl <- as.matrix(sampleDists_alcl)
rownames(sampleDistMtx_alcl) <- paste0(rld_alcl$sample,"_",rld_alcl$subtype)
colnames(sampleDistMtx_alcl) <- NULL
hclust_alcl <- hclust(sampleDists_alcl)

norm_alcl <- as.data.frame(counts(dds_alcl, normalized=TRUE)) #think this is redundant with normcounts_alcl but keeping it for now
colnames(norm_alcl) <- dds_alcl$sample
```

Do initial dendrograms and determine PCA contributors
```{r}
plotPCA(rld_alcl, intgroup="subtype") 
plotPCA(rld_alcl, intgroup="sample") 
plot(hclust_alcl, labels=rownames(sampleDistMtx_alcl), main="PRO-seq Alcl Samples dREG Dendrogram")
pdf("dREG_hclust.sample.pdf")
plot(hclust_alcl, labels=rownames(sampleDistMtx_alcl), main="PRO-seq Alcl Samples dREG Dendrogram")
dev.off()
```


Determine the number of PCA contributors necessary to cleanly separate subtypes
```{r}
  plotPCA(rld_alcl, intgroup="subtype", ntop=100) + ggtitle("Top 100")
  plotPCA(rld_alcl, intgroup="subtype", ntop=200) + ggtitle("Top 200")
  plotPCA(rld_alcl, intgroup="subtype", ntop=500) + ggtitle("Top 500")
  plotPCA(rld_alcl, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=120888) + ggtitle("All")
  
pdf("dREG_pca.all.pdf")
  plotPCA(rld_alcl, intgroup="subtype", ntop=100) + ggtitle("Top 100")
  plotPCA(rld_alcl, intgroup="subtype", ntop=200) + ggtitle("Top 200")
  plotPCA(rld_alcl, intgroup="subtype", ntop=500) + ggtitle("Top 500")
  plotPCA(rld_alcl, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
  plotPCA(rld_alcl, intgroup="subtype", ntop=77061) + ggtitle("All")
dev.off()
#Note: can make this a nicer grid with shared axes?
```

Rank dREG peaks by PCA contribution and measure variance and cumulative variance
```{r}
#Load dREG peak annotations
dREG_annot2 <- read.table("../002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
#Define and rank top 1k PCA contributors
rv <- rowVars(assay(rld_alcl)) 
select <- order(rv, decreasing=TRUE)#[seq_len(min(1000, length(rv)))]
pc <- prcomp(t(assay(rld_alcl)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank_alcl <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)

#Generate list of dREG peak annotations
peakInfo_alcl <- merge(dREG_annot2, pcRank_alcl, by.x="peakID", by.y="peakID")
peakVar_alcl <- data.frame("peakID"=rownames(assay(rld_alcl)))
peakVar_alcl$rlog.variance <- rowVars(as.matrix(assay(rld_alcl)))
peakVar_alcl <- merge(peakInfo_alcl, peakVar_alcl, by.x="peakID", by.y="peakID")
pdf("variance.pdf")
ggplot(peakVar_alcl, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
ggplot(peakVar_alcl, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar_alcl <- peakVar_alcl[order(peakVar_alcl$PCA.rank),]
peakVar_alcl$cumulative.Variance <- cumsum(peakVar_alcl$rlog.variance)/sum(peakVar_alcl$rlog.variance)
pdf("cumulative_variance.pdf")
ggplot(peakVar_alcl, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
ggplot(peakVar_alcl, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
```


Filter rld objects by samples... separate "positive" set from "negative" set 
```{r}
#Separate positiveing and negative set
filt <- which(!colnames(rld_alcl) %in% c("MAC2A", "FEPD", "DL40"))
rld_positive <- rld_alcl[,filt]

filt <- which(colnames(rld_alcl) %in% c("MAC2A", "FEPD", "DL40"))
rld_negative <- rld_alcl[,filt]

filt <- which(!colnames(norm_alcl) %in% c("MAC2A", "FEPD", "DL40"))
norm_positive <- norm_alcl[,filt]

filt <- which(colnames(norm_alcl) %in% c("MAC2A", "FEPD", "DL40"))
norm_negative <- norm_alcl[,filt]
```

Determine the number of PCA contributors necessary to cleanly separate subtypes

Sample grouping looks pretty good between 5k and 10k top contributors. Will look at these cutoffs in more detail.

Rank dREG peaks by PCA contribution and measure variance and cumulative variance
```{r}
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_positive)) 
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_positive)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank2 <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)

#Generate list of dREG peak annotations
peakInfo2 <- merge(dREG_annot2, pcRank2, by.x="peakID", by.y="peakID")
peakVar2 <- data.frame("peakID"=rownames(assay(rld_positive)))
peakVar2$rlog.variance <- rowVars(as.matrix(assay(rld_positive)))
peakVar2 <- merge(peakInfo2, peakVar2, by.x="peakID", by.y="peakID")
ggplot(peakVar2, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar2 <- peakVar2[order(peakVar2$PCA.rank),]
peakVar2$cumulative.Variance <- cumsum(peakVar2$rlog.variance)/sum(peakVar2$rlog.variance)
ggplot(peakVar2, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")
```

1k seems reasonable

Filter for top 1k PCA contributors
```{r}
#filter normalized counts for top 5k PCA contributors
top1k <- filter(peakVar2, PCA.rank<=1000)$peakID
norm_top1k <- norm_positive[which((rownames(norm_positive) %in% top1k)==TRUE),]
rld_top1k_positive <- subset(assay(rld_positive), rownames(assay(rld_positive)) %in% top1k)
rld_top1k_negative <- subset(assay(rld_negative), rownames(assay(rld_negative)) %in% top1k)
rld_top1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% top1k)
```

Calculate sample distances with new top1k training and matched test sets, then calculate the correlation between the branchpoints of the two
```{r}
d_positive <- rld_top1k_positive %>% dist %>% hclust %>% as.dendrogram
d_negative <- rld_top1k_negative %>% dist %>% hclust %>% as.dendrogram
d_alcl <- rld_top1k_alcl %>% dist %>% hclust %>% as.dendrogram

#compare clusters excluding unclustered to unclustered alone
d_positive_negative <- dendlist(positive = d_positive, negative = d_negative)
d_positive_negative %>% cor.dendlist
d_positive_negative %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_positive, d_negative, k = 2:30, xlim = c(2,30), main = "ALK+ALCL Set vs ALK-ALCL Bk plot")

#compare clusters excluding unclustered to clusters using all samples
d_positive_alcl <- dendlist(positive = d_positive, all = d_alcl)
d_positive_alcl %>% cor.dendlist
d_positive_alcl %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_positive, d_alcl, k = 2:30, xlim = c(2,30), main = "ALK_ALCL Set vs All ALCL Samples Bk plot")

#compare clusters all sample clustering to unclustered alone
d_alcl_negative <- dendlist(all = d_alcl, negative = d_negative)
d_alcl_negative %>% cor.dendlist
d_alcl_negative %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_alcl, d_negative, k = 2:30, xlim = c(2,30), main = "All ALCL Samples vs. ALK-ALCL Bk plot")
```

3k clusters?

Now some fun with Tanglegrams...
```{r}
pre_tang_d_positive_negative <- d_positive_negative %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_positive_negative$positive)
pre_tang_d_positive_negative %>% tanglegram(fast = T, color_lines = positive_branches_colors)

pre_tang_d_positive_alcl <- d_positive_alcl %>% ladderize %>%  untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_positive_alcl$positive)
pre_tang_d_positive_alcl %>% tanglegram(fast = T, color_lines = positive_branches_colors)

pre_tang_d_alcl_negative <- d_alcl_negative %>% ladderize %>%  untangle %>% set("branches_k_color", k = 3)
positive_branches_colors <- get_leaves_branches_col(pre_tang_d_alcl_negative$all)
pre_tang_d_alcl_negative %>% tanglegram(fast = T, color_lines = positive_branches_colors)
```

The three different sets lead to drastically different cluster designations. Will attempt to look at overlap between cluster sets though it is somewhat arbitrary which samples end up being the hold-outs.

Only highlight genes clustered in common 
```{r}
#d_positive_negative_common <- d_positive_negative %>% prune_common_subtrees.dendlist
#d_positive_negative_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)

#d_positive_alcl_common <- d_positive_alcl %>% prune_common_subtrees.dendlist
#d_positive_alcl_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)

#d_alcl_negative_common <- d_alcl_negative %>% prune_common_subtrees.dendlist
#d_alcl_negative_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)
```

Quite a few shared leaves between the positiveing set and all. Let's see what they are... 
```{r}
#d_positive_negative %>% nleaves
#d_positive_negative_common %>% nleaves

#d_positive_alcl %>% nleaves
#d_positive_alcl_common %>% nleaves

#d_alcl_negative %>% nleaves
#d_alcl_negative_common %>% nleaves
```
n leaves preserved between the subset and all samples.

Next is to figure a way to define expression/activity groups:
- k-means as before -
or
- dynamic branch cutting -

using top 1k PCA contributors (as determined a reasonable cutoff in the variance plots)
```{r}
#filter normalized counts for top 1k PCA contributors
top1k_alcl <- filter(peakVar_alcl, PCA.rank<=1000)$peakID
bot1k_alcl <- filter(peakVar_alcl, PCA.rank>119888)$peakID
norm_top1k_alcl <- norm_alcl[which((rownames(norm_alcl) %in% top1k_alcl)==TRUE),]

rld_top1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% top1k_alcl)
colnames(norm_top1k_alcl) <- colnames(rld_top1k_alcl)
rld_bot1k_alcl <- subset(assay(rld_alcl), rownames(assay(rld_alcl)) %in% bot1k_alcl)
(row.names(rld_top1k_alcl) %in% row.names(rld_bot1k_alcl))==FALSE #check dREGs pulled into top and bottom do not match- should run T for all

#scale() function centers values +/- the mean of each column for normalized counts or rlog normalized counts
scaledata_count_top <- t(scale(t(norm_top1k_alcl))) # Centers and scales data.
scaledata_count_top <- scaledata_count_top[complete.cases(scaledata_count_top),] # Removes rows with nulls
scaledata_rld_top <- t(scale(t(rld_top1k_alcl))) # Centers and scales data.
scaledata_rld_top <- scaledata_rld_top[complete.cases(scaledata_rld_top),] # Removes rows with nulls

scaledata_rld_alcl <- t(scale(t(assay(rld_alcl))))
scaledata_rld_alcl <- scaledata_rld_alcl[complete.cases(scaledata_rld_alcl),]

scaledata_rld_bot <- t(scale(t(rld_bot1k_alcl)))
scaledata_rld_bot <- scaledata_rld_bot[complete.cases(scaledata_rld_bot),] 

#also try with relative activity scaling
scaledata_rel_top <- data.frame(row.names=rownames(norm_top1k_alcl))
scaledata_rel_top$max.dREG.counts <- apply(norm_top1k_alcl[1:ncol(norm_top1k_alcl)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_top1k_alcl)){
  scaledata_rel_top[paste0(names(norm_top1k_alcl[i]))] <- norm_top1k_alcl[,i]/scaledata_rel_top$max.dREG.counts
  i=i+1
}

#Generate column and row hierarchical clustering for each normalization method (all spearman for now)
hr_count <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_count <- hclust(as.dist(1-cor(scaledata_count_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.

hr_rld <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rld <- hclust(as.dist(1-cor(scaledata_rld_alcl, method="spearman")), method="complete") # Clusters columns by Spearman correlation.

#hr_bot <- hclust(as.dist(1-cor(t(scaledata_rld_bot), method="spearman")), method="complete")

hr_rel <- hclust(as.dist(1-cor(t(scaledata_rel_top[,c(2:10)]), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rel <- hclust(as.dist(1-cor(scaledata_rel_top[,c(2:10)], method="spearman")), method="complete") # Clusters columns by Spearman correlation.
```

Dynamic Branch Cutting negative (3-5)
```{r}
library(dendextend)

cnt_cutk3 <- cutree(hr_count,k=3) #cut at specific height h or cut for a specific number of clusters k
cnt_cutk4 <- cutree(hr_count,k=4)
cnt_cutk5 <- cutree(hr_count,k=5)

rld_cutk2 <- cutree(hr_rld,k=2)
rld_cutk3 <- cutree(hr_rld,k=3)
rld_cutk4 <- cutree(hr_rld,k=4)
rld_cutk5 <- cutree(hr_rld,k=5)
rld_cutk6 <- cutree(hr_rld,k=6)
rld_cutk7 <- cutree(hr_rld,k=7)
rld_cutk8 <- cutree(hr_rld,k=8)
rld_cutk9 <- cutree(hr_rld,k=9)

rld_bot1k_alcl <- as.integer(rep(0, 5000))
names(rld_bot1k_alcl) <- bot1k_alcl

rel_cutk3 <- cutree(hr_rel,k=3)
rel_cutk4 <- cutree(hr_rel,k=4)
rel_cutk5 <- cutree(hr_rel,k=5)

#plot the tree
TreeR_cnt = as.dendrogram(hr_count, method="average")
TreeR_rld = as.dendrogram(hr_rld, method="average")
TreeR_rel = as.dendrogram(hr_rel, method="average")

#define annotation bars
the_bars_k3 <- cbind(cnt_cutk3, rld_cutk3, rel_cutk3)
the_bars_k4 <- cbind(cnt_cutk4, rld_cutk4, rel_cutk4)
the_bars_k5 <- cbind(cnt_cutk5, rld_cutk5, rel_cutk5)

the_bars_cnt <- cbind(cnt_cutk3, cnt_cutk4, cnt_cutk5)
the_bars_rld <- cbind(rld_cutk3, rld_cutk4, rld_cutk5)
the_bars_rel <- cbind(rel_cutk3, rel_cutk4, rel_cutk5)
```

Compare clusters
```{r}
plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=3",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k3, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=4",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k4, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=5",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k5, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)
```

Clustering is exactly the same regardless of method now!! This is great!

Compare clusters
```{r}
plot(TreeR_cnt,
     leaflab = "none",
     main = "dREG Peak Clustering (counts)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_cnt, TreeR_cnt, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_rld, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rel,
     leaflab = "none",
     main = "dREG Peak Clustering (relative activity)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_rel, TreeR_rel, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)
```

Let's see how well these clusters separate expression groups associated with different cell line subtypes...
```{r}
data_alcl <- c( "KARPAS299_ALK+ALCL", 
                 "L82_ALK+ALCL",
                 "SUPM2_ALK+ALCL",
                 "MAC2A_ALK-ALCL",
                 "KIJK_ALK+ALCL",
                 "SR786_ALK+ALCL",
              "FEPD_ALK-ALCL",
                 "DL40_ALK-ALCL", 
              "SUDHL1_ALK+ALCL")

#Defining box plot functions
clust.core = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)

#make a dataframe of cores by subtype
d <- data.frame(cbind(data_alcl, cores)) #subType list from beginning!
colnames(d) <-c("sample", paste0("clust_",1:ncol(cores)))

#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "sample")
#order by subtype
dmolten <- dmolten[order(dmolten$sample),]
dmolten$sample <- ordered(dmolten$sample, levels=unique(dmolten$sample))

# make the plot
p1 <- ggplot(dmolten, aes(x=sample, y=as.numeric(value), col=variable)) + 
  geom_boxplot() +
#  geom_point() + 
#  geom_line() +
#  scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
#  scale_y_continuous() +
  xlab("Sample") +
  ylab("Activity") +
  labs(title= paste0("Cluster Core Expression by Sample (",title,")"),color = "Cluster")
return(p1)
}
```

Make boxplots
```{r}
box(rld_cutk2, scaledata_rld_alcl, "cutk2 rlog")
box(rld_cutk3, scaledata_rld_alcl, "cutk3 rlog")
box(rld_cutk4, scaledata_rld_alcl, "cutk4 rlog")
box(rld_cutk5, scaledata_rld_alcl, "cutk5 rlog")
box(rld_cutk6, scaledata_rld_alcl, "cutk6 rlog")
box(rld_cutk7, scaledata_rld_alcl, "cutk7 rlog")
box(rld_cutk8, scaledata_rld_alcl, "cutk8 rlog")
box(rld_cutk9, scaledata_rld_alcl, "cutk9 rlog")

box(c(rld_cutk3,rld_bot1k_alcl), rbind(scaledata_rld_alcl,scaledata_rld_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:10], "cutk3 rel")
box(rel_cutk4, scaledata_rel_top[,2:10], "cutk4 rel")
box(rel_cutk5, scaledata_rel_top[,2:10], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")
box(rel_cutk4, scaledata_count_top, "cutk4 count")
box(rel_cutk5, scaledata_count_top, "cutk5 count")
```

Properly by subtype instead of individual samples
```{r}
data_alcl <- c(  "ALK+ALCL", 
                 "ALK+ALCL",
                 "ALK+ALCL",
                 "ALK-ALCL",
                 "ALK+ALCL",
                 "ALK+ALCL",
              "ALK-ALCL",
                 "ALK-ALCL",
              "ALK+ALCL")

#Defining box plot functions
clust.core = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)

#make a dataframe of cores by subtype
d <- data.frame(cbind(data_alcl, cores)) #subType list from beginning!
colnames(d) <-c("subtype", paste0("clust_",1:ncol(cores)))

#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "subtype")
#order by subtype
dmolten <- dmolten[order(dmolten$subtype),]
dmolten$subtype <- ordered(dmolten$subtype, levels=unique(dmolten$subtype))

# make the plot
p1 <- ggplot(dmolten, aes(x=subtype, y=as.numeric(value), col=variable)) + 
  geom_boxplot() +
#  geom_point() + 
#  geom_line() +
#  scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
#  scale_y_continuous() +
  xlab("Subtype") +
  ylab("Activity") +
  labs(title= paste0("Cluster Core Expression by Subtype (",title,")"),color = "Cluster")
return(p1)
}
```

Make boxplots
```{r}
box(rld_cutk2, scaledata_rld_alcl, "cutk2 rlog")
box(rld_cutk3, scaledata_rld_alcl, "cutk3 rlog")
box(rld_cutk4, scaledata_rld_alcl, "cutk4 rlog")
box(rld_cutk5, scaledata_rld_alcl, "cutk5 rlog")
box(rld_cutk6, scaledata_rld_alcl, "cutk6 rlog")
box(rld_cutk7, scaledata_rld_alcl, "cutk7 rlog")
box(rld_cutk8, scaledata_rld_alcl, "cutk8 rlog")
box(rld_cutk9, scaledata_rld_alcl, "cutk9 rlog")

box(c(rld_cutk3,rld_bot1k_alcl), rbind(scaledata_rld_alcl,scaledata_rld_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:10], "cutk3 rel")
box(rel_cutk4, scaledata_rel_top[,2:10], "cutk4 rel")
box(rel_cutk5, scaledata_rel_top[,2:10], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")
box(rel_cutk4, scaledata_count_top, "cutk4 count")
box(rel_cutk5, scaledata_count_top, "cutk5 count")
```

Can't make much sense of unique clustering between the samples or their subtypes. Considering subsetting out original data into clustering and nonclustering samples and moving on with individual cluster treatments for each from there.

Want to make a heatmap with the above clusters and a cluster-trait correlation heatmap...

Trait correlation (edit for this dataset)
```{r}
traits <- data.frame(row.names=colnames(scaledata_rld_alcl)) 
#here I'm adding vectors of subtype traits to the dataframe
traits$'ALK+ALCL' <-       c(1,1,1,0,1,1,0,0,1) #corrected
traits$'ALK-ALCL' <- c(0,0,0,1,0,0,1,1,0) #corrected

#Extract the gene/sample numbers
nPeaks = nrow(scaledata_rld_alcl) 
nSamples = ncol(scaledata_rld_alcl)

#p value calculation from WGCNA
corPvalueStudent = function(cor, nSamples) {
  T=sqrt(nSamples-2) * cor/sqrt(1-cor^2)
  2*pt(abs(T),nSamples-2, lower.tail = FALSE)
}
```

Make correlation heatmaps
```{r}
nSamples=9

#Finalize clustering and branch cuts
clusters <- rld_cutk2

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=2)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk3

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=3)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk4

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=4)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk5

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=5)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk6

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=6)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk7

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=7)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk8

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=8)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk9

cores <- sapply(unique(clusters), clust.core, scaledata_rld_alcl, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=9)",
          xlab = "Traits",
          ylab = "Clusters")

#clusters <- c(rld_cutk4,rld_bop5k_alcl)

#cores <- sapply(unique(clusters), clust.core, rbind(scaledata_rld_top, scaledata_bot), clusters)
#correlate the cores to traits:
#moduleTraitCor = cor(cores, traits, use= "p")
#moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
#textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  #signif(moduleTraitPvalue, 1), ")", sep= "")
#dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
#heatmap.2(moduleTraitCor,
          #dendrogram = "none",
          #Colv =FALSE,
          #scale = c("none"),
          #col="heat.colors",
          #na.rm=TRUE,
          #cellnote = textMatrix,
          #notecol="grey30",
          #notecex=1,
          #trace=c("none"),
          #cexRow = 0.8,
          #cexCol = 0.8,
          #main = "Cluster-Trait Correlation (k=4 + Static [5k])",
          #xlab = "Traits",
          #ylab = "Clusters")
```

Make annotated scaled rlog heatmap
```{r}
rld_df <- as.data.frame(scaledata_rld_top)
rld_df$k3 <- as.data.frame(rld_cutk3)$rld_cutk3
rld_df$k4 <- as.data.frame(rld_cutk4)$rld_cutk4

colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/

pheatmap(rld_top1k_alcl,
         #rld_df[order(rld_df[,12]),][,1:11], 
         #gaps_row = c(879,2298,4213), 
         cluster_cols=T, 
         cluster_rows=hr_rld,
         #clustering_distance_rows = "euclidean",
         #clustering_method = "complete",
         scale="row",
         cutree_rows=4,
         cutree_cols=3, 
         border_color=NA, 
         show_rownames=F, 
         main="Top 1k dREG Peak PCA Contributors Hierarchical",
         breaks=seq(-2,2,by=0.01),
         color=ocean.balance(401),
         #color=scico(401, palette = 'berlin'),
         #color=colorRampPalette(c("blue", "white", "red"))(41),
         annotation_row=rld_df[10:11], 
         annotation_col=data.frame(row.names=colnames(rld_df[,1:9]), Subtype = data_alcl), 
         annotation_colors=list(k3=c("blue", "green", "red"),
                                k4=c("purple", "blue", "green", "goldenrod1"), 
                                Subtype=c("ALK+ALCL"= 'chocolate4', "ALK-ALCL"= 'cadetblue'))
         )
```



