Cluster testing from Weinstock002 data 8DEC21
Libraries
library(dendextend)
library(DESeq2)
library(colorspace)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(dplyr)
library(reshape2)
library(plotly)
library(gridExtra)
library(gplots)
library(pals)
library(pheatmap)
library(colorspace)
library(scico)
Full PROseq data set analysis for TCL cell lines
https://talgalili.github.io/dendextend/articles/Cluster_Analysis.html
rld_all <- rlog(dds_pro, blind=TRUE)
#colnames(rld_all) <- paste0(sampleNames_all,"(",subTypes_all,")")
sampleDists_all <- dist(t(assay(rld_all)))
sampleDistMtx_all <- as.matrix(sampleDists_all)
rownames(sampleDistMtx_all) <- paste0(rld_all$sample,"_",rld_all$subtype)
colnames(sampleDistMtx_all) <- NULL
hclust_all <- hclust(sampleDists_all)
norm_all <- as.data.frame(counts(dds_pro, normalized=TRUE)) #think this is redundant with normcounts_all but keeping it for now
colnames(norm_all) <- dds_pro$sample
Do initial dendrograms and determine PCA contributors
plotPCA(rld_all, intgroup="subtype")

plotPCA(rld_all, intgroup="sample")

plot(hclust_all, labels=rownames(sampleDistMtx_all), main="PRO-seq All Sample dREG Dendrogram")
pdf("dREG_hclust.sample.pdf")
plot(hclust_all, labels=rownames(sampleDistMtx_all), main="PRO-seq All Sample dREG Dendrogram")
dev.off()
quartz_off_screen
2

Determine if you can adjust the number of PCA contributors to cleanly separate subtypes
plotPCA(rld_all, intgroup="subtype", ntop=100) + ggtitle("Top 100")

plotPCA(rld_all, intgroup="subtype", ntop=200) + ggtitle("Top 200")

plotPCA(rld_all, intgroup="subtype", ntop=500) + ggtitle("Top 500")

plotPCA(rld_all, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")

plotPCA(rld_all, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")

plotPCA(rld_all, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")

plotPCA(rld_all, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")

plotPCA(rld_all, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")

plotPCA(rld_all, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")

plotPCA(rld_all, intgroup="subtype", ntop=120888) + ggtitle("All")
pdf("dREG_pca.all.pdf")
plotPCA(rld_all, intgroup="subtype", ntop=100) + ggtitle("Top 100")
plotPCA(rld_all, intgroup="subtype", ntop=200) + ggtitle("Top 200")
plotPCA(rld_all, intgroup="subtype", ntop=500) + ggtitle("Top 500")
plotPCA(rld_all, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
plotPCA(rld_all, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
plotPCA(rld_all, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
plotPCA(rld_all, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
plotPCA(rld_all, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
plotPCA(rld_all, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
plotPCA(rld_all, intgroup="subtype", ntop=77061) + ggtitle("All")
dev.off()
quartz_off_screen
2

#Note: can make this a nicer grid with shared axes?
Rank dREG peaks by PCA contribution and measure variance and cumulative variance
#Load dREG peak annotations
dREG_annot2 <- read.table("../002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_all))
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_all)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank_all <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Generate list of dREG peak annotations
peakInfo_all <- merge(dREG_annot2, pcRank_all, by.x="peakID", by.y="peakID")
peakVar_all <- data.frame("peakID"=rownames(assay(rld_all)))
peakVar_all$rlog.variance <- rowVars(as.matrix(assay(rld_all)))
peakVar_all <- merge(peakInfo_all, peakVar_all, by.x="peakID", by.y="peakID")
pdf("variance.pdf")
ggplot(peakVar_all, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
null device
1
ggplot(peakVar_all, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
#add plot of fraction of explained variance.. running sum / total vs rank
peakVar_all <- peakVar_all[order(peakVar_all$PCA.rank),]
peakVar_all$cumulative.Variance <- cumsum(peakVar_all$rlog.variance)/sum(peakVar_all$rlog.variance)
pdf("cumulative_variance.pdf")
ggplot(peakVar_all, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
quartz_off_screen
2

ggplot(peakVar_all, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")

Filter rld objects by samples… separate training set from “test” set (SUM149 and SUM159)
#Separate training and test set
filt <- which(!colnames(rld_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
rld_train <- rld_all[,filt]
filt <- which(colnames(rld_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
rld_test <- rld_all[,filt]
filt <- which(!colnames(norm_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
norm_train <- norm_all[,filt]
filt <- which(colnames(norm_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
norm_test <- norm_all[,filt]
Determine the number of PCA contributors necessary to cleanly separate subtypes
Sample grouping looks pretty good between 5k and 10k top contributors. Will look at these cutoffs in more detail.
Rank dREG peaks by PCA contribution and measure variance and cumulative variance
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_train))
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_train)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank2 <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)
#Generate list of dREG peak annotations
peakInfo2 <- merge(dREG_annot2, pcRank2, by.x="peakID", by.y="peakID")
peakVar2 <- data.frame("peakID"=rownames(assay(rld_train)))
peakVar2$rlog.variance <- rowVars(as.matrix(assay(rld_train)))
peakVar2 <- merge(peakInfo2, peakVar2, by.x="peakID", by.y="peakID")
ggplot(peakVar2, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar2 <- peakVar2[order(peakVar2$PCA.rank),]
peakVar2$cumulative.Variance <- cumsum(peakVar2$rlog.variance)/sum(peakVar2$rlog.variance)
ggplot(peakVar2, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

5k seems reasonable
Filter for top 5k PCA contributors
#filter normalized counts for top 5k PCA contributors
top5k <- filter(peakVar2, PCA.rank<=5000)$peakID
norm_top5k <- norm_train[which((rownames(norm_train) %in% top5k)==TRUE),]
rld_top5k_train <- subset(assay(rld_train), rownames(assay(rld_train)) %in% top5k)
rld_top5k_test <- subset(assay(rld_test), rownames(assay(rld_test)) %in% top5k)
rld_top5k_all <- subset(assay(rld_all), rownames(assay(rld_test)) %in% top5k)
Calculate sample distances with new top5k training and matched test sets, then calculate the correlation between the branchpoints of the two
d_train <- rld_top5k_train %>% dist %>% hclust %>% as.dendrogram
d_test <- rld_top5k_test %>% dist %>% hclust %>% as.dendrogram
d_all <- rld_top5k_all %>% dist %>% hclust %>% as.dendrogram
#compare clusters excluding unclustered to unclustered alone
d_train_test <- dendlist(train = d_train, test = d_test)
d_train_test %>% cor.dendlist
train test
train 1.0000000 0.3858571
test 0.3858571 1.0000000
d_train_test %>% cor.dendlist(method_coef = "spearman")
train test
train 1.0000000 0.3896877
test 0.3896877 1.0000000
Bk_plot(d_train, d_test, k = 2:30, xlim = c(2,30), main = "TCL Train Set vs non-significant clusters Bk plot")

#compare clusters excluding unclustered to clusters using all samples
d_train_all <- dendlist(train = d_train, all = d_all)
d_train_all %>% cor.dendlist
train all
train 1.0000000 0.4299217
all 0.4299217 1.0000000
d_train_all %>% cor.dendlist(method_coef = "spearman")
train all
train 1.00000 0.44151
all 0.44151 1.00000
Bk_plot(d_train, d_all, k = 2:30, xlim = c(2,30), main = "TCL Train Set vs All Samples Bk plot")

#compare clusters all sample clustering to unclustered alone
d_all_test <- dendlist(all = d_all, test = d_test)
d_all_test %>% cor.dendlist
all test
all 1.0000000 0.5231135
test 0.5231135 1.0000000
d_all_test %>% cor.dendlist(method_coef = "spearman")
all test
all 1.0000000 0.5082297
test 0.5082297 1.0000000
Bk_plot(d_all, d_test, k = 2:30, xlim = c(2,30), main = "All Samples vs. non-significant clusters Bk plot")

3k clusters?
Now some fun with Tanglegrams…
pre_tang_d_train_test <- d_train_test %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_test$train)
pre_tang_d_train_test %>% tanglegram(fast = T, color_lines = train_branches_colors)

pre_tang_d_train_all <- d_train_all %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_all$train)
pre_tang_d_train_all %>% tanglegram(fast = T, color_lines = train_branches_colors)

pre_tang_d_all_test <- d_all_test %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_all_test$all)
pre_tang_d_all_test %>% tanglegram(fast = T, color_lines = train_branches_colors)

The three different sets lead to drastically different cluster designations. Will attempt to look at overlap between cluster sets though it is somewhat arbitrary which samples end up being the hold-outs.
Only highlight genes clustered in common
#d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist
#d_train_test_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
#d_train_all_common <- d_train_all %>% prune_common_subtrees.dendlist
#d_train_all_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
#d_all_test_common <- d_all_test %>% prune_common_subtrees.dendlist
#d_all_test_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)
Quite a few shared leaves between the training set and all. Let’s see what they are…
#d_train_test %>% nleaves
#d_train_test_common %>% nleaves
#d_train_all %>% nleaves
#d_train_all_common %>% nleaves
#d_all_test %>% nleaves
#d_all_test_common %>% nleaves
n leaves preserved between the subset and all samples.
Next is to figure a way to define expression/activity groups: - k-means as before - or - dynamic branch cutting -
using top 5k PCA contributors (as determined a reasonable cutoff in the variance plots)
#filter normalized counts for top 5k PCA contributors
top5k_all <- filter(peakVar_all, PCA.rank<=5000)$peakID
bot5k_all <- filter(peakVar_all, PCA.rank>115888)$peakID
norm_top5k_all <- norm_all[which((rownames(norm_all) %in% top5k_all)==TRUE),]
rld_top5k_all <- subset(assay(rld_all), rownames(assay(rld_all)) %in% top5k_all)
colnames(norm_top5k_all) <- colnames(rld_top5k_all)
rld_bot5k_all <- subset(assay(rld_all), rownames(assay(rld_all)) %in% bot5k_all)
(row.names(rld_top5k_all) %in% row.names(rld_bot5k_all))==FALSE #check dREGs pulled into top and bottom do not match- should run T for all
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[58] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[77] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[96] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[115] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[134] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[153] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[172] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[191] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[210] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[229] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[248] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[286] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[305] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[324] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[343] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[362] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[381] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[400] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[419] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[438] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[457] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[476] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[495] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[514] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[533] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[552] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[571] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[590] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[609] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[628] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[647] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[666] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[685] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[704] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[723] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[742] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[761] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[780] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[799] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[818] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[837] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[856] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[875] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[894] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[913] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[932] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[951] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[970] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[989] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[ reached getOption("max.print") -- omitted 4000 entries ]
#scale() function centers values +/- the mean of each column for normalized counts or rlog normalized counts
scaledata_count_top <- t(scale(t(norm_top5k_all))) # Centers and scales data.
scaledata_count_top <- scaledata_count_top[complete.cases(scaledata_count_top),] # Removes rows with nulls
scaledata_rld_top <- t(scale(t(rld_top5k_all))) # Centers and scales data.
scaledata_rld_top <- scaledata_rld_top[complete.cases(scaledata_rld_top),] # Removes rows with nulls
scaledata_rld_all <- t(scale(t(assay(rld_all))))
scaledata_rld_all <- scaledata_rld_all[complete.cases(scaledata_rld_all),]
scaledata_bot <- t(scale(t(rld_bot5k_all)))
scaledata_bot <- scaledata_bot[complete.cases(scaledata_bot),]
#also try with relative activity scaling
scaledata_rel_top <- data.frame(row.names=rownames(norm_top5k_all))
scaledata_rel_top$max.dREG.counts <- apply(norm_top5k_all[1:ncol(norm_top5k_all)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_top5k_all)){
scaledata_rel_top[paste0(names(norm_top5k_all[i]))] <- norm_top5k_all[,i]/scaledata_rel_top$max.dREG.counts
i=i+1
}
#Generate column and row hierarchical clustering for each normalization method (all spearman for now)
hr_count <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_count <- hclust(as.dist(1-cor(scaledata_count_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.
hr_rld <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rld <- hclust(as.dist(1-cor(scaledata_rld_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.
hr_bot <- hclust(as.dist(1-cor(t(scaledata_bot), method="spearman")), method="complete")
hr_rel <- hclust(as.dist(1-cor(t(scaledata_rel_top[,c(2:26)]), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rel <- hclust(as.dist(1-cor(scaledata_rel_top[,c(2:26)], method="spearman")), method="complete") # Clusters columns by Spearman correlation.
Dynamic Branch Cutting Test (3-5)
library(dendextend)
cnt_cutk3 <- cutree(hr_count,k=3) #cut at specific height h or cut for a specific number of clusters k
cnt_cutk4 <- cutree(hr_count,k=4)
cnt_cutk5 <- cutree(hr_count,k=5)
rld_cutk2 <- cutree(hr_rld,k=2)
rld_cutk3 <- cutree(hr_rld,k=3)
rld_cutk4 <- cutree(hr_rld,k=4)
rld_cutk5 <- cutree(hr_rld,k=5)
rld_cutk6 <- cutree(hr_rld,k=6)
rld_cutk7 <- cutree(hr_rld,k=7)
rld_cutk8 <- cutree(hr_rld,k=8)
rld_cutk9 <- cutree(hr_rld,k=9)
rld_cutk10 <- cutree(hr_rld,k=10)
rld_bot5k_all <- as.integer(rep(0, 5000))
names(rld_bot5k_all) <- bot5k_all
rel_cutk3 <- cutree(hr_rel,k=3)
rel_cutk4 <- cutree(hr_rel,k=4)
rel_cutk5 <- cutree(hr_rel,k=5)
#plot the tree
TreeR_cnt = as.dendrogram(hr_count, method="average")
TreeR_rld = as.dendrogram(hr_rld, method="average")
TreeR_rel = as.dendrogram(hr_rel, method="average")
#define annotation bars
the_bars_k3 <- cbind(cnt_cutk3, rld_cutk3, rel_cutk3)
the_bars_k4 <- cbind(cnt_cutk4, rld_cutk4, rel_cutk4)
the_bars_k5 <- cbind(cnt_cutk5, rld_cutk5, rel_cutk5)
the_bars_cnt <- cbind(cnt_cutk3, cnt_cutk4, cnt_cutk5)
the_bars_rld <- cbind(rld_cutk3, rld_cutk4, rld_cutk5)
the_bars_rel <- cbind(rel_cutk3, rel_cutk4, rel_cutk5)
Compare clusters
plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=3",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k3, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=4",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k4, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog) k=5",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_k5, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

Clustering is exactly the same regardless of method now!! This is great!
Compare clusters
plot(TreeR_cnt,
leaflab = "none",
main = "dREG Peak Clustering (counts)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_cnt, TreeR_cnt, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rld,
leaflab = "none",
main = "dREG Peak Clustering (rlog)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_rld, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rel,
leaflab = "none",
main = "dREG Peak Clustering (relative activity)",
ylab = "Height")
#this makes the bar
colored_bars(the_bars_rel, TreeR_rel, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

Let’s see how well these clusters separate expression groups associated with different cell line subtypes…
data_all <- c("HUT102_ATLL",
"ST1_ATLL",
"Su9T01_ATLL",
"KARPAS299_ALK+ALCL",
"L82_ALK+ALCL",
"SUPM2_ALK+ALCL",
"SMZ1_PTCL_NOS",
"MYLA_CTCL",
"MAC2A_ALK-ALCL",
"KIJK_ALK+ALCL",
"SR786_ALK+ALCL",
"FEPD_ALK-ALCL",
"KARPAS384_SQGD_TCL",
"DL40_ALK-ALCL",
"OCILY12_PTCL_NOS",
"HUT78_CTCL",
"HH_CTCL",
"MJ_CTCL",
"MTA_NKT",
"MOTN1_T_LGL",
"NKL_NKT",
"DERL2_HS_TCL",
"KHYG1_NKL",
"SeAx_CTCL",
"SUDHL1_ALK+ALCL")
#Defining box plot functions
clust.core = function(i, dat, clusters) {
ind = (clusters == i)
colMeans(dat[ind,])
}
box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)
#make a dataframe of cores by subtype
d <- data.frame(cbind(data_all, cores)) #subType list from beginning!
colnames(d) <-c("sample", paste0("clust_",1:ncol(cores)))
#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "sample")
#order by subtype
dmolten <- dmolten[order(dmolten$sample),]
dmolten$sample <- ordered(dmolten$sample, levels=unique(dmolten$sample))
# make the plot
p1 <- ggplot(dmolten, aes(x=sample, y=as.numeric(value), col=variable)) +
geom_boxplot() +
# geom_point() +
# geom_line() +
# scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
# scale_y_continuous() +
xlab("Sample") +
ylab("Activity") +
labs(title= paste0("Cluster Core Expression by Sample (",title,")"),color = "Cluster")
return(p1)
}
Make boxplots
box(rld_cutk2, scaledata_rld_all, "cutk2 rlog")

box(rld_cutk3, scaledata_rld_all, "cutk3 rlog")

box(rld_cutk4, scaledata_rld_all, "cutk4 rlog")

box(rld_cutk5, scaledata_rld_all, "cutk5 rlog")

box(rld_cutk6, scaledata_rld_all, "cutk6 rlog")

box(rld_cutk7, scaledata_rld_all, "cutk7 rlog")

box(rld_cutk8, scaledata_rld_all, "cutk8 rlog")

box(rld_cutk9, scaledata_rld_all, "cutk9 rlog")

box(rld_cutk10, scaledata_rld_all, "cutk10 rlog")

box(c(rld_cutk3,rld_bot5k_all), rbind(scaledata_rld_all,scaledata_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:26], "cutk3 rel")

box(rel_cutk4, scaledata_rel_top[,2:26], "cutk4 rel")

box(rel_cutk5, scaledata_rel_top[,2:26], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")

box(rel_cutk4, scaledata_count_top, "cutk4 count")

box(rel_cutk5, scaledata_count_top, "cutk5 count")

Properly by subtype instead of individual samples
data_all <- c("ATLL",
"ATLL",
"ATLL",
"ALK+ALCL",
"ALK+ALCL",
"ALK+ALCL",
"PTCL_NOS",
"CTCL",
"ALK-ALCL",
"ALK+ALCL",
"ALK+ALCL",
"ALK-ALCL",
"SQGD_TCL",
"ALK-ALCL",
"PTCL_NOS",
"CTCL",
"CTCL",
"CTCL",
"NKT",
"T_LGL",
"NKT",
"HS_TCL",
"NKL",
"CTCL",
"ALK+ALCL")
#Defining box plot functions
clust.core = function(i, dat, clusters) {
ind = (clusters == i)
colMeans(dat[ind,])
}
box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)
#make a dataframe of cores by subtype
d <- data.frame(cbind(data_all, cores)) #subType list from beginning!
colnames(d) <-c("subtype", paste0("clust_",1:ncol(cores)))
#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "subtype")
#order by subtype
dmolten <- dmolten[order(dmolten$subtype),]
dmolten$subtype <- ordered(dmolten$subtype, levels=unique(dmolten$subtype))
# make the plot
p1 <- ggplot(dmolten, aes(x=subtype, y=as.numeric(value), col=variable)) +
geom_boxplot() +
# geom_point() +
# geom_line() +
# scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
# scale_y_continuous() +
xlab("Subtype") +
ylab("Activity") +
labs(title= paste0("Cluster Core Expression by Subtype (",title,")"),color = "Cluster")
return(p1)
}
Make boxplots
box(rld_cutk2, scaledata_rld_all, "cutk2 rlog")

box(rld_cutk3, scaledata_rld_all, "cutk3 rlog")

box(rld_cutk4, scaledata_rld_all, "cutk4 rlog")

box(rld_cutk5, scaledata_rld_all, "cutk5 rlog")

box(rld_cutk6, scaledata_rld_all, "cutk6 rlog")

box(rld_cutk7, scaledata_rld_all, "cutk7 rlog")

box(rld_cutk8, scaledata_rld_all, "cutk8 rlog")

box(rld_cutk9, scaledata_rld_all, "cutk9 rlog")

box(rld_cutk10, scaledata_rld_all, "cutk10 rlog")

box(c(rld_cutk3,rld_bot5k_all), rbind(scaledata_rld_all,scaledata_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:26], "cutk3 rel")

box(rel_cutk4, scaledata_rel_top[,2:26], "cutk4 rel")

box(rel_cutk5, scaledata_rel_top[,2:26], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")

box(rel_cutk4, scaledata_count_top, "cutk4 count")

box(rel_cutk5, scaledata_count_top, "cutk5 count")

Can’t make much sense of unique clustering between the samples or their subtypes. Considering subsetting out original data into clustering and nonclustering samples and moving on with individual cluster treatments for each from there.
Want to make a heatmap with the above clusters and a cluster-trait correlation heatmap…
Trait correlation (edit for this dataset)
traits <- data.frame(row.names=colnames(scaledata_rld_all))
#here I'm adding vectors of subtype traits to the dataframe
traits$'ATLL' <- c(1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'ALK+ALCL' <- c(0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1) #corrected
traits$'ALK-ALCL' <- c(0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'NKT' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0) #corrected
traits$'CTCL' <- c(0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0) #corrected
traits$'T_LGL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0) #corrected
traits$'SQGD_TCL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'NKL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0) #corrected
traits$'PTCL_NOS' <- c(0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'HS_TCL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0) #corrected
#Extract the gene/sample numbers
nPeaks = nrow(scaledata_rld_all)
nSamples = ncol(scaledata_rld_all)
#p value calculation from WGCNA
corPvalueStudent = function(cor, nSamples) {
T=sqrt(nSamples-2) * cor/sqrt(1-cor^2)
2*pt(abs(T),nSamples-2, lower.tail = FALSE)
}
Make correlation heatmaps
nSamples=25
#Finalize clustering and branch cuts
clusters <- rld_cutk2
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=2)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk3
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=3)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk4
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=4)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk5
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=5)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk6
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=6)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk7
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=7)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk8
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=8)",
xlab = "Traits",
ylab = "Clusters")

clusters <- rld_cutk9
cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
dendrogram = "none",
Colv =FALSE,
scale = c("none"),
col="heat.colors",
na.rm=TRUE,
cellnote = textMatrix,
notecol="grey30",
notecex=1,
trace=c("none"),
cexRow = 0.8,
cexCol = 0.8,
main = "Cluster-Trait Correlation (k=9)",
xlab = "Traits",
ylab = "Clusters")

#clusters <- c(rld_cutk4,rld_bop5k_all)
#cores <- sapply(unique(clusters), clust.core, rbind(scaledata_rld_top, scaledata_bot), clusters)
#correlate the cores to traits:
#moduleTraitCor = cor(cores, traits, use= "p")
#moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
#textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
#signif(moduleTraitPvalue, 1), ")", sep= "")
#dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
#heatmap.2(moduleTraitCor,
#dendrogram = "none",
#Colv =FALSE,
#scale = c("none"),
#col="heat.colors",
#na.rm=TRUE,
#cellnote = textMatrix,
#notecol="grey30",
#notecex=1,
#trace=c("none"),
#cexRow = 0.8,
#cexCol = 0.8,
#main = "Cluster-Trait Correlation (k=4 + Static [5k])",
#xlab = "Traits",
#ylab = "Clusters")
Make annotated scaled rlog heatmap
rld_df <- as.data.frame(scaledata_rld_top)
rld_df$k3 <- as.data.frame(rld_cutk3)$rld_cutk3
rld_df$k4 <- as.data.frame(rld_cutk4)$rld_cutk4
colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/
pheatmap(rld_top5k_all,
#rld_df[order(rld_df[,12]),][,1:11],
#gaps_row = c(879,2298,4213),
cluster_cols=T,
cluster_rows=hr_rld,
#clustering_distance_rows = "euclidean",
#clustering_method = "complete",
scale="row",
cutree_rows=4,
cutree_cols=3,
border_color=NA,
show_rownames=F,
main="Top 5k dREG Peak PCA Contributors Hierarchical",
breaks=seq(-2,2,by=0.01),
color=ocean.balance(401),
#color=scico(401, palette = 'berlin'),
#color=colorRampPalette(c("blue", "white", "red"))(41),
annotation_row=rld_df[26:27],
annotation_col=data.frame(row.names=colnames(rld_df[,1:25]), Subtype = data_all),
annotation_colors=list(k3=c("blue", "green", "red"),
k4=c("purple", "blue", "green", "goldenrod1"),
Subtype=c("ATLL"= 'darkgray', "ALK+ALCL"= 'chocolate4', "CTCL"= 'chartreuse3', "ALK-ALCL"= 'cadetblue', "SQGD_TCL"= 'cyan3', "T_LGL"= 'red', "NKT"= 'lightpink', "HS_TCL"= 'black', "PTCL_NOS"= 'darkgoldenrod1', "NKL"= 'blueviolet'))
)

k means 4 looks like it’s picking up some significant groupings between samples. Will move forward with k=4
Redone cluster map with increased cut clusters and ordered by subtype Make annotated scaled rlog heatmap
rld_df2 <- as.data.frame(scaledata_rld_top)
rld_df2$k5 <- as.data.frame(rld_cutk5)$rld_cutk5
rld_df2$k6 <- as.data.frame(rld_cutk6)$rld_cutk6
rld_df2$k7 <- as.data.frame(rld_cutk7)$rld_cutk7
rld_df2$k8 <- as.data.frame(rld_cutk8)$rld_cutk8
rld_df2$k9 <- as.data.frame(rld_cutk9)$rld_cutk9
rld_df2$k10 <- as.data.frame(rld_cutk10)$rld_cutk10
colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/
rld_df2 <- rld_df2[,c(9,12,14,25,6,5,11,4,10,19,21,23,16,8,24,18,17,22,1:3,20,13,7,15,26:31)]
data <- c("ALK-ALCL","ALK-ALCL","ALK-ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","NKT","NKT","NKL","CTCL","CTCL","CTCL","CTCL","CTCL","HS_TCL","ATLL","ATLL","ATLL","T_LGL","SQGD_TCL","PTCL_NOS","PTCL_NOS")
rld2 <- rld_top5k_all[,c(9,12,14,25,6,5,11,4,10,19,21,23,16,8,24,18,17,22,1:3,20,13,7,15)]
pheatmap(rld2,
#rld_df2[1:25],
#gaps_row = c(879,2298,4213),
cluster_cols=F,
cluster_rows=hr_rld,
#clustering_distance_rows = "euclidean",
#clustering_method = "complete",
scale="row",
cutree_rows=6,
cutree_cols=4,
border_color=NA,
show_rownames=F,
main="Top 5k dREG Peak PCA Contributors Hierarchical",
breaks=seq(-2,2,by=0.01),
color=ocean.balance(401),
#color=scico(401, palette = 'berlin'),
#color=colorRampPalette(c("blue", "white", "red"))(41),
annotation_row=rld_df2[26:31],
annotation_col=data.frame(row.names=colnames(rld_df2[,1:25]), Subtype = data),
annotation_colors=list(k5=c("blue", "green", "red", "black", "orange"),
k6=c("purple", "blue", "green", "red", "black", "orange"),
k7=c("yellow", "purple", "blue", "green", "red", "black", "orange"),
k8=c("pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
k9=c("white", "pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
k10=c("brown", "white", "pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
Subtype=c("ATLL"= 'darkgray', "ALK+ALCL"= 'chocolate4', "CTCL"= 'chartreuse3', "ALK-ALCL"= 'cadetblue', "SQGD_TCL"= 'cyan3', "T_LGL"= 'red', "NKT"= 'lightpink', "HS_TCL"= 'black', "PTCL_NOS"= 'darkgoldenrod1', "NKL"= 'blueviolet'))
)

Save centered rlog data with clusters as table with dREG peak info
out <- data.frame(unique(row.names=names(c(rld_cutk4,bot5k_all)),
k2=c(rld_cutk2,bot5k_all),
k3=c(rld_cutk3,bot5k_all),
k4=c(rld_cutk4,bot5k_all)
))
Error in unique.default(row.names = names(c(rld_cutk4, bot5k_all)), k2 = c(rld_cutk2, :
argument "x" is missing, with no default
Integrate nearest/associated gene metrics to output above
Save PDF versions of figures for MS
#####Now for protein-coding genes##### ##Rerun on 10/21/21
Can we get similar sample clustering with dominant transcript genes or do we still have poor separation of the subtypes as seen before?
Load dds and generate other data tables
Do initial dendrograms and determine PCA contributors
Try PCA at different depths
Max separation ~500 top variance genes.. in line with expectations. Poor separation of mesenchymal from luminal as reflected in dendrogram.
Rank genes by PCA contribution and measure variance and cumulative variance
Results: Cutoff rlog variance cumulative variance 500 3.91504 13.3% 1k 3.06535 22.1% 2k 2.209018 35.4% 5k 1.161115 59.8% 10k 0.5271063 79.8% 20k 0.1786067 95.8% all 0 100%
#Completed re-analysis with corrected subtypes and colors on 10/21/21 AF
#####Try this later… #skipped 10/21/21 For test and all sets of cell lines, see which clustering method is best at separating k=3
####Don’t think this is needed here… #skipped 10/21/21 Height Branch Cutting Test
---
title: "PRO Weinstock002 Branchpoint"
output: html_notebook
---

Cluster testing from Weinstock002 data 8DEC21

Libraries
```{r}
library(dendextend)
library(DESeq2)
library(colorspace)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(dplyr)
library(reshape2)
library(plotly)
library(gridExtra)
library(gplots)
library(pals)
library(pheatmap)
library(colorspace)
library(scico)
```

Full PROseq data set analysis for TCL cell lines

https://talgalili.github.io/dendextend/articles/Cluster_Analysis.html
```{r}

rld_all <- rlog(dds_pro, blind=TRUE)
#colnames(rld_all) <- paste0(sampleNames_all,"(",subTypes_all,")")
sampleDists_all <- dist(t(assay(rld_all)))
sampleDistMtx_all <- as.matrix(sampleDists_all)
rownames(sampleDistMtx_all) <- paste0(rld_all$sample,"_",rld_all$subtype)
colnames(sampleDistMtx_all) <- NULL
hclust_all <- hclust(sampleDists_all)

norm_all <- as.data.frame(counts(dds_pro, normalized=TRUE)) #think this is redundant with normcounts_all but keeping it for now
colnames(norm_all) <- dds_pro$sample
```

Do initial dendrograms and determine PCA contributors
```{r}
plotPCA(rld_all, intgroup="subtype") 
plotPCA(rld_all, intgroup="sample") 
plot(hclust_all, labels=rownames(sampleDistMtx_all), main="PRO-seq All Sample dREG Dendrogram")
pdf("dREG_hclust.sample.pdf")
plot(hclust_all, labels=rownames(sampleDistMtx_all), main="PRO-seq All Sample dREG Dendrogram")
dev.off()
```

Determine if you can adjust the number of PCA contributors to cleanly separate subtypes
```{r}
  plotPCA(rld_all, intgroup="subtype", ntop=100) + ggtitle("Top 100")
  plotPCA(rld_all, intgroup="subtype", ntop=200) + ggtitle("Top 200")
  plotPCA(rld_all, intgroup="subtype", ntop=500) + ggtitle("Top 500")
  plotPCA(rld_all, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
  plotPCA(rld_all, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
  plotPCA(rld_all, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
  plotPCA(rld_all, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
  plotPCA(rld_all, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
  plotPCA(rld_all, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
  plotPCA(rld_all, intgroup="subtype", ntop=120888) + ggtitle("All")
  
pdf("dREG_pca.all.pdf")
  plotPCA(rld_all, intgroup="subtype", ntop=100) + ggtitle("Top 100")
  plotPCA(rld_all, intgroup="subtype", ntop=200) + ggtitle("Top 200")
  plotPCA(rld_all, intgroup="subtype", ntop=500) + ggtitle("Top 500")
  plotPCA(rld_all, intgroup="subtype", ntop=1000) + ggtitle("Top 1000")
  plotPCA(rld_all, intgroup="subtype", ntop=2000) + ggtitle("Top 2000")
  plotPCA(rld_all, intgroup="subtype", ntop=5000) + ggtitle("Top 5000")
  plotPCA(rld_all, intgroup="subtype", ntop=10000) + ggtitle("Top 10000")
  plotPCA(rld_all, intgroup="subtype", ntop=20000) + ggtitle("Top 20000")
  plotPCA(rld_all, intgroup="subtype", ntop=50000) + ggtitle("Top 50000")
  plotPCA(rld_all, intgroup="subtype", ntop=77061) + ggtitle("All")
dev.off()
#Note: can make this a nicer grid with shared axes?
```

Rank dREG peaks by PCA contribution and measure variance and cumulative variance
```{r}
#Load dREG peak annotations
dREG_annot2 <- read.table("../002_TCLs_p1000_dREGpeak_annotation.txt", sep="\t", header=TRUE)
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_all)) 
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_all)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank_all <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)

#Generate list of dREG peak annotations
peakInfo_all <- merge(dREG_annot2, pcRank_all, by.x="peakID", by.y="peakID")
peakVar_all <- data.frame("peakID"=rownames(assay(rld_all)))
peakVar_all$rlog.variance <- rowVars(as.matrix(assay(rld_all)))
peakVar_all <- merge(peakInfo_all, peakVar_all, by.x="peakID", by.y="peakID")
pdf("variance.pdf")
ggplot(peakVar_all, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
ggplot(peakVar_all, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar_all <- peakVar_all[order(peakVar_all$PCA.rank),]
peakVar_all$cumulative.Variance <- cumsum(peakVar_all$rlog.variance)/sum(peakVar_all$rlog.variance)
pdf("cumulative_variance.pdf")
ggplot(peakVar_all, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
dev.off()
ggplot(peakVar_all, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=5000, linetype="dotted")
```

Filter rld objects by samples... separate training set from "test" set (SUM149 and SUM159)
```{r}
#Separate training and test set
filt <- which(!colnames(rld_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
rld_train <- rld_all[,filt]

filt <- which(colnames(rld_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
rld_test <- rld_all[,filt]

filt <- which(!colnames(norm_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
norm_train <- norm_all[,filt]

filt <- which(colnames(norm_all) %in% c("HUT78","HUT102", "Su9T01", "FEPD", "MOTN1", "OCILY12", "SMZ1"))
norm_test <- norm_all[,filt]
```

Determine the number of PCA contributors necessary to cleanly separate subtypes

Sample grouping looks pretty good between 5k and 10k top contributors. Will look at these cutoffs in more detail.

Rank dREG peaks by PCA contribution and measure variance and cumulative variance
```{r}
#Define and rank top 10k PCA contributors
rv <- rowVars(assay(rld_train)) 
select <- order(rv, decreasing=TRUE)#[seq_len(min(10000, length(rv)))]
pc <- prcomp(t(assay(rld_train)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank2 <- data.frame("peakID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
#sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)

#Generate list of dREG peak annotations
peakInfo2 <- merge(dREG_annot2, pcRank2, by.x="peakID", by.y="peakID")
peakVar2 <- data.frame("peakID"=rownames(assay(rld_train)))
peakVar2$rlog.variance <- rowVars(as.matrix(assay(rld_train)))
peakVar2 <- merge(peakInfo2, peakVar2, by.x="peakID", by.y="peakID")
ggplot(peakVar2, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
peakVar2 <- peakVar2[order(peakVar2$PCA.rank),]
peakVar2$cumulative.Variance <- cumsum(peakVar2$rlog.variance)/sum(peakVar2$rlog.variance)
ggplot(peakVar2, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")
```

5k seems reasonable

Filter for top 5k PCA contributors
```{r}
#filter normalized counts for top 5k PCA contributors
top5k <- filter(peakVar2, PCA.rank<=5000)$peakID
norm_top5k <- norm_train[which((rownames(norm_train) %in% top5k)==TRUE),]
rld_top5k_train <- subset(assay(rld_train), rownames(assay(rld_train)) %in% top5k)
rld_top5k_test <- subset(assay(rld_test), rownames(assay(rld_test)) %in% top5k)
rld_top5k_all <- subset(assay(rld_all), rownames(assay(rld_test)) %in% top5k)
```

Calculate sample distances with new top5k training and matched test sets, then calculate the correlation between the branchpoints of the two
```{r}
d_train <- rld_top5k_train %>% dist %>% hclust %>% as.dendrogram
d_test <- rld_top5k_test %>% dist %>% hclust %>% as.dendrogram
d_all <- rld_top5k_all %>% dist %>% hclust %>% as.dendrogram

#compare clusters excluding unclustered to unclustered alone
d_train_test <- dendlist(train = d_train, test = d_test)
d_train_test %>% cor.dendlist
d_train_test %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_train, d_test, k = 2:30, xlim = c(2,30), main = "TCL Train Set vs non-significant clusters Bk plot")

#compare clusters excluding unclustered to clusters using all samples
d_train_all <- dendlist(train = d_train, all = d_all)
d_train_all %>% cor.dendlist
d_train_all %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_train, d_all, k = 2:30, xlim = c(2,30), main = "TCL Train Set vs All Samples Bk plot")

#compare clusters all sample clustering to unclustered alone
d_all_test <- dendlist(all = d_all, test = d_test)
d_all_test %>% cor.dendlist
d_all_test %>% cor.dendlist(method_coef = "spearman")
Bk_plot(d_all, d_test, k = 2:30, xlim = c(2,30), main = "All Samples vs. non-significant clusters Bk plot")
```
3k clusters?

Now some fun with Tanglegrams...
```{r}
pre_tang_d_train_test <- d_train_test %>% ladderize %>% untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_test$train)
pre_tang_d_train_test %>% tanglegram(fast = T, color_lines = train_branches_colors)

pre_tang_d_train_all <- d_train_all %>% ladderize %>%  untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_all$train)
pre_tang_d_train_all %>% tanglegram(fast = T, color_lines = train_branches_colors)

pre_tang_d_all_test <- d_all_test %>% ladderize %>%  untangle %>% set("branches_k_color", k = 3)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_all_test$all)
pre_tang_d_all_test %>% tanglegram(fast = T, color_lines = train_branches_colors)
```
The three different sets lead to drastically different cluster designations. Will attempt to look at overlap between cluster sets though it is somewhat arbitrary which samples end up being the hold-outs.

Only highlight genes clustered in common 
```{r}
#d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist
#d_train_test_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)

#d_train_all_common <- d_train_all %>% prune_common_subtrees.dendlist
#d_train_all_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)

#d_all_test_common <- d_all_test %>% prune_common_subtrees.dendlist
#d_all_test_common %>% untangle %>%  tanglegram(common_subtrees_color_branches = TRUE)
```

Quite a few shared leaves between the training set and all. Let's see what they are... 
```{r}
#d_train_test %>% nleaves
#d_train_test_common %>% nleaves

#d_train_all %>% nleaves
#d_train_all_common %>% nleaves

#d_all_test %>% nleaves
#d_all_test_common %>% nleaves
```
n leaves preserved between the subset and all samples.

Next is to figure a way to define expression/activity groups:
- k-means as before -
or
- dynamic branch cutting -

using top 5k PCA contributors (as determined a reasonable cutoff in the variance plots)
```{r}
#filter normalized counts for top 5k PCA contributors
top5k_all <- filter(peakVar_all, PCA.rank<=5000)$peakID
bot5k_all <- filter(peakVar_all, PCA.rank>115888)$peakID
norm_top5k_all <- norm_all[which((rownames(norm_all) %in% top5k_all)==TRUE),]

rld_top5k_all <- subset(assay(rld_all), rownames(assay(rld_all)) %in% top5k_all)
colnames(norm_top5k_all) <- colnames(rld_top5k_all)
rld_bot5k_all <- subset(assay(rld_all), rownames(assay(rld_all)) %in% bot5k_all)
(row.names(rld_top5k_all) %in% row.names(rld_bot5k_all))==FALSE #check dREGs pulled into top and bottom do not match- should run T for all

#scale() function centers values +/- the mean of each column for normalized counts or rlog normalized counts
scaledata_count_top <- t(scale(t(norm_top5k_all))) # Centers and scales data.
scaledata_count_top <- scaledata_count_top[complete.cases(scaledata_count_top),] # Removes rows with nulls
scaledata_rld_top <- t(scale(t(rld_top5k_all))) # Centers and scales data.
scaledata_rld_top <- scaledata_rld_top[complete.cases(scaledata_rld_top),] # Removes rows with nulls

scaledata_rld_all <- t(scale(t(assay(rld_all))))
scaledata_rld_all <- scaledata_rld_all[complete.cases(scaledata_rld_all),]

scaledata_bot <- t(scale(t(rld_bot5k_all)))
scaledata_bot <- scaledata_bot[complete.cases(scaledata_bot),] 

#also try with relative activity scaling
scaledata_rel_top <- data.frame(row.names=rownames(norm_top5k_all))
scaledata_rel_top$max.dREG.counts <- apply(norm_top5k_all[1:ncol(norm_top5k_all)], 1, max, na.rm=TRUE)
i=1
while (i<=ncol(norm_top5k_all)){
  scaledata_rel_top[paste0(names(norm_top5k_all[i]))] <- norm_top5k_all[,i]/scaledata_rel_top$max.dREG.counts
  i=i+1
}

#Generate column and row hierarchical clustering for each normalization method (all spearman for now)
hr_count <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_count <- hclust(as.dist(1-cor(scaledata_count_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.

hr_rld <- hclust(as.dist(1-cor(t(scaledata_count_top), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rld <- hclust(as.dist(1-cor(scaledata_rld_top, method="spearman")), method="complete") # Clusters columns by Spearman correlation.

hr_bot <- hclust(as.dist(1-cor(t(scaledata_bot), method="spearman")), method="complete")

hr_rel <- hclust(as.dist(1-cor(t(scaledata_rel_top[,c(2:26)]), method="spearman")), method="complete") # Cluster rows by Pearson correlation.
hc_rel <- hclust(as.dist(1-cor(scaledata_rel_top[,c(2:26)], method="spearman")), method="complete") # Clusters columns by Spearman correlation.
```

Dynamic Branch Cutting Test (3-5)
```{r}
library(dendextend)

cnt_cutk3 <- cutree(hr_count,k=3) #cut at specific height h or cut for a specific number of clusters k
cnt_cutk4 <- cutree(hr_count,k=4)
cnt_cutk5 <- cutree(hr_count,k=5)

rld_cutk2 <- cutree(hr_rld,k=2)
rld_cutk3 <- cutree(hr_rld,k=3)
rld_cutk4 <- cutree(hr_rld,k=4)
rld_cutk5 <- cutree(hr_rld,k=5)
rld_cutk6 <- cutree(hr_rld,k=6)
rld_cutk7 <- cutree(hr_rld,k=7)
rld_cutk8 <- cutree(hr_rld,k=8)
rld_cutk9 <- cutree(hr_rld,k=9)
rld_cutk10 <- cutree(hr_rld,k=10)

rld_bot5k_all <- as.integer(rep(0, 5000))
names(rld_bot5k_all) <- bot5k_all

rel_cutk3 <- cutree(hr_rel,k=3)
rel_cutk4 <- cutree(hr_rel,k=4)
rel_cutk5 <- cutree(hr_rel,k=5)

#plot the tree
TreeR_cnt = as.dendrogram(hr_count, method="average")
TreeR_rld = as.dendrogram(hr_rld, method="average")
TreeR_rel = as.dendrogram(hr_rel, method="average")

#define annotation bars
the_bars_k3 <- cbind(cnt_cutk3, rld_cutk3, rel_cutk3)
the_bars_k4 <- cbind(cnt_cutk4, rld_cutk4, rel_cutk4)
the_bars_k5 <- cbind(cnt_cutk5, rld_cutk5, rel_cutk5)

the_bars_cnt <- cbind(cnt_cutk3, cnt_cutk4, cnt_cutk5)
the_bars_rld <- cbind(rld_cutk3, rld_cutk4, rld_cutk5)
the_bars_rel <- cbind(rel_cutk3, rel_cutk4, rel_cutk5)
```

Compare clusters
```{r}
plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=3",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k3, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=4",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k4, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog) k=5",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_k5, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("By Counts","By rLog","By Relative Activity"),cex.rowLabels=0.7)
```
Clustering is exactly the same regardless of method now!! This is great!

Compare clusters
```{r}
plot(TreeR_cnt,
     leaflab = "none",
     main = "dREG Peak Clustering (counts)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_cnt, TreeR_cnt, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rld,
     leaflab = "none",
     main = "dREG Peak Clustering (rlog)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_rld, TreeR_rld, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)

plot(TreeR_rel,
     leaflab = "none",
     main = "dREG Peak Clustering (relative activity)",
     ylab = "Height")

#this makes the bar
colored_bars(the_bars_rel, TreeR_rel, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("k=3","k=4","k=5"),cex.rowLabels=0.7)
```

Let's see how well these clusters separate expression groups associated with different cell line subtypes...
```{r}
data_all <- c("HUT102_ATLL",
                 "ST1_ATLL",
                 "Su9T01_ATLL", 
                 "KARPAS299_ALK+ALCL", 
                 "L82_ALK+ALCL",
                 "SUPM2_ALK+ALCL",
                 "SMZ1_PTCL_NOS",
                 "MYLA_CTCL",
                 "MAC2A_ALK-ALCL",
                 "KIJK_ALK+ALCL",
                 "SR786_ALK+ALCL",
              "FEPD_ALK-ALCL",
                 "KARPAS384_SQGD_TCL",
                 "DL40_ALK-ALCL", 
                 "OCILY12_PTCL_NOS", 
                 "HUT78_CTCL",
                 "HH_CTCL",
                 "MJ_CTCL",
                 "MTA_NKT",
                 "MOTN1_T_LGL",
                 "NKL_NKT",
                 "DERL2_HS_TCL",
              "KHYG1_NKL",
              "SeAx_CTCL",
              "SUDHL1_ALK+ALCL")

#Defining box plot functions
clust.core = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)

#make a dataframe of cores by subtype
d <- data.frame(cbind(data_all, cores)) #subType list from beginning!
colnames(d) <-c("sample", paste0("clust_",1:ncol(cores)))

#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "sample")
#order by subtype
dmolten <- dmolten[order(dmolten$sample),]
dmolten$sample <- ordered(dmolten$sample, levels=unique(dmolten$sample))

# make the plot
p1 <- ggplot(dmolten, aes(x=sample, y=as.numeric(value), col=variable)) + 
  geom_boxplot() +
#  geom_point() + 
#  geom_line() +
#  scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
#  scale_y_continuous() +
  xlab("Sample") +
  ylab("Activity") +
  labs(title= paste0("Cluster Core Expression by Sample (",title,")"),color = "Cluster")
return(p1)
}
```

Make boxplots
```{r}
box(rld_cutk2, scaledata_rld_all, "cutk2 rlog")
box(rld_cutk3, scaledata_rld_all, "cutk3 rlog")
box(rld_cutk4, scaledata_rld_all, "cutk4 rlog")
box(rld_cutk5, scaledata_rld_all, "cutk5 rlog")
box(rld_cutk6, scaledata_rld_all, "cutk6 rlog")
box(rld_cutk7, scaledata_rld_all, "cutk7 rlog")
box(rld_cutk8, scaledata_rld_all, "cutk8 rlog")
box(rld_cutk9, scaledata_rld_all, "cutk9 rlog")
box(rld_cutk10, scaledata_rld_all, "cutk10 rlog")

box(c(rld_cutk3,rld_bot5k_all), rbind(scaledata_rld_all,scaledata_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:26], "cutk3 rel")
box(rel_cutk4, scaledata_rel_top[,2:26], "cutk4 rel")
box(rel_cutk5, scaledata_rel_top[,2:26], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")
box(rel_cutk4, scaledata_count_top, "cutk4 count")
box(rel_cutk5, scaledata_count_top, "cutk5 count")
```

Properly by subtype instead of individual samples
```{r}
data_all <- c("ATLL",
                 "ATLL",
                 "ATLL", 
                 "ALK+ALCL", 
                 "ALK+ALCL",
                 "ALK+ALCL",
                 "PTCL_NOS",
                 "CTCL",
                 "ALK-ALCL",
                 "ALK+ALCL",
                 "ALK+ALCL",
              "ALK-ALCL",
                 "SQGD_TCL",
                 "ALK-ALCL", 
                 "PTCL_NOS", 
                 "CTCL",
                 "CTCL",
                 "CTCL",
                 "NKT",
                 "T_LGL",
                 "NKT",
                 "HS_TCL",
              "NKL",
              "CTCL",
              "ALK+ALCL")

#Defining box plot functions
clust.core = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

box <- function(clusters, scaledata_count_top, title){
cores <- sapply(unique(clusters), clust.core, scaledata_count_top, clusters)

#make a dataframe of cores by subtype
d <- data.frame(cbind(data_all, cores)) #subType list from beginning!
colnames(d) <-c("subtype", paste0("clust_",1:ncol(cores)))

#get the data frame into long format for plotting
dmolten <- melt(d, id.vars = "subtype")
#order by subtype
dmolten <- dmolten[order(dmolten$subtype),]
dmolten$subtype <- ordered(dmolten$subtype, levels=unique(dmolten$subtype))

# make the plot
p1 <- ggplot(dmolten, aes(x=subtype, y=as.numeric(value), col=variable)) + 
  geom_boxplot() +
#  geom_point() + 
#  geom_line() +
#  scale_x_continuous(minor_breaks = NULL, breaks=c(as.numeric(levels(factor(dmolten$subtype))))) +
#  scale_y_continuous() +
  xlab("Subtype") +
  ylab("Activity") +
  labs(title= paste0("Cluster Core Expression by Subtype (",title,")"),color = "Cluster")
return(p1)
}
```

Make boxplots
```{r}
box(rld_cutk2, scaledata_rld_all, "cutk2 rlog")
box(rld_cutk3, scaledata_rld_all, "cutk3 rlog")
box(rld_cutk4, scaledata_rld_all, "cutk4 rlog")
box(rld_cutk5, scaledata_rld_all, "cutk5 rlog")
box(rld_cutk6, scaledata_rld_all, "cutk6 rlog")
box(rld_cutk7, scaledata_rld_all, "cutk7 rlog")
box(rld_cutk8, scaledata_rld_all, "cutk8 rlog")
box(rld_cutk9, scaledata_rld_all, "cutk9 rlog")
box(rld_cutk10, scaledata_rld_all, "cutk10 rlog")

box(c(rld_cutk3,rld_bot5k_all), rbind(scaledata_rld_all,scaledata_bot), "cutk3 + static [\"clust_5\"] rlog")

box(rel_cutk3, scaledata_rel_top[,2:26], "cutk3 rel")
box(rel_cutk4, scaledata_rel_top[,2:26], "cutk4 rel")
box(rel_cutk5, scaledata_rel_top[,2:26], "cutk5 rel")

box(rel_cutk3, scaledata_count_top, "cutk3 count")
box(rel_cutk4, scaledata_count_top, "cutk4 count")
box(rel_cutk5, scaledata_count_top, "cutk5 count")
```

Can't make much sense of unique clustering between the samples or their subtypes. Considering subsetting out original data into clustering and nonclustering samples and moving on with individual cluster treatments for each from there.

Want to make a heatmap with the above clusters and a cluster-trait correlation heatmap...

Trait correlation (edit for this dataset)
```{r}
traits <- data.frame(row.names=colnames(scaledata_rld_all)) 
#here I'm adding vectors of subtype traits to the dataframe
traits$'ATLL' <-     c(1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'ALK+ALCL' <-       c(0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1) #corrected
traits$'ALK-ALCL' <- c(0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'NKT' <-     c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0) #corrected
traits$'CTCL' <-       c(0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0) #corrected
traits$'T_LGL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0) #corrected
traits$'SQGD_TCL' <-     c(0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'NKL' <-       c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0) #corrected
traits$'PTCL_NOS' <- c(0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0) #corrected
traits$'HS_TCL' <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0) #corrected

#Extract the gene/sample numbers
nPeaks = nrow(scaledata_rld_all) 
nSamples = ncol(scaledata_rld_all)

#p value calculation from WGCNA
corPvalueStudent = function(cor, nSamples) {
  T=sqrt(nSamples-2) * cor/sqrt(1-cor^2)
  2*pt(abs(T),nSamples-2, lower.tail = FALSE)
}
```

Make correlation heatmaps
```{r}
nSamples=25

#Finalize clustering and branch cuts
clusters <- rld_cutk2

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=2)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk3

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=3)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk4

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=4)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk5

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=5)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk6

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=6)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk7

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=7)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk8

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=8)",
          xlab = "Traits",
          ylab = "Clusters")

clusters <- rld_cutk9

cores <- sapply(unique(clusters), clust.core, scaledata_rld_all, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=9)",
          xlab = "Traits",
          ylab = "Clusters")

#clusters <- c(rld_cutk4,rld_bop5k_all)

#cores <- sapply(unique(clusters), clust.core, rbind(scaledata_rld_top, scaledata_bot), clusters)
#correlate the cores to traits:
#moduleTraitCor = cor(cores, traits, use= "p")
#moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
#textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  #signif(moduleTraitPvalue, 1), ")", sep= "")
#dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
#par(oma=c(1,0,1,1))
#heatmap.2(moduleTraitCor,
          #dendrogram = "none",
          #Colv =FALSE,
          #scale = c("none"),
          #col="heat.colors",
          #na.rm=TRUE,
          #cellnote = textMatrix,
          #notecol="grey30",
          #notecex=1,
          #trace=c("none"),
          #cexRow = 0.8,
          #cexCol = 0.8,
          #main = "Cluster-Trait Correlation (k=4 + Static [5k])",
          #xlab = "Traits",
          #ylab = "Clusters")
```

Make annotated scaled rlog heatmap
```{r}
rld_df <- as.data.frame(scaledata_rld_top)
rld_df$k3 <- as.data.frame(rld_cutk3)$rld_cutk3
rld_df$k4 <- as.data.frame(rld_cutk4)$rld_cutk4

colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/

pheatmap(rld_top5k_all,
         #rld_df[order(rld_df[,12]),][,1:11], 
         #gaps_row = c(879,2298,4213), 
         cluster_cols=T, 
         cluster_rows=hr_rld,
         #clustering_distance_rows = "euclidean",
         #clustering_method = "complete",
         scale="row",
         cutree_rows=4,
         cutree_cols=3, 
         border_color=NA, 
         show_rownames=F, 
         main="Top 5k dREG Peak PCA Contributors Hierarchical",
         breaks=seq(-2,2,by=0.01),
         color=ocean.balance(401),
         #color=scico(401, palette = 'berlin'),
         #color=colorRampPalette(c("blue", "white", "red"))(41),
         annotation_row=rld_df[26:27], 
         annotation_col=data.frame(row.names=colnames(rld_df[,1:25]), Subtype = data_all), 
         annotation_colors=list(k3=c("blue", "green", "red"),
                                k4=c("purple", "blue", "green", "goldenrod1"), 
                                Subtype=c("ATLL"= 'darkgray', "ALK+ALCL"= 'chocolate4', "CTCL"= 'chartreuse3', "ALK-ALCL"= 'cadetblue', "SQGD_TCL"= 'cyan3', "T_LGL"= 'red', "NKT"= 'lightpink', "HS_TCL"= 'black', "PTCL_NOS"= 'darkgoldenrod1', "NKL"= 'blueviolet'))
         )
```
k means 4 looks like it's picking up some significant groupings between samples. Will move forward with k=4

Redone cluster map with increased cut clusters and ordered by subtype
Make annotated scaled rlog heatmap
```{r}
rld_df2 <- as.data.frame(scaledata_rld_top)
rld_df2$k5 <- as.data.frame(rld_cutk5)$rld_cutk5
rld_df2$k6 <- as.data.frame(rld_cutk6)$rld_cutk6
rld_df2$k7 <- as.data.frame(rld_cutk7)$rld_cutk7
rld_df2$k8 <- as.data.frame(rld_cutk8)$rld_cutk8
rld_df2$k9 <- as.data.frame(rld_cutk9)$rld_cutk9
rld_df2$k10 <- as.data.frame(rld_cutk10)$rld_cutk10

colors <- c('#e6194B','#3cb44b','#ffe119','#4363d8','#f58231','#911eb4','#42d4f4','#f032e6','#bfef45','#fabed4','#469990','#dcbeff','#9A6324','#fffac8','#800000','#aaffc3','#808000','#ffd8b1','#000075','#a9a9a9','#ffffff','#000000')
#High contrast colors for figures was pulled from https://sashamaps.net/docs/resources/20-colors/

rld_df2 <- rld_df2[,c(9,12,14,25,6,5,11,4,10,19,21,23,16,8,24,18,17,22,1:3,20,13,7,15,26:31)]
data <- c("ALK-ALCL","ALK-ALCL","ALK-ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","ALK+ALCL","NKT","NKT","NKL","CTCL","CTCL","CTCL","CTCL","CTCL","HS_TCL","ATLL","ATLL","ATLL","T_LGL","SQGD_TCL","PTCL_NOS","PTCL_NOS")
rld2 <- rld_top5k_all[,c(9,12,14,25,6,5,11,4,10,19,21,23,16,8,24,18,17,22,1:3,20,13,7,15)]

pheatmap(rld2,
         #rld_df2[1:25],
         #gaps_row = c(879,2298,4213), 
         cluster_cols=F, 
         cluster_rows=hr_rld,
         #clustering_distance_rows = "euclidean",
         #clustering_method = "complete",
         scale="row",
         cutree_rows=6,
         cutree_cols=4, 
         border_color=NA, 
         show_rownames=F, 
         main="Top 5k dREG Peak PCA Contributors Hierarchical",
         breaks=seq(-2,2,by=0.01),
         color=ocean.balance(401),
         #color=scico(401, palette = 'berlin'),
         #color=colorRampPalette(c("blue", "white", "red"))(41),
         annotation_row=rld_df2[26:31], 
         annotation_col=data.frame(row.names=colnames(rld_df2[,1:25]), Subtype = data), 
         annotation_colors=list(k5=c("blue", "green", "red", "black", "orange"),
                                k6=c("purple", "blue", "green", "red", "black", "orange"),
                                k7=c("yellow", "purple", "blue", "green", "red", "black", "orange"),
                                k8=c("pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
                                k9=c("white", "pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
                                k10=c("brown", "white", "pink", "yellow", "purple", "blue", "green", "red", "black", "orange"),
                                Subtype=c("ATLL"= 'darkgray', "ALK+ALCL"= 'chocolate4', "CTCL"= 'chartreuse3', "ALK-ALCL"= 'cadetblue', "SQGD_TCL"= 'cyan3', "T_LGL"= 'red', "NKT"= 'lightpink', "HS_TCL"= 'black', "PTCL_NOS"= 'darkgoldenrod1', "NKL"= 'blueviolet'))
         )
```


Save centered rlog data with clusters as table with dREG peak info
```{r}
out <- data.frame(unique(row.names=names(c(rld_cutk4,bot5k_all)), 
                  k2=c(rld_cutk2,bot5k_all), 
                  k3=c(rld_cutk3,bot5k_all), 
                  k4=c(rld_cutk4,bot5k_all)
                  ))

out <- merge(peakVar_all, out, by.x="peakID", by.y=0, all.x=T)
out <- merge(out, scaledata_rld_all, by.x="peakID", by.y=0, all.x=T)

write.table(out, file="branchpoint_output.txt", sep="\t", col.names=T, row.names=F, quote=F)
```

Integrate nearest/associated gene metrics to output above
```{r}
distal <- read.table("../DESeq/outputs/Polyak_parentAll_s0.5p0.025c30_p1000_m1000000_distal_dREGpeak_closestGenes.txt", header=T, sep="\t")

peak2gene1 <- promprox[,c(1,10,11)]
peak2gene2 <- distal[,c(1,10,11)]
colnames(peak2gene2) <- colnames(peak2gene1)
peak2gene2$geneName <- paste0("=\"",substring(peak2gene2$geneName, 2),"\"")
peak2gene3 <- distal[,c(1,21,22)]
colnames(peak2gene3) <- colnames(peak2gene1)
peak2gene3$geneName <- paste0("=\"",substring(peak2gene3$geneName, 2),"\"")
peak2gene <- rbind(peak2gene1,peak2gene2,peak2gene3)

peak2gene <- merge(peak2gene, out[,c(1,18)])
peak2gene <- unique(peak2gene)

write.table(peak2gene, file="branchpoint_assoc_genes.txt", sep="\t", col.names=T, row.names=F, quote=F)
```

Save PDF versions of figures for MS
```{r}
rld_df <- as.data.frame(scaledata_rld)
rld_df$k4 <- as.data.frame(rld_cutk4)$rld_cutk4

pdf("plots/centroid.boxplot.pdf")
box(c(rld_cutk4,rld_bot5k), rbind(scaledata_rld,scaledata_bot), "cutk4 + static [\"clust_5\"] rlog")
dev.off()

clusters <- rld_cutk4

cores <- sapply(unique(clusters), clust.core, scaledata_rld, clusters)
#correlate the cores to traits:
moduleTraitCor = cor(cores, traits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#generate a text matrix of the correlation and pvalue
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
                  signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
#plot a heatmap of the clusters by trait overlaying the corr/pvalue
pdf("plots/cluster-trait.heatmap.pdf")
par(oma=c(1,0,1,1))
heatmap.2(moduleTraitCor,
          dendrogram = "none",
          Colv =FALSE,
          scale = c("none"),
          col="heat.colors",
          na.rm=TRUE,
          cellnote = textMatrix,
          notecol="grey30",
          notecex=1,
          trace=c("none"),
          cexRow = 0.8,
          cexCol = 0.8,
          main = "Cluster-Trait Correlation (k=4)",
          xlab = "Traits",
          ylab = "Clusters")
dev.off()

pdf("plots/activity.heatmap.berlin.pdf")
pheatmap(rld_top5k_all,
         #rld_df[order(rld_df[,12]),][,1:11], 
         #gaps_row = c(879,2298,4213), 
         cluster_cols=T, 
         cluster_rows=hr_rld,
         #clustering_distance_rows = "euclidean",
         #clustering_method = "complete",
         scale="row",
         cutree_rows=4,
         cutree_cols=3, 
         border_color=NA, 
         show_rownames=F, 
         main="Top 5k dREG Peak PCA Contributors Hierarchical",
         breaks=seq(-2,2,by=0.01),
         color=scico(401, palette = 'berlin'),
         #color=colorRampPalette(c("blue", "white", "red"))(41),
         annotation_row=rld_df[12], 
         annotation_col=data.frame(row.names=colnames(rld_df[,1:11]), Subtype = subTypes_all), 
         annotation_colors=list(k4=c("purple", "blue", "green", "goldenrod1"), 
                                Subtype=c("Mesenchymal"='green', "Basal"='red', "Luminal"='blue'))
         )
dev.off()

pdf("plots/activity.heatmap.ocean.pdf")
pheatmap(rld_top5k_all,
         #rld_df[order(rld_df[,12]),][,1:11], 
         #gaps_row = c(879,2298,4213), 
         cluster_cols=T, 
         cluster_rows=hr_rld,
         #clustering_distance_rows = "euclidean",
         #clustering_method = "complete",
         scale="row",
         cutree_rows=4,
         cutree_cols=3, 
         border_color=NA, 
         show_rownames=F, 
         main="Top 5k dREG Peak PCA Contributors Hierarchical",
         breaks=seq(-2,2,by=0.01),
         color=ocean.balance(401),
         annotation_row=rld_df[12], 
         annotation_col=data.frame(row.names=colnames(rld_df[,1:11]), Subtype = subTypes_all), 
         annotation_colors=list(k4=c("purple", "blue", "green", "goldenrod1"), 
                                Subtype=c("Mesenchymal"='green', "Basal"='red', "Luminal"='blue'))
         )
dev.off()
```


#####Now for protein-coding genes##### ##Rerun on 10/21/21

Can we get similar sample clustering with dominant transcript genes or do we still have poor separation of the subtypes as seen before?

Load dds and generate other data tables
```{r}
dds_domTrans <- readRDS("../DESeq/RData/polyakAll_dds_domTrans.rds")

colnames(dds_domTrans) <- paste0(sampleNames_all,"(",subTypes_all,")") #swap sample names
dds_domTrans$sample <- sampleNames_all #swap sample names in metadata
dds_domTrans$type <- subTypes_all #swap sample types in metadata
rld_domTrans <- rlog(dds_domTrans, blind=TRUE)
#colnames(rld_domTrans) <- paste0(sampleNames_all,"(",subTypes_all,")")
sampleDists_domTrans <- dist(t(assay(rld_domTrans)))
sampleDistMtx_domTrans <- as.matrix(sampleDists_domTrans)
rownames(sampleDistMtx_domTrans) <- paste0(rld_domTrans$sample,"_",rld_domTrans$type)
colnames(sampleDistMtx_domTrans) <- NULL
hclust_domTrans <- hclust(sampleDists_domTrans)
```

Do initial dendrograms and determine PCA contributors
```{r}
plotPCA(rld_domTrans, intgroup="type") 
plotPCA(rld_domTrans, intgroup="sample")
pdf("plots/domTrans_hclust.sample.pdf")
plot(hclust_domTrans, labels=rownames(sampleDistMtx_domTrans), main="PRO-seq Dominant Transcript Dendrogram")
dev.off()
plot(hclust_domTrans, labels=rownames(sampleDistMtx_domTrans), main="PRO-seq Dominant Transcript Dendrogram")
```

Try PCA at different depths
```{r}
  plotPCA(rld_domTrans, intgroup="type", ntop=100) + ggtitle("Top 100")
  plotPCA(rld_domTrans, intgroup="type", ntop=200) + ggtitle("Top 200")
  plotPCA(rld_domTrans, intgroup="type", ntop=500) + ggtitle("Top 500")
  plotPCA(rld_domTrans, intgroup="type", ntop=1000) + ggtitle("Top 1000")
  plotPCA(rld_domTrans, intgroup="type", ntop=2000) + ggtitle("Top 2000")
  plotPCA(rld_domTrans, intgroup="type", ntop=5000) + ggtitle("Top 5000")
  plotPCA(rld_domTrans, intgroup="type", ntop=10000) + ggtitle("Top 10000")
  plotPCA(rld_domTrans, intgroup="type", ntop=20000) + ggtitle("Top 20000")
  plotPCA(rld_domTrans, intgroup="type", ntop=32375) + ggtitle("All")
```
Max separation ~500 top variance genes.. in line with expectations. Poor separation of mesenchymal from luminal as reflected in dendrogram.

Rank genes by PCA contribution and measure variance and cumulative variance
```{r}
#Define and rank top PCA contributors
rv <- rowVars(assay(rld_domTrans)) 
select <- order(rv, decreasing=TRUE)
pc <- prcomp(t(assay(rld_domTrans)[select,]))
loadings <- as.data.frame(pc$rotation)
aload <- abs(loadings)
pcRank_domTrans <- data.frame("geneID"=rownames(loadings), "PCA.rank"=seq.int(nrow(loadings)))
sweep(aload, 2, colSums(aload), "/")
###clean up variables
rm(rv)
rm(select)
rm(pc)
rm(loadings)

#Generate list of genes
geneVar <- data.frame("geneID"=rownames(assay(rld_domTrans)))
geneVar$rlog.variance <- rowVars(as.matrix(assay(rld_domTrans)))
geneVar <- merge(pcRank_domTrans, geneVar, by.x="geneID", by.y="geneID")
ggplot(geneVar, aes(x=PCA.rank, y=rlog.variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")

#add plot of fraction of explained variance.. running sum / total vs rank
geneVar <- geneVar[order(geneVar$PCA.rank),]
geneVar$cumulative.Variance <- cumsum(geneVar$rlog.variance)/sum(geneVar$rlog.variance)
ggplot(geneVar, aes(x=PCA.rank, y=cumulative.Variance)) + geom_line() + geom_vline(xintercept=c(500,1000,2000,5000,10000), linetype="dotted")
```
Results:
Cutoff    rlog variance   cumulative variance
500       3.91504         13.3%
1k        3.06535         22.1%
2k        2.209018        35.4%
5k        1.161115        59.8%
10k       0.5271063       79.8%
20k       0.1786067       95.8%
all       0               100%

#Completed re-analysis with corrected subtypes and colors on 10/21/21 AF



#####Try this later...  #skipped 10/21/21
For test and all sets of cell lines, see which clustering method is best at separating k=3
```{r}
get_ordered_3_clusters <- function(dend) {
   cutree(dend, k = 3)[order.dendrogram(dend)]
}

dend_3_clusters <- lapply(iris_dendlist, get_ordered_3_clusters)

compare_clusters_to_iris <- function(clus) {FM_index(clus, rep(1:3, each = 50), assume_sorted_vectors = TRUE)}

clusters_performance <- sapply(dend_3_clusters, compare_clusters_to_iris)
dotchart(sort(clusters_performance), xlim = c(0.7,1),
         xlab = "Fowlkes-Mallows Index (from 0 to 1)",
         main = "Perormance of clustering algorithms \n in detecting the 3 species",
         pch = 19)
```

####Don't think this is needed here... #skipped 10/21/21
Height Branch Cutting Test
```{r}
rld_cuth1.5 <- cutree(hr,h=1.5)
rld_cuth1.6 <- cutree(hr,h=1.6)
rld_cuth1.7 <- cutree(hr,h=1.7)

plot(TreeR,
     leaflab = "none",
     main = "Gene Clustering",
     ylab = "Height")
the_bars2 <- cbind(cuth1.5, cuth1.6, cuth1.7, cuth1.8, cuth1.85, cuth1.9)
colored_bars(the_bars2, TreeR, sort_by_labels_order = T, y_shift=-0.1, rowLabels = c("h=1.5","h=1.6","h=1.7","h=1.8", "h=1.85", "h=1.9"),cex.rowLabels=0.7)

#this will add lines showing the cut heights
abline(h=1.5, lty = 2, col="grey")
abline(h=1.6, lty = 2, col="grey")
abline(h=1.7, lty = 2, col="grey")
abline(h=1.8, lty = 2, col="grey")
abline(h=1.85, lty = 2, col="grey")
abline(h=1.9, lty = 2, col="grey")
```