data = read.csv("https://data.broadinstitute.org/Trinity/TMP/bhaas/assembly_metrics.csv", header=T, com="#")
examine correlation structure across all data
org_program = paste(data$Organism, data$Program, sep=",")
org_prog_data = data
rownames(org_prog_data) = org_program
org_prog_data = org_prog_data[,-c(1,2)]
org_prog_data_scaled = scale(org_prog_data)
org_prog_data_cor = cor(org_prog_data_scaled, use='complete.obs')
#correlations can be pos or neg, showing all as pos here.
pheatmap(abs(org_prog_data_cor), show_colnames = T, show_rownames = F)

examine metric correlations by organism
data_split_by_org = split(data, data[,1])
for (organism in names(data_split_by_org)) {
message(organism)
org_data = data_split_by_org[[organism]]
rownames(org_data) = org_data$Program
org_data = org_data[,-c(1,2)] #remove organism and prog name
org_data = scale(org_data)
NA_cols = which(apply(org_data, 2, function(x) {all(is.na(x))}) == TRUE)
if (length(NA_cols) > 0) {
org_data = org_data[, -1 * NA_cols] # remove columns with just NA values
}
correlations = cor(org_data, use="complete.obs")
correlations[is.na(correlations)] = 0
correlations = abs(correlations)
pheatmap(correlations, main=paste0(c(organism, " metrics")), show_colnames = T, show_rownames = F)
}








