pyscenic_clustering

setwd("/Users/Daniela/Desktop/Math/third_lab/pyscenic")

library(Seurat)

## Attaching SeuratObject

library(data.table)
library(MASS)
library(circlize)

## ========================================
## circlize version 0.4.13
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

library(multcomp)

## Loading required package: mvtnorm

## Loading required package: survival

## Loading required package: TH.data

## 
## Attaching package: 'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

library(rjson)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RColorBrewer)
library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(circlize)

seurat = load(file = "harmony_BT474.rda")

binary = read.csv("binarization.csv",row.names = 1)

colnames(binary) = gsub(pattern = "...$",replacement = "", x  = colnames(binary))

seurat_meta = AddMetaData(hnc, binary)

#AUC scores split by RNA_snn_res.0.5
Idents(seurat_meta) = 'RNA_snn_res.0.5'

data_to_write_out = as.data.frame(as.matrix(seurat_meta@meta.data))

#10 cluster contrast
K = rbind("c0 - others" = c( 9, -1, -1, -1, -1, -1, -1, -1, -1, -1),     
          "c1 - others" = c( -1, 9, -1, -1, -1, -1, -1, -1, -1, -1),
          "c2 - others" = c( -1, -1, 9, -1, -1, -1, -1, -1, -1, -1),
          "c3 - others" = c( -1, -1, -1, 9, -1, -1, -1, -1, -1, -1),
          "c4 - others" = c( -1, -1, -1, -1, 9, -1, -1, -1, -1, -1),
          "c5 - others" = c( -1, -1, -1, -1, -1, 9, -1, -1, -1, -1),
          "c6 - others" = c( -1, -1, -1, -1, -1, -1, 9, -1, -1, -1),
          "c7 - others" = c( -1, -1, -1, -1, -1, -1, -1, 9, -1, -1),
          "c8 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, 9, -1),
          "c9 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, -1, 9))

#COMBINE 4 HEATMAPS 

#type 1------------------------------------
#check how many clusters: type 1 has no cluster c9
data_to_write_out1 = data_to_write_out[data_to_write_out$type=="1.DMSO",]
data_to_write_out1$clusters = as.factor(data_to_write_out1$RNA_snn_res.0.5)
levels(data_to_write_out1$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8")

#330 x 10
#cluster 0
result1 = vector("list",331)

#9 cluster contrast
K1 = rbind("c0 - others" = c( 8, -1, -1, -1, -1, -1, -1, -1, -1),     
          "c1 - others" = c( -1, 8, -1, -1, -1, -1, -1, -1, -1),
          "c2 - others" = c( -1, -1, 8, -1, -1, -1, -1, -1, -1),
          "c3 - others" = c( -1, -1, -1, 8, -1, -1, -1, -1, -1),
          "c4 - others" = c( -1, -1, -1, -1, 8, -1, -1, -1, -1),
          "c5 - others" = c( -1, -1, -1, -1, -1, 8, -1, -1, -1),
          "c6 - others" = c( -1, -1, -1, -1, -1, -1, 8, -1, -1),
          "c7 - others" = c( -1, -1, -1, -1, -1, -1, -1, 8, -1),
          "c8 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, 8))

for(i in 1:331){
  g1 = glm(as.numeric(data_to_write_out1[,26+i]) ~ clusters,
          data= data_to_write_out1, family = "binomial") #GLM with contrast

  mult1 = glht(g1, linfct = mcp(clusters = K1))
  s1 = summary(mult1, test=adjusted("hochberg"))
  result1[[i]] = s1$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df1 = data.frame(matrix(unlist(result1), nrow=length(result1), byrow=TRUE))

colnames(df1) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9")
all1 = colnames(data_to_write_out1)
rownames(df1) = all1[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df1)[i] = paste0(rownames(df1)[i]," (",genes[i]," genes)") 
}

#type 2------------------------------------
#type 2 has no cluster c9
data_to_write_out2 = data_to_write_out[data_to_write_out$type=="2.fullvestrant",]
data_to_write_out2$clusters = as.factor(data_to_write_out2$RNA_snn_res.0.5)
levels(data_to_write_out2$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8")

#330 x 10
#cluster 0
result2 = vector("list",331)

K2 = rbind("c0 - others" = c( 8, -1, -1, -1, -1, -1, -1, -1, -1),     
           "c1 - others" = c( -1, 8, -1, -1, -1, -1, -1, -1, -1),
          "c2 - others" = c( -1, -1, 8, -1, -1, -1, -1, -1, -1),
           "c3 - others" = c( -1, -1, -1, 8, -1, -1, -1, -1, -1),
           "c4 - others" = c( -1, -1, -1, -1, 8, -1, -1, -1, -1),
           "c5 - others" = c( -1, -1, -1, -1, -1, 8, -1, -1, -1),
           "c6 - others" = c( -1, -1, -1, -1, -1, -1, 8, -1, -1),
           "c7 - others" = c( -1, -1, -1, -1, -1, -1, -1, 8, -1),
           "c8 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, 8))

for(i in 1:331){
  g2 = glm(as.numeric(data_to_write_out2[,26+i]) ~ clusters,
           data= data_to_write_out2, family = "binomial") #GLM with contrast
  
  mult2 = glht(g2, linfct = mcp(clusters = K2))
  s2 = summary(mult2, test=adjusted("hochberg"))
  result2[[i]] = s2$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df2 = data.frame(matrix(unlist(result2), nrow=length(result2), byrow=TRUE))

colnames(df2) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9")
all2 = colnames(data_to_write_out2)
rownames(df2) = all2[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df2)[i] = paste0(rownames(df2)[i]," (",genes[i]," genes)") 
}

#type 3------------------------------------
#c3 has all 10 clusters
data_to_write_out3 = data_to_write_out[data_to_write_out$type=="3.neratinib",]
data_to_write_out3$clusters = as.factor(data_to_write_out3$RNA_snn_res.0.5)
levels(data_to_write_out3$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8","c9")

#330 x 10
#cluster 0
result3 = vector("list",331)

for(i in 1:331){
  g3 = glm(as.numeric(data_to_write_out3[,26+i]) ~ clusters,
           data= data_to_write_out3, family = "binomial") #GLM with contrast
  
  mult3 = glht(g3, linfct = mcp(clusters = K))
  s3 = summary(mult3, test=adjusted("hochberg"))
  result3[[i]] = s3$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df3 = data.frame(matrix(unlist(result3), nrow=length(result3), byrow=TRUE))

colnames(df3) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","c10")
all3 = colnames(data_to_write_out3)
rownames(df3) = all3[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df3)[i] = paste0(rownames(df3)[i]," (",genes[i]," genes)") 
}

#type 4------------------------------------
#type 4 has all 10 clusters
data_to_write_out4 = data_to_write_out[data_to_write_out$type=="4.both",]
data_to_write_out4$clusters = as.factor(data_to_write_out4$RNA_snn_res.0.5)
levels(data_to_write_out4$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8","c9","c10")

#330 x 10
#cluster 0
result4 = vector("list",331)

for(i in 1:331){
  g4 = glm(as.numeric(data_to_write_out4[,26+i]) ~ clusters,
           data= data_to_write_out4, family = "binomial") #GLM with contrast
  
  mult4 = glht(g4, linfct = mcp(clusters = K))
  s4 = summary(mult4, test=adjusted("hochberg"))
  result4[[i]] = s4$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df4 = data.frame(matrix(unlist(result4), nrow=length(result4), byrow=TRUE))

colnames(df4) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","c10")
all4 = colnames(data_to_write_out4)
rownames(df4) = all4[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df4)[i] = paste0(rownames(df4)[i]," (",genes[i]," genes)") 
}

#filter rows
df1$type = "type1"
df2$type = "type2"
zeros = rep(0,331)
df1$c10 = zeros
df2$c10 = zeros
df3$type = "type3"
df4$type = "type4"

df_new = rbind(df1,df2,df3,df4)

colnames(df_new) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","type","c10")

df_new = df_new %>% filter(abs(c1) > 3 | abs(c2)> 3 | abs(c3) > 3 | abs(c4) > 3 | abs(c5) > 3 | abs(c6) > 3 | abs(c7) > 3 | abs(c8) > 3 | abs(c9) > 3 | abs(c10) > 3)

final = rownames(df_new[1:44,])

#COMBINE 4 HEATMAPS ---------------------

#type 1------------------------------------
#check how many clusters: type 1 has no cluster c9
data_to_write_out1 = data_to_write_out[data_to_write_out$type=="1.DMSO",]
data_to_write_out1$clusters = as.factor(data_to_write_out1$RNA_snn_res.0.5)
levels(data_to_write_out1$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8")

#330 x 10
#cluster 0
result1 = vector("list",331)

K1 = rbind("c0 - others" = c( 8, -1, -1, -1, -1, -1, -1, -1, -1),     
           "c1 - others" = c( -1, 8, -1, -1, -1, -1, -1, -1, -1),
           "c2 - others" = c( -1, -1, 8, -1, -1, -1, -1, -1, -1),
           "c3 - others" = c( -1, -1, -1, 8, -1, -1, -1, -1, -1),
           "c4 - others" = c( -1, -1, -1, -1, 8, -1, -1, -1, -1),
           "c5 - others" = c( -1, -1, -1, -1, -1, 8, -1, -1, -1),
           "c6 - others" = c( -1, -1, -1, -1, -1, -1, 8, -1, -1),
           "c7 - others" = c( -1, -1, -1, -1, -1, -1, -1, 8, -1),
           "c8 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, 8))

for(i in 1:331){
  g1 = glm(as.numeric(data_to_write_out1[,26+i]) ~ clusters,
           data= data_to_write_out1, family = "binomial") #GLM with contrast
  
  mult1 = glht(g1, linfct = mcp(clusters = K1))
  s1 = summary(mult1, test=adjusted("hochberg"))
  result1[[i]] = s1$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df1 = data.frame(matrix(unlist(result1), nrow=length(result1), byrow=TRUE))

colnames(df1) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9")
all1 = colnames(data_to_write_out1)
rownames(df1) = all1[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df1)[i] = paste0(rownames(df1)[i]," (",genes[i]," genes)") 
}

#heatmap
df1 = df1[rownames(df1) %in% final,]

hoch_mat1 = as.matrix(df1)

#same for all legends
col_fun = colorRamp2(c(-10, 0, 10), c("blue", "white", "red"))

cn = c(1:9)
c1 = Heatmap(hoch_mat1, column_order = sort(colnames(hoch_mat1)), row_order = sort(rownames(hoch_mat1)),show_column_names = FALSE, show_column_dend = FALSE, show_row_dend = F,
             top_annotation = HeatmapAnnotation(
               text = anno_text(cn, rot = 0, just = "center"),
               annotation_height = max_text_width(cn)
             ), column_title = "DMSO",
             rect_gp = gpar(col = "white", lwd = 1),
             heatmap_legend_param = list(
               col_fun = col_fun, title=c("Z score"),
               show_heatmap_legend = T)
)

#type 2------------------------------------
#type 2 has no cluster c9
data_to_write_out2 = data_to_write_out[data_to_write_out$type=="2.fullvestrant",]
data_to_write_out2$clusters = as.factor(data_to_write_out2$RNA_snn_res.0.5)
levels(data_to_write_out2$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8")

#330 x 10
#cluster 0
result2 = vector("list",331)

K2 = rbind("c0 - others" = c( 8, -1, -1, -1, -1, -1, -1, -1, -1),     
           "c1 - others" = c( -1, 8, -1, -1, -1, -1, -1, -1, -1),
           "c2 - others" = c( -1, -1, 8, -1, -1, -1, -1, -1, -1),
           "c3 - others" = c( -1, -1, -1, 8, -1, -1, -1, -1, -1),
           "c4 - others" = c( -1, -1, -1, -1, 8, -1, -1, -1, -1),
           "c5 - others" = c( -1, -1, -1, -1, -1, 8, -1, -1, -1),
           "c6 - others" = c( -1, -1, -1, -1, -1, -1, 8, -1, -1),
           "c7 - others" = c( -1, -1, -1, -1, -1, -1, -1, 8, -1),
           "c8 - others" = c( -1, -1, -1, -1, -1, -1, -1, -1, 8))

for(i in 1:331){
  
  g2 = glm(as.numeric(data_to_write_out2[,26+i]) ~ clusters,
           data= data_to_write_out2, family = "binomial") #GLM with contrast
  
  mult2 = glht(g2, linfct = mcp(clusters = K2))
  s2 = summary(mult2, test=adjusted("hochberg"))
  result2[[i]] = s2$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df2 = data.frame(matrix(unlist(result2), nrow=length(result2), byrow=TRUE))

colnames(df2) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9")
all2 = colnames(data_to_write_out2)
rownames(df2) = all2[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df2)[i] = paste0(rownames(df2)[i]," (",genes[i]," genes)") 
}

#heatmap
df2 = df2[rownames(df2) %in% final,]
df2 = df2[,1:9]

hoch_mat2 = as.matrix(df2)

cn = c(1:9)
c2 = Heatmap(hoch_mat2, show_column_names = FALSE, column_order = sort(colnames(hoch_mat2)), row_order = sort(rownames(hoch_mat2)),show_column_dend = FALSE, show_row_dend = F,
             top_annotation = HeatmapAnnotation(
               text = anno_text(cn, rot = 0, just = "center"),
               annotation_height = max_text_width(cn)
             ), column_title = "Fullvestrant",
             rect_gp = gpar(col = "white", lwd = 1),
             show_heatmap_legend = F,
             heatmap_legend_param = list(
               col_fun = col_fun, title=c("Z score"))
) 

#type 3------------------------------------
#c3 has all 10 clusters
data_to_write_out3 = data_to_write_out[data_to_write_out$type=="3.neratinib",]
data_to_write_out3$clusters = as.factor(data_to_write_out3$RNA_snn_res.0.5)
levels(data_to_write_out3$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8","c9")

#330 x 10
#cluster 0
result3 = vector("list",331)

for(i in 1:331){
  g3 = glm(as.numeric(data_to_write_out3[,26+i]) ~ clusters,
           data= data_to_write_out3, family = "binomial") #GLM with contrast
  
  mult3 = glht(g3, linfct = mcp(clusters = K))
  s3 = summary(mult3, test=adjusted("hochberg"))
  result3[[i]] = s3$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df3 = data.frame(matrix(unlist(result3), nrow=length(result3), byrow=TRUE))

colnames(df3) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","c99")
all3 = colnames(data_to_write_out3)
rownames(df3) = all3[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df3)[i] = paste0(rownames(df3)[i]," (",genes[i]," genes)") 
}

#heatmap
df3 = df3[rownames(df3) %in% final,]

hoch_mat3 = as.matrix(df3)

cn = c(1:10)
c3 = Heatmap(hoch_mat3, show_column_names = FALSE, column_order = sort(colnames(hoch_mat3)), row_order = sort(rownames(hoch_mat3)),show_column_dend = FALSE, show_row_dend = F,
             top_annotation = HeatmapAnnotation(
               text = anno_text(cn, rot = 0, just = "center"),
               annotation_height = max_text_width(cn)
             ), column_title = "Neratinib",
             rect_gp = gpar(col = "white", lwd = 1),
             show_heatmap_legend = F,
             heatmap_legend_param = list(
               col_fun = col_fun, title=c("Z score"))
)

#type 4------------------------------------
#type 4 has all 10 clusters
data_to_write_out4 = data_to_write_out[data_to_write_out$type=="4.both",]
data_to_write_out4$clusters = as.factor(data_to_write_out4$RNA_snn_res.0.5)
levels(data_to_write_out4$clusters) = c("c0","c1","c2","c3","c4","c5","c6","c7","c8","c9","c99")
#c99 is 10, used for ordering

#330 x 10
#cluster 0
result4 = vector("list",331)

for(i in 1:331){
  g4 = glm(as.numeric(data_to_write_out4[,26+i]) ~ clusters,
           data= data_to_write_out4, family = "binomial") #GLM with contrast
  
  mult4 = glht(g4, linfct = mcp(clusters = K))
  s4 = summary(mult4, test=adjusted("hochberg"))
  result4[[i]] = s4$test$tstat
}

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

#convert to dataframe
df4 = data.frame(matrix(unlist(result4), nrow=length(result4), byrow=TRUE))

colnames(df4) = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","c99")
all4 = colnames(data_to_write_out4)
rownames(df4) = all4[27:357]

#target genes
# Give the input file name to the function.
regulons = fromJSON(file = "regulons.json")

genes = lengths(regulons)

for(i in 1:331){
  rownames(df4)[i] = paste0(rownames(df4)[i]," (",genes[i]," genes)") 
}

#heatmap
df4 = df4[rownames(df4) %in% final,]

hoch_mat4 = as.matrix(df4)

cn = c(1:10)
c4 = Heatmap(hoch_mat4, show_column_names = FALSE, column_order = sort(colnames(hoch_mat4)), row_order = sort(rownames(hoch_mat4)),show_column_dend = FALSE, show_row_dend = F,
             top_annotation = HeatmapAnnotation(
               text = anno_text(cn, rot = 0, just = "center"),
               annotation_height = max_text_width(cn)
             ), column_title = "Both",
             rect_gp = gpar(col = "white", lwd = 1),
             show_heatmap_legend = F,
             heatmap_legend_param = list(
               col_fun = col_fun, title=c("Z score"))
)

#combine
c5 = c1 + c2 + c3 + c4

#plot
c5

#8, 50
pdf("heatmap_demo.pdf", width = 12, height = 10)
draw(c5, show_heatmap_legend = TRUE)
dev.off()

## quartz_off_screen 
##                 2

pyscenic_clustering_demo.R

Daniela

2021-11-15