1 Functions

source_from_github(repositoy = "cNMF_functions",version = "0.3.87",script_name = "cnmf_function_Harmony.R")
ℹ SHA-1 hash of file is e4b1a5e086d635d32ada87c261558383601de712
source_from_github(repositoy = "HMSC_functions",version = "0.1.14",script_name = "functions.R")
ℹ SHA-1 hash of file is 9ec21e2e1fdc9a7b465400b3111b50804a136cf3

2 Data

acc1_cancer_cells = readRDS("./Data/acc1_cancer_cells_15KnCount_V4.RDS")

3 Original UMAP

DimPlot(object = acc1_cancer_cells,pt.size = 2)


comparisons <- list(c("ACC.plate2", "ACC1.P3"))


plt2 = VlnPlot(object = acc1_cancer_cells,group.by= "orig.ident",features = "nFeature_RNA")+ stat_compare_means(comparisons = comparisons, label = "p.signif",label.y = 10000)

plt1+plt2
Warning: Removed 3 rows containing missing values (geom_signif).

4 Seurat intergration

acc1_cancer_cells.list <- SplitObject(acc1_cancer_cells, split.by = "plate")

# normalize and identify variable features for each dataset independently
acc1_cancer_cells.list <- lapply(X = acc1_cancer_cells.list, FUN = function(x) {
    # x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 1000)
})

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = acc1_cancer_cells.list,nfeatures = 1000)
acc.anchors <- FindIntegrationAnchors(object.list = acc1_cancer_cells.list, anchor.features = features,k.filter = 50)
acc.combined <- IntegrateData(anchorset = acc.anchors,k.weight = 50,features.to.integrate = rownames(acc1_cancer_cells),preserve.order = F)
Merging dataset 1 into 2
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
DefaultAssay(acc.combined) <- "integrated"
acc.combined <- ScaleData(acc.combined, verbose = FALSE)
acc.combined <- RunPCA(acc.combined, npcs = 30, verbose = FALSE)
ElbowPlot(acc.combined)

acc.combined <- RunUMAP(acc.combined, reduction = "pca", dims = 1:10)
acc.combined <- FindNeighbors(acc.combined, reduction = "pca", dims = 1:10)
acc.combined <- FindClusters(acc.combined, resolution = 0.5)

5 UMAPS

DimPlot(acc.combined, reduction = "umap", group.by = "plate")

6 clusters DEG

acc_deg <- FindMarkers(acc.combined, ident.1 = "0",ident.2 = "1",features = VariableFeatures(acc.combined),densify = T)
enrichment_analysis(acc_deg,background = VariableFeatures(acc.combined),fdr_Cutoff = 0.01,ident.1 = "0",ident.2 = "1",show_by = 1)

7 MYB-HPV

myb_vs_hpv = FetchData(object = acc.combined,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "t.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=12, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")

8 HPV-MYB per plate

plate_1 = subset(acc.combined,subset = plate == "ACC.plate2")
myb_vs_hpv = FetchData(object = plate_1,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

p = ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=15, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")+ggtitle("ACC.plate2")

plate_2 = subset(acc.combined,subset = plate == "ACC1.P3")
myb_vs_hpv = FetchData(object = plate_2,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

p+ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=15, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")+ggtitle("ACC1.P3")

8.1 myo-lum score

original_myo_genes = c( "TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI" )
calculate_score(dataset = acc.combined,myo_genes = original_myo_genes,lum_genes = original_lum_genes)
correlation of lum score and myo score: -0.07
correlation of lum score and original lum score: 1
correlation of myo score and original myo score: 1

myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc.combined, genes = myo_protein_markers,threshold = 0.35)
1
[1] 1
print("Number of genes = " %>% paste(length(top_myo)))
[1] "Number of genes =  15"
message("Names of genes:")
Names of genes:
top_myo %>% head(30)
 [1] "COL16A1"     "RP1-39G22.4" "CD200"       "MYLK"        "TP63"        "KCNMB1"      "ADAMTS2"     "CLIC3"       "SNCG"        "ACTA2"      
[11] "TAGLN"       "MIR7974"     "CNN1"        "MYL9"        "POM121L9P"  
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_myo,original_myo_genes) 
[1] "MYLK"  "TP63"  "ACTA2" "TAGLN"
lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc.combined, genes = lum_protein_markers,threshold = 0.35)
Warning in cor(expression %>% t(), markers_average) :
  the standard deviation is zero
print("Number of genes = " %>% paste(length(top_lum)))
[1] "Number of genes =  8"
message("Names of genes:")
Names of genes:
top_lum %>% head(30)
[1] "MAP2"    "KIT"     "GLRB"    "SLC29A1" "SASH1"   "ALDH3B2" "CCND1"   "NDFIP2" 
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_lum,original_lum_genes) 
[1] "KIT"
cd_features <- list(c("MYLK" ,"TP63" , "ACTA2" ,"TAGLN","CNN1", "TP63","ACTA2"))
pbmc_small <- AddModuleScore(
  object = acc.combined,
  features = cd_features,
  ctrl = 5,
  name = 'myo_Features'
)

FeaturePlot(object = pbmc_small,features = "myo_Features1")

cd_features <- list(top_lum)
pbmc_small <- AddModuleScore(
  object = pbmc_small,
  features = cd_features,
  ctrl = 5,
  name = 'lum_Features'
)

FeaturePlot(object = pbmc_small,features = "lum_Features1")

cor(pbmc_small$lum_Features1,pbmc_small$myo_Features1)
[1] -0.118548

9 Harmony + cNMF

from cnmf import cNMF
ModuleNotFoundError: No module named 'cnmf'
selected_k = 3
density_threshold = 0.1
cnmf_obj.consensus(k=selected_k, density_threshold=density_threshold,show_clustering=True)
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_k, density_threshold=density_threshold)
gep_scores = py$gep_scores
gep_tpm = py$gep_tpm
all_metagenes= py$usage_norm

10 Harmony results

# Make metagene names
for (i in 1:ncol(all_metagenes)) {
  colnames(all_metagenes)[i] = "metagene." %>% paste0(i)
}

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = metage_metadata)
}
print_tab(plt = 
            FeaturePlot(object = acc1_cancer_cells,features = colnames(all_metagenes),combine = T),
          title = "metagenes expression")

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  lung_corr_nonneg = AddMetaData(object = lung_corr_nonneg,metadata = metage_metadata)
}
print_tab(plt = 
            FeaturePlot(object = lung_corr_nonneg,features = colnames(all_metagenes),combine = T),
          title = "metagenes expression")
---
title: '`r rstudioapi::getSourceEditorContext()$path %>% basename() %>% gsub(pattern = "\\.Rmd",replacement = "")`' 
author: "Avishai Wizel"
date: '`r Sys.time()`'
output: 
  html_notebook: 
    code_folding: hide
    toc: yes
    toc_collapse: yes
    toc_float: 
      collapsed: FALSE
    number_sections: true
    toc_depth: 1
---



# Functions

```{r warning=FALSE}
source_from_github(repositoy = "cNMF_functions",version = "0.3.87",script_name = "cnmf_function_Harmony.R")
source_from_github(repositoy = "HMSC_functions",version = "0.1.14",script_name = "functions.R")
```

# Data

```{r}
acc1_cancer_cells = readRDS("./Data/acc1_cancer_cells_15KnCount_V4.RDS")
```



# Original UMAP
```{r}
DimPlot(object = acc1_cancer_cells,pt.size = 2)
```

```{r fig.height=10, fig.width=8}

comparisons <- list(c("ACC.plate2", "ACC1.P3"))


plt2 = VlnPlot(object = acc1_cancer_cells,group.by= "orig.ident",features = "nFeature_RNA")+ stat_compare_means(comparisons = comparisons, label = "p.signif",label.y = 10000)

plt1+plt2
```

# Seurat intergration
```{r}
acc1_cancer_cells.list <- SplitObject(acc1_cancer_cells, split.by = "plate")

# normalize and identify variable features for each dataset independently
acc1_cancer_cells.list <- lapply(X = acc1_cancer_cells.list, FUN = function(x) {
    # x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 1000)
})

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = acc1_cancer_cells.list,nfeatures = 1000)
acc.anchors <- FindIntegrationAnchors(object.list = acc1_cancer_cells.list, anchor.features = features,k.filter = 50)

```

```{r}
acc.combined <- IntegrateData(anchorset = acc.anchors,k.weight = 50,features.to.integrate = rownames(acc1_cancer_cells),preserve.order = F)
DefaultAssay(acc.combined) <- "integrated"

```

```{r}
acc.combined <- ScaleData(acc.combined, verbose = FALSE)
acc.combined <- RunPCA(acc.combined, npcs = 30, verbose = FALSE)
ElbowPlot(acc.combined)
```

```{r message=FALSE, warning=FALSE}
acc.combined <- RunUMAP(acc.combined, reduction = "pca", dims = 1:10)
acc.combined <- FindNeighbors(acc.combined, reduction = "pca", dims = 1:10)
acc.combined <- FindClusters(acc.combined, resolution = 0.5)
```

# UMAPS
```{r}
DimPlot(acc.combined, reduction = "umap", group.by = "plate")
```
# clusters DEG
```{r}
acc_deg <- FindMarkers(acc.combined, ident.1 = "0",ident.2 = "1",features = VariableFeatures(acc.combined),densify = T)
```

```{r}
enrichment_analysis(acc_deg,background = VariableFeatures(acc.combined),fdr_Cutoff = 0.01,ident.1 = "0",ident.2 = "1",show_by = 1)
```




# MYB-HPV
```{r}
myb_vs_hpv = FetchData(object = acc.combined,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "t.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=12, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")

```
# HPV-MYB per plate
```{r}
plate_1 = subset(acc.combined,subset = plate == "ACC.plate2")
myb_vs_hpv = FetchData(object = plate_1,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

p = ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=15, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")+ggtitle("ACC.plate2")

plate_2 = subset(acc.combined,subset = plate == "ACC1.P3")
myb_vs_hpv = FetchData(object = plate_2,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

p+ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))+ stat_summary(fun.data = function(x) data.frame(y=15, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("log2(MYB)")+ggtitle("ACC1.P3")
```

## myo-lum score
```{r}
original_myo_genes = c( "TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI" )
```


```{r}
calculate_score(dataset = acc.combined,myo_genes = original_myo_genes,lum_genes = original_lum_genes)
```
```{r warning=FALSE, collapse=T}
myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc.combined, genes = myo_protein_markers,threshold = 0.35)
print("Number of genes = " %>% paste(length(top_myo)))
message("Names of genes:")
top_myo %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_myo,original_myo_genes) 
```
```{r}
lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc.combined, genes = lum_protein_markers,threshold = 0.35)
print("Number of genes = " %>% paste(length(top_lum)))
message("Names of genes:")
top_lum %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_lum,original_lum_genes) 
```
```{r}
cd_features <- list(c("MYLK" ,"TP63" , "ACTA2" ,"TAGLN","CNN1", "TP63","ACTA2"))
pbmc_small <- AddModuleScore(
  object = acc.combined,
  features = cd_features,
  ctrl = 5,
  name = 'myo_Features'
)

FeaturePlot(object = pbmc_small,features = "myo_Features1")
```

```{r}
cd_features <- list(top_lum)
pbmc_small <- AddModuleScore(
  object = pbmc_small,
  features = cd_features,
  ctrl = 5,
  name = 'lum_Features'
)

FeaturePlot(object = pbmc_small,features = "lum_Features1")
```
 
```{r}
cor(pbmc_small$lum_Features1,pbmc_small$myo_Features1)
```

```{r}
a = merge(acc_cancerCells_noACC1@assays$RNA@data,acc.combined@assays$integrated@data)
```





# Harmony + cNMF
```{python}
from cnmf import cNMF
import pickle
nfeatures = "2K"
f = open('./Data/cNMF/HMSC_cNMF_harmony_2Kvargenes/cnmf_obj.pckl', 'rb')
cnmf_obj = pickle.load(f)
f.close()
```


```{python}
selected_k = 3
density_threshold = 0.1
cnmf_obj.consensus(k=selected_k, density_threshold=density_threshold,show_clustering=True)
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_k, density_threshold=density_threshold)
```

```{r}
gep_scores = py$gep_scores
gep_tpm = py$gep_tpm
all_metagenes= py$usage_norm
```

# Harmony results {.tabset}
```{r fig.height=10, fig.width=10, results='asis'}
# Make metagene names
for (i in 1:ncol(all_metagenes)) {
  colnames(all_metagenes)[i] = "metagene." %>% paste0(i)
}

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = metage_metadata)
}
print_tab(plt = 
            FeaturePlot(object = acc1_cancer_cells,features = colnames(all_metagenes),combine = T),
          title = "metagenes expression")

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  lung_corr_nonneg = AddMetaData(object = lung_corr_nonneg,metadata = metage_metadata)
}
print_tab(plt = 
            FeaturePlot(object = lung_corr_nonneg,features = colnames(all_metagenes),combine = T),
          title = "metagenes expression")
```


<script src="https://hypothes.is/embed.js" async></script>

