1 Data

lung = readRDS("./Data/lung_cancercells_withTP_onlyPatients.rds")
lung_patients = lung$patient.ident %>% unique() %>% as.character()
lung_patients_filtered = lung_patients[!(lung_patients %in% c("X1055new","X1099"))] # remove patients with less than 100 malignant cells
lung = subset(x = lung,subset = patient.ident %in% lung_patients_filtered)
suffix ="xeno_genes_0-5sigma_2-7theta_100iter_26_9"
from cnmf import cNMF
suffix = r.suffix
import pickle
f = open('./Data/cnmf/cnmf_objects/patients_' + suffix + '_cnmf_obj.pckl', 'rb')
cnmf_obj = pickle.load(f)
f.close()

2 Functions

library(stringi)
library(reticulate)
source_from_github(repositoy = "DEG_functions",version = "0.2.47")
source_from_github(repositoy = "cNMF_functions",version = "0.4.02",script_name = "cnmf_functions_V3.R")
source_from_github(repositoy = "sc_general_functions",version = "0.1.28",script_name = "functions.R")
genesets <- msigdb_download("Homo sapiens",category="H") %>% append( msigdb_download("Homo sapiens",category="C2",subcategory = "CP"))
genesets[["HIF_targets"]] = hif_targets

genesets_go <- msigdb_gsets("Homo sapiens", "C5", "GO:BP", clean=TRUE)

3 K selection plot

plot_path = paste0("/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/EGFR/Data/cnmf/cNMF_patients_Varnorm_Harmony_",suffix,"/cNMF_patients_Varnorm_Harmony_",suffix,".k_selection.png")
knitr::include_graphics(plot_path)
cnmf_obj.consensus(k=3, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=6, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=7, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=8, density_threshold=0.1,show_clustering=True)

4 gep scores for all NMF k’s

density_threshold = 0.1
usage_norm3, gep_scores3, gep_tpm3, topgenes = cnmf_obj.load_results(K=3, density_threshold=density_threshold)
usage_norm6, gep_scores6, gep_tpm6, topgenes = cnmf_obj.load_results(K=6, density_threshold=density_threshold)
usage_norm7, gep_scores7, gep_tpm7, topgenes = cnmf_obj.load_results(K=7, density_threshold=density_threshold)
usage_norm8, gep_scores8, gep_tpm8, topgenes = cnmf_obj.load_results(K=8, density_threshold=density_threshold)

5 gsea of each program

gep_scores3 = py$gep_scores3
gep_scores6 = py$gep_scores6
gep_scores7 = py$gep_scores7
gep_scores8 = py$gep_scores8

gep_tpm8 = py$gep_tpm8
top_genes = py$topgenes
  for (col in seq_along(gep_scores8)) {
     ranked_vec = gep_scores8[,col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE) 
     hyp_obj <- hypeR_fgsea(ranked_vec, genesets,up_only = T)
     print_tab(hyp_dots(hyp_obj,title = paste("program",col))+ aes(size=nes),title = paste0("gep",col))
  }

gep1

gep2

gep3

gep4

Warning in fgsea::fgseaMultilevel(stats = signature, pathways = gsets.obj$genesets, : There were 2 pathways for which P-values were not calculated properly due to unbalanced (positive and negative) gene-level statistic values. For such pathways pval, padj, NES, log2err are set to NA. You can try to increase the value of the argument nPermSimple (for example set it nPermSimple = 10000) ## gep5 {.unnumbered }

Warning in fgsea::fgseaMultilevel(stats = signature, pathways = gsets.obj$genesets, : There were 1 pathways for which P-values were not calculated properly due to unbalanced (positive and negative) gene-level statistic values. For such pathways pval, padj, NES, log2err are set to NA. You can try to increase the value of the argument nPermSimple (for example set it nPermSimple = 10000) ## gep6 {.unnumbered }

gep7

gep8

NA

programs_main_pathways = list(gep1 = 1:2, gep2 = 1:3,gep3 = 1:4)

6 Calculate usage by counts before Harmony

# get expression with genes in cnmf input
genes = rownames(gep_scores8)
lung_expression = t(as.matrix(GetAssayData(lung,slot='data'))) 
lung_expression = 2**lung_expression #convert from log2(tpm+1) to tpm
lung_expression = lung_expression-1
lung_expression = lung_expression[,genes] %>% as.data.frame()

all_0_genes = colnames(lung_expression)[colSums(lung_expression==0, na.rm=TRUE)==nrow(lung_expression)] #delete rows that have all 0
genes = genes[!genes %in% all_0_genes]
lung_expression = lung_expression[,!colnames(lung_expression) %in% all_0_genes]
gc(verbose = F)

lung_expression = r.lung_expression
genes = r.genes
usage_by_calc = get_usage_from_score(counts=lung_expression,tpm=lung_expression,genes=genes,cnmf_obj=cnmf_obj,k=8,sumTo1=True)
usage_by_calc = py$usage_by_calc
colnames(usage_by_calc) = c("autoimmune","TNFa.NFkB", "hypoxia","unknown2", "cell_cycle1", "cell_cycle2","INFg","unknown")
# colnames(usage_by_calc) = paste0("gep",1:8)
#add each metagene to metadata
for (i  in 1:ncol(usage_by_calc )) {
  metagene_metadata = usage_by_calc [,i,drop=F]
  lung = AddMetaData(object = lung,metadata = metagene_metadata,col.name = colnames(usage_by_calc)[i])
}

FeaturePlot(object = lung,features = colnames(usage_by_calc),ncol = 3)

7 Regulation

metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatment","on-treatment"),test = "wilcox.test",programs = colnames(usage_by_calc)[1:5],without_split = F)

autoimmune per patient

TNFa.NFkB per patient

hypoxia per patient

hypoxia2 per patient

cell_cycle1 per patient

NA

8 Program 2

col=1
ranked_vec = gep_scores8[, col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE)
print (paste("running gep",col))
hyp_obj <-hypeR_fgsea(ranked_vec, genesets_go, up_only = T)

print(hyp_dots(hyp_obj, title = paste("program", col), abrv = 70) + aes(size =nes))
  

pathway_num = 1
le_genes = hyp_obj$as.data.frame()[pathway_num,"le"] %>% strsplit(",") %>% unlist()
score = (lung_expression[, le_genes] %>% as.matrix())  %*% as.matrix(gep_scores8[le_genes, 2, drop =F])

# score = FetchData(object = lung,vars = le_genes) %>% rowMeans()
pathway_name = paste0(hyp_obj$data[pathway_num, "label", drop = F], "_le") %>% gsub(pattern = " ",replacement = "_")
lung = AddMetaData(lung, score, col.name = pathway_name)
print_tab(FeaturePlot(object = lung,features = pathway_name),title = pathway_name)
##   Positive_Regulation_Of_Transcription_By_Rna_Polymerase_Ii_le {.unnumbered }  

NA
cor(score,usage_by_calc[,2])
       [,1]
2 0.5433146
metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatmen
                                                                                                                                      t","on-treatment"),test = "wilcox.test",programs = pathway_name,without_split = F)

Positive_Regulation_Of_Transcription_By_Rna_Polymerase_Ii_le per patient

NA

9 programs LE genes


programs_main_pathways_names = list()
  for (col in seq_along(gep_scores8)[1:3]) {
     ranked_vec = gep_scores8[,col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE) 
     hyp_obj <- hypeR_fgsea(ranked_vec, genesets)
     programs_main_pathways_names[[col]] =  hyp_obj$data[programs_main_pathways[[col]],"label",drop=T]
     for (pathway_num in programs_main_pathways[[col]]) {
        le_genes =  hyp_obj$data[pathway_num,,drop=F] %>% pull("le") %>% strsplit(",") %>% unlist()
        # score = (lung_expression[,le_genes] %>% as.matrix())  %*% as.matrix(gep_scores8[le_genes,col,drop=F])
        # score = score %>% as.vector()
        score = FetchData(object = lung,vars = le_genes) %>% rowMeans()
        pathway_name = paste0(hyp_obj$data[pathway_num,"label",drop=F],"_le")
        lung=AddMetaData(lung,score,col.name = pathway_name)
        cor_res = cor(score,usage_by_calc[,col])
        print_tab(FeaturePlot(object = lung,features = pathway_name)+ggtitle(pathway_name, subtitle = paste("cor to usage:",cor_res)),title = pathway_name)
     }
  }

HALLMARK_INTERFERON_GAMMA_RESPONSE_le

HALLMARK_INTERFERON_ALPHA_RESPONSE_le

SA_FAS_SIGNALING_le

HALLMARK_TNFA_SIGNALING_VIA_NFKB_le

HALLMARK_P53_PATHWAY_le

HALLMARK_HYPOXIA_le

HALLMARK_GLYCOLYSIS_le

HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION_le

HIF_targets_le

NA

10 programs LE genes

metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatment","on-treatment"),test = "wilcox.test",programs = programs_main_pathways_names %>% unlist() %>% paste0("_le") ,without_split = F)

HALLMARK_INTERFERON_GAMMA_RESPONSE_le per patient

HALLMARK_INTERFERON_ALPHA_RESPONSE_le per patient

SA_FAS_SIGNALING_le per patient

HALLMARK_TNFA_SIGNALING_VIA_NFKB_le per patient

HALLMARK_P53_PATHWAY_le per patient

HALLMARK_HYPOXIA_le per patient

HALLMARK_GLYCOLYSIS_le per patient

HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION_le per patient

HIF_targets_le per patient

NA

---
title: '`r rstudioapi::getSourceEditorContext()$path %>% basename() %>% gsub(pattern = "\\.Rmd",replacement = "")`' 
author: "Avishai Wizel"
date: '`r Sys.time()`'
output: 
  html_notebook: 
    code_folding: hide
    toc: yes
    toc_collapse: yes
    toc_float: 
      collapsed: FALSE
    number_sections: true
    toc_depth: 1
---


# Data

```{r}
lung = readRDS("./Data/lung_cancercells_withTP_onlyPatients.rds")
lung_patients = lung$patient.ident %>% unique() %>% as.character()
lung_patients_filtered = lung_patients[!(lung_patients %in% c("X1055new","X1099"))] # remove patients with less than 100 malignant cells
lung = subset(x = lung,subset = patient.ident %in% lung_patients_filtered)
```


```{r}
suffix ="xeno_genes_0-5sigma_2-7theta_100iter_26_9"
```


```{python}
from cnmf import cNMF
suffix = r.suffix
import pickle
f = open('./Data/cnmf/cnmf_objects/patients_' + suffix + '_cnmf_obj.pckl', 'rb')
cnmf_obj = pickle.load(f)
f.close()
```



# Functions

```{r}
library(stringi)
library(reticulate)
source_from_github(repositoy = "DEG_functions",version = "0.2.47")
source_from_github(repositoy = "cNMF_functions",version = "0.4.02",script_name = "cnmf_functions_V3.R")
source_from_github(repositoy = "sc_general_functions",version = "0.1.28",script_name = "functions.R")
```

```{r}
genesets <- msigdb_download("Homo sapiens",category="H") %>% append( msigdb_download("Homo sapiens",category="C2",subcategory = "CP"))
genesets[["HIF_targets"]] = hif_targets

genesets_go <- msigdb_gsets("Homo sapiens", "C5", "GO:BP", clean=TRUE)

```

# K selection plot
```{r fig.height=4, fig.width=4}
plot_path = paste0("/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/EGFR/Data/cnmf/cNMF_patients_Varnorm_Harmony_",suffix,"/cNMF_patients_Varnorm_Harmony_",suffix,".k_selection.png")
knitr::include_graphics(plot_path)
```


```{python}
cnmf_obj.consensus(k=3, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=6, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=7, density_threshold=0.1,show_clustering=True)
cnmf_obj.consensus(k=8, density_threshold=0.1,show_clustering=True)
```


# gep scores for all NMF k's
```{python}
density_threshold = 0.1
usage_norm3, gep_scores3, gep_tpm3, topgenes = cnmf_obj.load_results(K=3, density_threshold=density_threshold)
usage_norm6, gep_scores6, gep_tpm6, topgenes = cnmf_obj.load_results(K=6, density_threshold=density_threshold)
usage_norm7, gep_scores7, gep_tpm7, topgenes = cnmf_obj.load_results(K=7, density_threshold=density_threshold)
usage_norm8, gep_scores8, gep_tpm8, topgenes = cnmf_obj.load_results(K=8, density_threshold=density_threshold)

```

# gsea of each program {.tabset}
```{r fig.height=8, fig.width=8, results='asis'}
gep_scores3 = py$gep_scores3
gep_scores6 = py$gep_scores6
gep_scores7 = py$gep_scores7
gep_scores8 = py$gep_scores8

gep_tpm8 = py$gep_tpm8
top_genes = py$topgenes
```

```{r results='asis'}
  for (col in seq_along(gep_scores8)) {
     ranked_vec = gep_scores8[,col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE) 
     hyp_obj <- hypeR_fgsea(ranked_vec, genesets,up_only = T)
     print_tab(hyp_dots(hyp_obj,title = paste("program",col))+ aes(size=nes),title = paste0("gep",col))
  }
```



```{r}
programs_main_pathways = list(gep1 = 1:2, gep2 = 1:3,gep3 = 1:4)
```

# Calculate usage by counts before Harmony
```{r echo=TRUE, results='asis'}
# get expression with genes in cnmf input
genes = rownames(gep_scores8)
lung_expression = t(as.matrix(GetAssayData(lung,slot='data'))) 
lung_expression = 2**lung_expression #convert from log2(tpm+1) to tpm
lung_expression = lung_expression-1
lung_expression = lung_expression[,genes] %>% as.data.frame()

all_0_genes = colnames(lung_expression)[colSums(lung_expression==0, na.rm=TRUE)==nrow(lung_expression)] #delete rows that have all 0
genes = genes[!genes %in% all_0_genes]
lung_expression = lung_expression[,!colnames(lung_expression) %in% all_0_genes]
gc(verbose = F)
```


```{python}

lung_expression = r.lung_expression
genes = r.genes
usage_by_calc = get_usage_from_score(counts=lung_expression,tpm=lung_expression,genes=genes,cnmf_obj=cnmf_obj,k=8,sumTo1=True)

```

```{r}
usage_by_calc = py$usage_by_calc
```

```{r}
colnames(usage_by_calc) = c("autoimmune","TNFa.NFkB", "hypoxia","unknown2", "cell_cycle1", "cell_cycle2","INFg","unknown")
```


```{r echo=TRUE, fig.height=9, fig.width=12, results='asis'}
# colnames(usage_by_calc) = paste0("gep",1:8)
#add each metagene to metadata
for (i  in 1:ncol(usage_by_calc )) {
  metagene_metadata = usage_by_calc [,i,drop=F]
  lung = AddMetaData(object = lung,metadata = metagene_metadata,col.name = colnames(usage_by_calc)[i])
}

FeaturePlot(object = lung,features = colnames(usage_by_calc),ncol = 3)

```
# Regulation {.tabset}

```{r results='asis'}
metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatment","on-treatment"),test = "wilcox.test",programs = colnames(usage_by_calc)[1:5],without_split = F)
```

# Program 2
```{r}
col=1
ranked_vec = gep_scores8[, col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE)
print (paste("running gep",col))
hyp_obj <-hypeR_fgsea(ranked_vec, genesets_go, up_only = T)

print(hyp_dots(hyp_obj, title = paste("program", col), abrv = 70) + aes(size =nes))
  
```


```{r}
col=2
ranked_vec = gep_scores8[, col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE)
print (paste("running gep",col))
hyp_obj <-hypeR_fgsea(ranked_vec, genesets_go, up_only = T)

print(hyp_dots(hyp_obj, title = paste("program", col), abrv = 70) + aes(size =nes))
  
```
```{r}
pathway_num = 1
le_genes = hyp_obj$as.data.frame()[pathway_num,"le"] %>% strsplit(",") %>% unlist()
score = (lung_expression[, le_genes] %>% as.matrix())  %*% as.matrix(gep_scores8[le_genes, 2, drop =F])

# score = FetchData(object = lung,vars = le_genes) %>% rowMeans()
pathway_name = paste0(hyp_obj$data[pathway_num, "label", drop = F], "_le") %>% gsub(pattern = " ",replacement = "_")
lung = AddMetaData(lung, score, col.name = pathway_name)
print_tab(FeaturePlot(object = lung,features = pathway_name),title = pathway_name)
```
```{r}
pathway_num = 1
le_genes = hyp_obj$as.data.frame()[pathway_num,"le"] %>% strsplit(",") %>% unlist()
score = (lung_expression[, le_genes[ le_genes %in% top_genes[1:200]]] %>% as.matrix())  %*% as.matrix(gep_scores8[le_genes[ le_genes %in% top_genes[1:200]], 2, drop =F])
cor(score,usage_by_calc[,2])
score = FetchData(object = lung,vars = le_genes[ le_genes %in% top_genes[1:200]]) %>% rowMeans()
cor(score,usage_by_calc[,2])

        
top_genes = gep_scores8[,2] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE) %>% names()
score = (lung_expression[, top_genes[1:200]] %>% as.matrix())  %*% as.matrix(gep_scores8[top_genes[1:200], 2, drop =F])
 top_genes %in%  le_genes %>% which()
cor(score,usage_by_calc[,2])

pathway_name = paste0(hyp_obj$data[pathway_num, "label", drop = F], "_le") %>% gsub(pattern = " ",replacement = "_")
lung = AddMetaData(lung, score, col.name = pathway_name)
print_tab(FeaturePlot(object = lung,features = pathway_name),title = pathway_name)
```

```{r results='asis'}
metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatmen
                                                                                                                                      t","on-treatment"),test = "wilcox.test",programs = pathway_name,without_split = F)
```


# programs LE genes {.tabset}



```{r results='asis'}

programs_main_pathways_names = list()
  for (col in seq_along(gep_scores8)[1:3]) {
     ranked_vec = gep_scores8[,col] %>% setNames(rownames(gep_scores8)) %>% sort(decreasing = TRUE) 
     hyp_obj <- hypeR_fgsea(ranked_vec, genesets)
     programs_main_pathways_names[[col]] =  hyp_obj$data[programs_main_pathways[[col]],"label",drop=T]
     for (pathway_num in programs_main_pathways[[col]]) {
        le_genes =  hyp_obj$data[pathway_num,,drop=F] %>% pull("le") %>% strsplit(",") %>% unlist()
        # score = (lung_expression[,le_genes] %>% as.matrix())  %*% as.matrix(gep_scores8[le_genes,col,drop=F])
        # score = score %>% as.vector()
        score = FetchData(object = lung,vars = le_genes) %>% rowMeans()
        pathway_name = paste0(hyp_obj$data[pathway_num,"label",drop=F],"_le")
        lung=AddMetaData(lung,score,col.name = pathway_name)
        cor_res = cor(score,usage_by_calc[,col])
        print_tab(FeaturePlot(object = lung,features = pathway_name)+ggtitle(pathway_name, subtitle = paste("cor to usage:",cor_res)),title = pathway_name)
     }
  }
```
# programs LE genes {.tabset}

```{r results='asis'}
metagenes_mean_compare(dataset = lung,time.point_var = "time.point",prefix = "patient",patient.ident_var = "patient.ident",pre_on = c("pre-treatment","on-treatment"),test = "wilcox.test",programs = programs_main_pathways_names %>% unlist() %>% paste0("_le") ,without_split = F)
```
<script src="https://hypothes.is/embed.js" async></script>


