# print params
paged_table(as.data.frame(unlist(params)))

1 Data

getwd()
[1] "/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/HMSC/Notebooks/TCGA"
library(TCGAbiolinks)
library(biomaRt)
library(kableExtra)
library(rmarkdown)
#source("./Notebooks/TCGA/TCGA_functions.R")
getwd()
[1] "/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/HMSC/Notebooks/TCGA"
avg_diff_cutoff = 0.4
fdr_cutoff = 0.1
signature = opscc_deg %>% filter(avg_diff > avg_diff_cutoff &
                                  fdr < fdr_cutoff) %>% rownames()
signature
 [1] "HPV16-E5" "HPV16-E1" "H2AFZ"    "STMN1"    "CKS1B"    "TUBA1B"   "PTN"      "FBL"      "ALDH3A1"  "KRT5"    
[11] "RPL37"    "HMGB2"    "PTTG1"    "DST"      "IGFBP2"   "KRT15"    "RPS16"    "ATP1B3"   "PDLIM1"   "HSPA1A"  
[21] "S100A2"   "IGKC"     "HSPB1"   
project_name = "TCGA-HNSC"
HNSC_tpm = readRDS("./Data_out/TCGA_survival_analysis/TCGA_HNSC_TPM.RDS")
Warning in gzfile(file, "rb") :
  cannot open compressed file './Data_out/TCGA_survival_analysis/TCGA_HNSC_TPM.RDS', probable reason 'No such file or directory'
Error in gzfile(file, "rb") : cannot open the connection
clinical_data <- GDCquery_clinic(project_name, "clinical")
OPSCC_tissues = c("Base of tongue, NOS",
"Oropharynx, NOS",
"Posterior wall of oropharynx",
"Tonsil, NOS")
clinical_data = clinical_data[clinical_data$tissue_or_organ_of_origin %in% OPSCC_tissues,]
cbp_data = read_tsv(file = "./Input_data/TCGA/cbioportal_hnsc_tcga_pan_can_atlas_2018_clinical_data.tsv")
Error: './Input_data/TCGA/cbioportal_hnsc_tcga_pan_can_atlas_2018_clinical_data.tsv' does not exist in current working directory ('/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/HMSC/Notebooks/TCGA').
# load data 
library("readxl")
library(stringr)
library(ggpubr)
TCGA_HPV_data <- read_excel("./Input_data/TCGA/PMC3806554_ncomms3513-s2.xlsx",skip = 3)
TCGA_HPV_data  %<>% filter(`Tumor type` == "Primary solid")
TCGA_HPV_data$submitter_id = TCGA_HPV_data$`Sample barcode` %>% str_sub( start = 1, end = 12) 
TCGA_HPV_data = TCGA_HPV_data %>%
    filter(Cancer == "HNSC") %>%
    mutate (hpv_status = 
            if_else(
              condition = ppm == "N/A",
              true = "HPV-",
              false =  "HPV+"))

clinical_data = inner_join(x = clinical_data,y = TCGA_HPV_data[,c("submitter_id","hpv_status")],by = c("submitter_id"="submitter_id"))

2 MYB

gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
paged_table(gene_status)
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method    =T
)

p[[1]]/p[[2]]
Warning in grSoftVersion() :
  unable to load shared object '/usr/local/lib/R/modules//R_X11.so':
  libXt.so.6: cannot open shared object file: No such file or directory

3 HPV+ signature


p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "hpv_status",
    filename = NULL,pval.method = T
)

p[[1]]/p[[2]]

4 MYB - HPV correlation

# load data 
library("readxl")
library(stringr)
library(ggpubr)
TCGA_HPV_data <- read_excel("./Input_data/TCGA/PMC3806554_ncomms3513-s5.xlsx",skip = 2)
TCGA_HPV_data$submitter_id = TCGA_HPV_data$`Sample Barcode` %>% str_sub( start = 1, end = 12) 
rownames(TCGA_HPV_data) = TCGA_HPV_data$submitter_id
TCGA_HPV_data = TCGA_HPV_data %>% filter(Cancer == "CESC")

gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
gene_status = gene_status[TCGA_HPV_data$submitter_id,] %>%  cbind(HPV_PPM = TCGA_HPV_data$ppm) %>% na.omit()

paged_table(gene_status)


sp <- ggscatter(gene_status, x = "MYB", y = "HPV_PPM",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson")

5 HPV signature - HPV correlation

# with HPV signature
gene_status = set_clinical_data(clin_data = clinical_data,genes = signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
gene_status = gene_status[TCGA_HPV_data$submitter_id,] %>% cbind(HPV_PPM = TCGA_HPV_data$ppm) %>% na.omit()
paged_table(gene_status)

sp <- ggscatter(gene_status, x = "HPV+ signature", y = "HPV_PPM",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson")

6 MYB- HPV T test


gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
gene_status$patient = rownames(gene_status)
gene_status = inner_join(x = gene_status,y = clinical_data[,c("submitter_id","hpv_status")],by = c("patient"="submitter_id"))

paged_table(gene_status)
library(ggpubr)
library(rstatix)
stat.test <- gene_status %>%
  t_test(MYB ~ hpv_status) %>%
  add_significance()
stat.test



bxp <- ggboxplot(gene_status, x = "hpv_status", y = "MYB", fill = "#00AFBB")+
  geom_jitter()
stat.test <- stat.test %>% add_xy_position(x = "hpv_status")
bxp + 
  stat_pvalue_manual(stat.test,  label = "T-test, p = {p}") 

7 MYB and HPV survival plot

7.1 HPV + tumors

clinical_data_with_scores = clinical_data  %>% filter(hpv_status == "HPV+")

gene_status = set_clinical_data(clin_data = clinical_data_with_scores,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = "M")
paged_table(gene_status)
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = inner_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))
# debugonce(TCGAanalyze_survival)

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    filename = NULL,pval.method = T
)
p[[1]]/p[[2]]

7.2 HPV - tumors

clinical_data_with_scores = clinical_data  %>% filter(hpv_status == "HPV-")

gene_status = set_clinical_data(clin_data = clinical_data_with_scores,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
paged_table(gene_status)
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = inner_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    filename = NULL,pval.method = T
)
p[[1]]/p[[2]]

8 shared signature

shared_signature = readRDS(file = "./Data_out/temp/shared_signature.RDS")
gene_status = set_clinical_data(clin_data = clinical_data,genes = shared_signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
paged_table(gene_status)
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method = T
)

p[[1]]/p[[2]]

9 HMSC signature

# load
hmsc_deg = read.table(file = "./Data_out/HPV_analysis/hpv_deg_df.csv",row.names = 1)
  avg_diff_cutoff = 1
  fdr_cutoff = 0.1
  signature = hmsc_deg %>% filter(avg_diff>avg_diff_cutoff & fdr<fdr_cutoff) %>% rownames()
  signature
gene_status = set_clinical_data(clin_data = clinical_data,genes = signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
paged_table(gene_status)
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method = T
)

p[[1]]/p[[2]]
---
title: '`r rstudioapi::getSourceEditorContext()$path %>% basename() %>% gsub(pattern = "\\.Rmd",replacement = "")`' 
author: "Avishai Wizel"
date: '`r Sys.time()`'
output: 
  html_notebook: 
    code_folding: hide
    toc: yes
    toc_collapse: yes
    toc_float: 
      collapsed: TRUE
    number_sections: true
    toc_depth: 2
    
  html_document: 
    code_folding: hide
    toc: yes
    toc_collapse: yes
    toc_float: 
      collapsed: TRUE
    number_sections: true
    toc_depth: 2
    df-print: paged

params:
  stratify: "M"
  log_tpm: TRUE
  data_out_dir: NULL
  figs_out_dir: NULL
---


```{r}
# print params
paged_table(as.data.frame(unlist(params)))
```

# Data

```{r}

library(TCGAbiolinks)
library(biomaRt)
library(kableExtra)
library(rmarkdown)
source("./Notebooks/TCGA/TCGA_functions.R")
```



```{r}
# find hpv signature genes
opscc_deg = readRDS(file = "./Data_out/opscc_deg.rds")

avg_diff_cutoff = 0.4
fdr_cutoff = 0.1
signature = opscc_deg %>% filter(avg_diff > avg_diff_cutoff &
                                  fdr < fdr_cutoff) %>% rownames()
signature
  
```

```{r}
project_name = "TCGA-HNSC"
```

```{r}
HNSC_tpm = readRDS("./Data_out/TCGA_survival_analysis/TCGA_HNSC_TPM.RDS")

if (params$log_tpm) {
  HNSC_tpm = log2(HNSC_tpm+1)
  
}
```

```{r}
clinical_data <- GDCquery_clinic(project_name, "clinical")
OPSCC_tissues = c("Base of tongue, NOS",
"Oropharynx, NOS",
"Posterior wall of oropharynx",
"Tonsil, NOS")
clinical_data = clinical_data[clinical_data$tissue_or_organ_of_origin %in% OPSCC_tissues,]
```

```{r}
# add HPV data by clinical data
cbp_data = read_tsv(file = "./Input_data/TCGA/cbioportal_hnsc_tcga_pan_can_atlas_2018_clinical_data.tsv")
cbp_data = cbp_data[,c("Patient ID", "Subtype")] %>% dplyr::rename(hpv_status = Subtype) %>% 
  mutate(hpv_status = gsub(,x = hpv_status,pattern = "HNSC_HPV",replacement = "HPV"))
clinical_data = inner_join(x = clinical_data,y = cbp_data,by = c("submitter_id"="Patient ID"))
clinical_data  = clinical_data[ !is.na(clinical_data$hpv_status), ]
```

```{r}
# load data 
library("readxl")
library(stringr)
library(ggpubr)
TCGA_HPV_data <- read_excel("./Input_data/TCGA/PMC3806554_ncomms3513-s2.xlsx",skip = 3)
TCGA_HPV_data  %<>% filter(`Tumor type` == "Primary solid")
TCGA_HPV_data$submitter_id = TCGA_HPV_data$`Sample barcode` %>% str_sub( start = 1, end = 12) 
TCGA_HPV_data = TCGA_HPV_data %>%
    filter(Cancer == "HNSC") %>%
    mutate (hpv_status = 
            if_else(
              condition = ppm == "N/A",
              true = "HPV-",
              false =  "HPV+"))

clinical_data = inner_join(x = clinical_data,y = TCGA_HPV_data[,c("submitter_id","hpv_status")],by = c("submitter_id"="submitter_id"))

```


# MYB
```{r}
gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
paged_table(gene_status)
```





```{r fig.height=6, fig.width=8}
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method	=T
)

p[[1]]/p[[2]]
```

# HPV+ signature





```{r fig.height=6, fig.width=8}

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "hpv_status",
    filename = NULL,pval.method = T
)

p[[1]]/p[[2]]
```



# MYB - HPV correlation

```{r}
# load data 
library("readxl")
library(stringr)
library(ggpubr)
TCGA_HPV_data <- read_excel("./Input_data/TCGA/PMC3806554_ncomms3513-s5.xlsx",skip = 2)
TCGA_HPV_data$submitter_id = TCGA_HPV_data$`Sample Barcode` %>% str_sub( start = 1, end = 12) 
rownames(TCGA_HPV_data) = TCGA_HPV_data$submitter_id
TCGA_HPV_data = TCGA_HPV_data %>% filter(Cancer == "CESC")
```

```{r}

gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
gene_status = gene_status[TCGA_HPV_data$submitter_id,] %>%  cbind(HPV_PPM = TCGA_HPV_data$ppm) %>% na.omit()

paged_table(gene_status)


sp <- ggscatter(gene_status, x = "MYB", y = "HPV_PPM",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson")

```

# HPV signature - HPV correlation

```{r}
# with HPV signature
gene_status = set_clinical_data(clin_data = clinical_data,genes = signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
gene_status = gene_status[TCGA_HPV_data$submitter_id,] %>% cbind(HPV_PPM = TCGA_HPV_data$ppm) %>% na.omit()
paged_table(gene_status)

sp <- ggscatter(gene_status, x = "HPV+ signature", y = "HPV_PPM",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson")


```

# MYB- HPV T test

```{r}
# create data

gene_status = set_clinical_data(clin_data = clinical_data,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
gene_status$patient = rownames(gene_status)
gene_status = inner_join(x = gene_status,y = clinical_data[,c("submitter_id","hpv_status")],by = c("patient"="submitter_id"))

paged_table(gene_status)



```

```{r}
# plot
library(ggpubr)
library(rstatix)
stat.test <- gene_status %>%
  t_test(MYB ~ hpv_status) %>%
  add_significance()
stat.test


bxp <- ggboxplot(gene_status, x = "hpv_status", y = "MYB", fill = "#00AFBB")+
  geom_jitter()
stat.test <- stat.test %>% add_xy_position(x = "hpv_status")
bxp + 
  stat_pvalue_manual(stat.test,  label = "T-test, p = {p}") 

```

# MYB and HPV survival plot
## HPV + tumors
```{r}
clinical_data_with_scores = clinical_data  %>% filter(hpv_status == "HPV+")

gene_status = set_clinical_data(clin_data = clinical_data_with_scores,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = "M")
paged_table(gene_status)
```


```{r fig.height=6, fig.width=8}
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = inner_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))
# debugonce(TCGAanalyze_survival)

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    filename = NULL,pval.method = T
)
p[[1]]/p[[2]]
```

## HPV - tumors
```{r}
clinical_data_with_scores = clinical_data  %>% filter(hpv_status == "HPV-")

gene_status = set_clinical_data(clin_data = clinical_data_with_scores,genes = "MYB",tpm_data_frame = HNSC_tpm,stratify = params$stratify)
paged_table(gene_status)
```


```{r fig.height=6, fig.width=8}
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = inner_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    filename = NULL,pval.method = T
)
p[[1]]/p[[2]]
```
# shared signature
```{r}
shared_signature = readRDS(file = "./Data_out/temp/shared_signature.RDS")
```

```{r}
gene_status = set_clinical_data(clin_data = clinical_data,genes = shared_signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
paged_table(gene_status)
```





```{r fig.height=6, fig.width=8}
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method = T
)

p[[1]]/p[[2]]
```

# HMSC signature

```{r}
# load
hmsc_deg = read.table(file = "./Data_out/HPV_analysis/hpv_deg_df.csv",row.names = 1)
  avg_diff_cutoff = 1
  fdr_cutoff = 0.1
  signature = hmsc_deg %>% filter(avg_diff>avg_diff_cutoff & fdr<fdr_cutoff) %>% rownames()
  signature

```

```{r}
gene_status = set_clinical_data(clin_data = clinical_data,genes = signature,tpm_data_frame = HNSC_tpm,stratify = params$stratify,signature_name = "HPV+ signature")
paged_table(gene_status)
```





```{r fig.height=6, fig.width=8}
clinical_data_with_scores = clinical_data[clinical_data$submitter_id %in% rownames(gene_status),]
gene_status$patient = rownames(gene_status)
clinical_data_with_scores = left_join(x = clinical_data_with_scores,y = gene_status, by = c("submitter_id"="patient"))

p = TCGAanalyze_survival(
    data = clinical_data_with_scores,
    clusterCol = "gene_status",
    main = "TCGA Set\n GBM",
    height = 10,
    width=10,filename = NULL,pval.method = T
)

p[[1]]/p[[2]]
```