rm(list = ls()) 
library(cgdsr)
## Please send questions to cbioportal@googlegroups.com
library(DT)
##Get list of cancer studies at server
mycgds <- CGDS("http://www.cbioportal.org/")
test(mycgds)
## getCancerStudies...  OK
## getCaseLists (1/2) ...  OK
## getCaseLists (2/2) ...  OK
## getGeneticProfiles (1/2) ...  OK
## getGeneticProfiles (2/2) ...  OK
## getClinicalData (1/1) ...  OK
## getProfileData (1/6) ...  OK
## getProfileData (2/6) ...  OK
## getProfileData (3/6) ...  OK
## getProfileData (4/6) ...  OK
## getProfileData (5/6) ...  OK
## getProfileData (6/6) ...  OK
all_TCGA_studies <- getCancerStudies(mycgds)
DT::datatable(all_TCGA_studies)
#install.packages("xfun")
#same to http://www.cbioportal.org/datasets
##search the dataset you are intested in
hnscc2018 <- "hnsc_tcga_pan_can_atlas_2018"
############################case_list
getCaseLists(mycgds,hnscc2018)[,c(1,2)]
##                                   case_list_id
## 1             hnsc_tcga_pan_can_atlas_2018_all
## 2   hnsc_tcga_pan_can_atlas_2018_3way_complete
## 3             hnsc_tcga_pan_can_atlas_2018_cna
## 4         hnsc_tcga_pan_can_atlas_2018_log2CNA
## 5      hnsc_tcga_pan_can_atlas_2018_microbiome
## 6 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 7          hnsc_tcga_pan_can_atlas_2018_cnaseq
## 8       hnsc_tcga_pan_can_atlas_2018_sequenced
## 9            hnsc_tcga_pan_can_atlas_2018_rppa
##                        case_list_name
## 1                         All samples
## 2                    Complete samples
## 3               Samples with CNA data
## 4  Samples with log2 copy-number data
## 5        Samples with microbiome data
## 6 Samples with mRNA data (RNA Seq V2)
## 7  Samples with mutation and CNA data
## 8          Samples with mutation data
## 9    Samples with protein data (RPPA)
dim(getCaseLists(mycgds,hnscc2018)) #9 5
## [1] 9 5
#View(head(getCaseLists(mycgds,hnscc2018),2))
#############################gene_list
getGeneticProfiles(mycgds,hnscc2018)[,c(1,2)]
##                                                                   genetic_profile_id
## 1                                                  hnsc_tcga_pan_can_atlas_2018_rppa
## 2                                          hnsc_tcga_pan_can_atlas_2018_rppa_Zscores
## 3                                                hnsc_tcga_pan_can_atlas_2018_gistic
## 4                                       hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 5                        hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_Zscores
## 6                                               hnsc_tcga_pan_can_atlas_2018_log2CNA
## 7                                             hnsc_tcga_pan_can_atlas_2018_mutations
## 8                                                hnsc_tcga_pan_can_atlas_2018_fusion
## 9             hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores
## 10                                 hnsc_tcga_pan_can_atlas_2018_microbiome_signature
## 11 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores
##                                                         genetic_profile_name
## 1                                                  Protein expression (RPPA)
## 2                                         Protein expression z-scores (RPPA)
## 3                               Putative copy-number alterations from GISTIC
## 4      mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)
## 5     mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM)
## 6                                                    Log2 copy-number values
## 7                                                                  Mutations
## 8                                                                    Fusions
## 9     mRNA expression z-scores relative to all samples (log RNA Seq V2 RSEM)
## 10                                   Microbiome Signatures (log RNA Seq CPM)
## 11 mRNA expression z-scores relative to normal samples (log RNA Seq V2 RSEM)
dim(getGeneticProfiles(mycgds,hnscc2018)) #11 6
## [1] 11  6
##choose a gene list you are interested in
choose_genes <- c("ACHE")
mycaselist <- getCaseLists(mycgds,hnscc2018)[grep("mrna",getCaseLists(mycgds,hnscc2018)[,1]),1]
mycaselist 
## [1] "hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna"
mygeneticprofile <- getGeneticProfiles(mycgds,hnscc2018)[grep("rna_seq_v2_mrna$",getGeneticProfiles(mycgds,hnscc2018)[,1]),1]
getGeneticProfiles(mycgds,hnscc2018)[,c(1:2)]
##                                                                   genetic_profile_id
## 1                                                  hnsc_tcga_pan_can_atlas_2018_rppa
## 2                                          hnsc_tcga_pan_can_atlas_2018_rppa_Zscores
## 3                                                hnsc_tcga_pan_can_atlas_2018_gistic
## 4                                       hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 5                        hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_Zscores
## 6                                               hnsc_tcga_pan_can_atlas_2018_log2CNA
## 7                                             hnsc_tcga_pan_can_atlas_2018_mutations
## 8                                                hnsc_tcga_pan_can_atlas_2018_fusion
## 9             hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores
## 10                                 hnsc_tcga_pan_can_atlas_2018_microbiome_signature
## 11 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores
##                                                         genetic_profile_name
## 1                                                  Protein expression (RPPA)
## 2                                         Protein expression z-scores (RPPA)
## 3                               Putative copy-number alterations from GISTIC
## 4      mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)
## 5     mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM)
## 6                                                    Log2 copy-number values
## 7                                                                  Mutations
## 8                                                                    Fusions
## 9     mRNA expression z-scores relative to all samples (log RNA Seq V2 RSEM)
## 10                                   Microbiome Signatures (log RNA Seq CPM)
## 11 mRNA expression z-scores relative to normal samples (log RNA Seq V2 RSEM)
mygeneticprofile
## [1] "hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna"
#View(getGeneticProfiles(mycgds,hnscc2018))
mygeneticprofile
## [1] "hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna"
expr <- getProfileData(mycgds,choose_genes,
                         mygeneticprofile,mycaselist)
#head(expr)
#View(expr)
DT::datatable(expr)
dim(expr) #[1] 515   2
## [1] 515   1
getCaseLists(mycgds,hnscc2018)[,c(1,2)]
##                                   case_list_id
## 1             hnsc_tcga_pan_can_atlas_2018_all
## 2   hnsc_tcga_pan_can_atlas_2018_3way_complete
## 3             hnsc_tcga_pan_can_atlas_2018_cna
## 4         hnsc_tcga_pan_can_atlas_2018_log2CNA
## 5      hnsc_tcga_pan_can_atlas_2018_microbiome
## 6 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 7          hnsc_tcga_pan_can_atlas_2018_cnaseq
## 8       hnsc_tcga_pan_can_atlas_2018_sequenced
## 9            hnsc_tcga_pan_can_atlas_2018_rppa
##                        case_list_name
## 1                         All samples
## 2                    Complete samples
## 3               Samples with CNA data
## 4  Samples with log2 copy-number data
## 5        Samples with microbiome data
## 6 Samples with mRNA data (RNA Seq V2)
## 7  Samples with mutation and CNA data
## 8          Samples with mutation data
## 9    Samples with protein data (RPPA)
mycaselist_m <- getCaseLists(mycgds,hnscc2018)[8,1] 
mycaselist_m
## [1] "hnsc_tcga_pan_can_atlas_2018_sequenced"
getGeneticProfiles(mycgds,hnscc2018)[, c(1,2)]
##                                                                   genetic_profile_id
## 1                                                  hnsc_tcga_pan_can_atlas_2018_rppa
## 2                                          hnsc_tcga_pan_can_atlas_2018_rppa_Zscores
## 3                                                hnsc_tcga_pan_can_atlas_2018_gistic
## 4                                       hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 5                        hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_Zscores
## 6                                               hnsc_tcga_pan_can_atlas_2018_log2CNA
## 7                                             hnsc_tcga_pan_can_atlas_2018_mutations
## 8                                                hnsc_tcga_pan_can_atlas_2018_fusion
## 9             hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores
## 10                                 hnsc_tcga_pan_can_atlas_2018_microbiome_signature
## 11 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores
##                                                         genetic_profile_name
## 1                                                  Protein expression (RPPA)
## 2                                         Protein expression z-scores (RPPA)
## 3                               Putative copy-number alterations from GISTIC
## 4      mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)
## 5     mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM)
## 6                                                    Log2 copy-number values
## 7                                                                  Mutations
## 8                                                                    Fusions
## 9     mRNA expression z-scores relative to all samples (log RNA Seq V2 RSEM)
## 10                                   Microbiome Signatures (log RNA Seq CPM)
## 11 mRNA expression z-scores relative to normal samples (log RNA Seq V2 RSEM)
mygeneticprofile_m <- getGeneticProfiles(mycgds,hnscc2018)[7,1]
mygeneticprofile_m
## [1] "hnsc_tcga_pan_can_atlas_2018_mutations"
DT::datatable(getGeneticProfiles(mycgds,hnscc2018))
mut_df <- getProfileData(mycgds,choose_genes,
                         mygeneticprofile_m,mycaselist_m)
#View(mut_df)
#head(mut_df)
DT::datatable(mut_df)
dim(mut_df) #[1] 515   2
## [1] 515   1
length(intersect(row.names(expr),row.names(mut_df)))
## [1] 507
mut_df <- apply(mut_df,2,as.factor)
##remove “NAN”
mut_df[mut_df == "NaN"] = ""
##remove“NA”
mut_df[is.na(mut_df)] = ""
mut_df[mut_df != ''] = "MUT"
dim(mut_df) #[1] 515   2
## [1] 515   1
head(mut_df)
##                 ACHE
## TCGA.4P.AA8J.01 ""  
## TCGA.BA.4074.01 ""  
## TCGA.BA.4076.01 ""  
## TCGA.BA.4078.01 ""  
## TCGA.BA.5149.01 ""  
## TCGA.BA.5151.01 ""
mycaselist_c <- getCaseLists(mycgds,hnscc2018)[3,1] 
mycaselist_c
## [1] "hnsc_tcga_pan_can_atlas_2018_cna"
mygeneticprofile_c <- getGeneticProfiles(mycgds,hnscc2018)[3,1]
getGeneticProfiles(mycgds,hnscc2018)[,c(1,2)]
##                                                                   genetic_profile_id
## 1                                                  hnsc_tcga_pan_can_atlas_2018_rppa
## 2                                          hnsc_tcga_pan_can_atlas_2018_rppa_Zscores
## 3                                                hnsc_tcga_pan_can_atlas_2018_gistic
## 4                                       hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
## 5                        hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_Zscores
## 6                                               hnsc_tcga_pan_can_atlas_2018_log2CNA
## 7                                             hnsc_tcga_pan_can_atlas_2018_mutations
## 8                                                hnsc_tcga_pan_can_atlas_2018_fusion
## 9             hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores
## 10                                 hnsc_tcga_pan_can_atlas_2018_microbiome_signature
## 11 hnsc_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores
##                                                         genetic_profile_name
## 1                                                  Protein expression (RPPA)
## 2                                         Protein expression z-scores (RPPA)
## 3                               Putative copy-number alterations from GISTIC
## 4      mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)
## 5     mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM)
## 6                                                    Log2 copy-number values
## 7                                                                  Mutations
## 8                                                                    Fusions
## 9     mRNA expression z-scores relative to all samples (log RNA Seq V2 RSEM)
## 10                                   Microbiome Signatures (log RNA Seq CPM)
## 11 mRNA expression z-scores relative to normal samples (log RNA Seq V2 RSEM)
mygeneticprofile_c
## [1] "hnsc_tcga_pan_can_atlas_2018_gistic"
cna <- getProfileData(mycgds,choose_genes,
                      mygeneticprofile_c,mycaselist_c)
DT::datatable(cna)
##deal with cna
rn <- rownames(cna)
cna <- apply(cna,2,function(x)
  as.character(factor(x, levels = c(-2:2), 
                      labels = c("HOMDEL", "HETLOSS", "DIPLOID", "GAIN", "AMP"))))
cna[is.na(cna)] = ""
cna[cna == 'DIPLOID'] = ""
rownames(cna)=rn
DT::datatable(cna)
mycaselist_cli <- getCaseLists(mycgds,hnscc2018)[1,1] 
mycaselist_cli
## [1] "hnsc_tcga_pan_can_atlas_2018_all"
myclinicaldata <- getClinicalData(mycgds,mycaselist_cli)
DT::datatable(myclinicaldata)
##############################choose a gene list you are interested in
#10 here
choose_columns=c('AGE','AJCC_PATHOLOGIC_TUMOR_STAGE','DFS_MONTHS','DFS_STATUS',
                  'GRADE','NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT',
                  'OS_STATUS','OS_MONTHS','RADIATION_THERAPY','SEX')
choose_clinicaldata <- myclinicaldata[,choose_columns]
dim(choose_clinicaldata) #[1] 523  10
## [1] 523  10
DT::datatable(choose_clinicaldata)
##############################survival analysis
library(survival)
dat <- choose_clinicaldata[choose_clinicaldata$OS_MONTHS >0,]
table(dat$OS_STATUS)
## 
##   0:LIVING 1:DECEASED 
##        303        219
DT::datatable(dat)
#create the survival object, 'DECEASED' means 'occurrence'(估计KM生存曲线)
my.surv <- Surv(dat$OS_MONTHS,dat$OS_STATUS =='1:DECEASED') 
dim(my.surv)
## [1] 523   2
head(my.surv)
## [1] 55.856922+ 14.465595+ 12.690272+  9.599895  14.662853  89.325048
kmfit1 <- survfit(my.surv ~ 1) 
library("survminer")
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'tibble':
##   method     from  
##   format.tbl pillar
##   print.tbl  pillar
## Loading required package: ggpubr
ggsurvplot(kmfit1,data = dat)

my.surv <- Surv(dat$OS_MONTHS,dat$OS_STATUS=='1:DECEASED') 
kmfit2 <- survfit(my.surv~dat$SEX)  #这里只需要将上面的1换成dat$SEX
ggsurvplot(kmfit2,data = dat)

survdiff(my.surv~dat$SEX) #sig
## Call:
## survdiff(formula = my.surv ~ dat$SEX)
## 
## n=522, 1 observation deleted due to missingness.
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## dat$SEX=Female 141       71     56.3      3.86      5.25
## dat$SEX=Male   381      148    162.7      1.34      5.25
## 
##  Chisq= 5.2  on 1 degrees of freedom, p= 0.02
ggsurvplot(kmfit2,data = dat,pval=T)

kmfit3 <- survfit(my.surv~dat$AJCC_PATHOLOGIC_TUMOR_STAGE)
ggsurvplot(kmfit3,data = dat,pval=T)

length(intersect(rownames(choose_clinicaldata),rownames(mut_df)))
## [1] 515
dat2 <- cbind(choose_clinicaldata[rownames(mut_df),c('OS_STATUS','OS_MONTHS')],mut_df)
dat2 <- dat2[dat2$OS_MONTHS > 0,]
dat2 <- dat2[!is.na(dat$OS_STATUS),]
dat2$OS_STATUS <- as.character(dat2$OS_STATUS) 
attach(dat2)
DT::datatable(dat2)
###
my.surv <- Surv(OS_MONTHS,OS_STATUS=='1:DECEASED')
kmfit4 <- survfit(my.surv~ACHE,data = dat2)  
library("survminer") 
ggsurvplot(kmfit4,conf.int =F, pval = T,risk.table =T, ncensor.plot = TRUE,
             title="Survival plot by ACHE Mutaiton")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

detach(dat2)
length(intersect(rownames(choose_clinicaldata),rownames(cna)))
## [1] 517
dat <- cbind(choose_clinicaldata[rownames(cna),c('OS_STATUS','OS_MONTHS')],cna)
dat <- dat[dat$OS_MONTHS > 0,]
dat <- dat[!is.na(dat$OS_STATUS),]
dat$OS_STATUS <- as.character(dat$OS_STATUS) 
attach(dat)
DT::datatable(dat)
my.surv <- Surv(OS_MONTHS,OS_STATUS=='1:DECEASED')
kmfit7 <- survfit(my.surv~ACHE,data = dat)  
ggsurvplot(kmfit7,conf.int =F, pval = T,risk.table =T, ncensor.plot = TRUE,
             title="Survival plot by CNA")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

library(survival)
expr$ACHE_1 <- expr$ACHE
dat3=cbind(choose_clinicaldata[,c('OS_STATUS','OS_MONTHS')],expr[rownames(choose_clinicaldata),])
dat3=dat3[dat3$OS_MONTHS > 0,]
dat3=dat3[!is.na(dat3$OS_STATUS),]
dat3$OS_STATUS=as.character(dat3$OS_STATUS)
DT::datatable(dat3)
head(dat3)
##                  OS_STATUS OS_MONTHS     ACHE   ACHE_1
## TCGA.CN.4723.01   0:LIVING 55.856922  30.7574  30.7574
## TCGA.BA.A6DE.01   0:LIVING 14.465595 104.7290 104.7290
## TCGA.C9.A480.01   0:LIVING 12.690272  88.2463  88.2463
## TCGA.CV.6441.01 1:DECEASED  9.599895 557.2100 557.2100
## TCGA.HD.8224.01 1:DECEASED 14.662853 743.6500 743.6500
## TCGA.CV.7411.01 1:DECEASED 89.325048   7.9702   7.9702
library(ggpubr)
p <- ggboxplot(dat3, x="OS_STATUS", y="ACHE", color = "OS_STATUS",
                 palette = "jco", add = "jitter")
p+stat_compare_means(method = "t.test")
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_compare_means).
## Warning: Removed 8 rows containing missing values (geom_point).

dat3=dat3[!is.na(dat3$ACHE),]
dat3$ACHE = ifelse(dat3$ACHE > median(dat3$ACHE),'high','low')
attach(dat3)
## The following objects are masked from dat:
## 
##     ACHE, OS_MONTHS, OS_STATUS
table(dat3$ACHE)
## 
## high  low 
##  257  257
DT::datatable(dat3)
library(survival)
my.surv <- Surv(dat3$OS_MONTHS,dat3$OS_STATUS=='1:DECEASED')
kmfit1 <- survfit(my.surv~ACHE,data = dat3) 
getwd()
## [1] "C:/Users/liyix/OneDrive/Desktop"
ggsurvplot(kmfit1,conf.int =F, pval = T,risk.table =T, ncensor.plot = TRUE)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

dir_path <- getwd()
ggsurvplot(kmfit1,conf.int =F, pval = T,risk.table =F, ncensor.plot = F)

ggsave(filename = paste0(Sys.Date(),"-","surv.tif"), 
       plot = last_plot(), device = "tiff", path = dir_path,
       width = 10, height = 12, units = "cm",
       dpi = 300, limitsize = TRUE,compression = "lzw")
#########################
#ref https://www.jianshu.com/p/fd5e06ec260b
#