R Markdown

Install packages

Load packages

Survival analysis

Below we inspect the selected TCGA data to find the columns we are interested in exploring.

# Print the name of every sample-level annotation column stored in tcga_data.
colnames(colData(tcga_data))

##   [1] "barcode"                           "patient"                          
##   [3] "sample"                            "shortLetterCode"                  
##   [5] "definition"                        "sample_submitter_id"              
##   [7] "sample_type_id"                    "sample_id"                        
##   [9] "sample_type"                       "days_to_collection"               
##  [11] "state"                             "initial_weight"                   
##  [13] "intermediate_dimension"            "pathology_report_uuid"            
##  [15] "submitter_id"                      "shortest_dimension"               
##  [17] "oct_embedded"                      "longest_dimension"                
##  [19] "is_ffpe"                           "tissue_type"                      
##  [21] "synchronous_malignancy"            "days_to_diagnosis"                
##  [23] "treatments"                        "last_known_disease_status"        
##  [25] "tissue_or_organ_of_origin"         "days_to_last_follow_up"           
##  [27] "primary_diagnosis"                 "age_at_diagnosis"                 
##  [29] "year_of_diagnosis"                 "prior_malignancy"                 
##  [31] "secondary_gleason_grade"           "prior_treatment"                  
##  [33] "primary_gleason_grade"             "ajcc_pathologic_t"                
##  [35] "morphology"                        "ajcc_clinical_m"                  
##  [37] "ajcc_pathologic_n"                 "ajcc_clinical_t"                  
##  [39] "classification_of_tumor"           "diagnosis_id"                     
##  [41] "site_of_resection_or_biopsy"       "icd_10_code"                      
##  [43] "tumor_grade"                       "progression_or_recurrence"        
##  [45] "alcohol_history"                   "exposure_id"                      
##  [47] "vital_status"                      "gender"                           
##  [49] "race"                              "ethnicity"                        
##  [51] "age_at_index"                      "days_to_birth"                    
##  [53] "year_of_birth"                     "demographic_id"                   
##  [55] "bcr_patient_barcode"               "primary_site"                     
##  [57] "project_id"                        "disease_type"                     
##  [59] "releasable"                        "name"                             
##  [61] "released"                          "year_of_death"                    
##  [63] "days_to_death"                     "paper_sample"                     
##  [65] "paper_patient"                     "paper_Subtype"                    
##  [67] "paper_Batch"                       "paper_Age"                        
##  [69] "paper_Race"                        "paper_PSA_preop"                  
##  [71] "paper_Tumor_cellularity_pathology" "paper_Reviewed_Gleason"           
##  [73] "paper_Reviewed_Gleason_category"   "paper_Reviewed_Gleason_sum"       
##  [75] "paper_Clinical_Gleason"            "paper_Clinical_Gleason_category"  
##  [77] "paper_Clinical_Gleason_sum"        "paper_Residual_tumor"             
##  [79] "paper_Absolute_Ploidy"             "paper_Absolute_Genome_Doublings"  
##  [81] "paper_ABSOLUTE_purity"             "paper_CLONET_purity"              
##  [83] "paper_avgDNA_purity"               "paper_ISOpure_purity"             
##  [85] "paper_DeMix_purity"                "paper_avgRNA_purity"              
##  [87] "paper_SCNA_cluster"                "paper_iCluster"                   
##  [89] "paper_mRNA_cluster"                "paper_methylation_cluster"        
##  [91] "paper_RPPA_cluster"                "paper_miRNA_cluster"              
##  [93] "paper_exon_imbalance_score"        "paper_ERG_status"                 
##  [95] "paper_ETV1_status"                 "paper_ETV4_status"                
##  [97] "paper_FLI1_status"                 "paper_SPOP_mut"                   
##  [99] "paper_FOXA1_mut"                   "paper_MED12_mut"                  
## [101] "paper_IDH1_mut"                    "paper_KMT2A_mut"                  
## [103] "paper_KMT2C_mut"                   "paper_KMT2D_mut"                  
## [105] "paper_KDM6A_mut"                   "paper_SETD2_mut"                  
## [107] "paper_CHD1_mut"                    "paper_TP53_mut"                   
## [109] "paper_PTEN_mut"                    "paper_PIK3CA_mut"                 
## [111] "paper_BRAF_mut"                    "paper_HRAS_mut"                   
## [113] "paper_CTNNB1_mut"                  "paper_AKT1_mut"                   
## [115] "paper_BRCA1_mut"                   "paper_BRCA2_mut"                  
## [117] "paper_BRCA1_germline_mut"          "paper_BRCA2_germline_mut"         
## [119] "paper_CDKN1B_mut"                  "paper_RB1_mut"                    
## [121] "paper_ZMYM3_mut"                   "paper_ATM_MUT"                    
## [123] "paper_CDK12_MUT"                   "paper_FANCC_MUT"                  
## [125] "paper_FANCD2_MUT"                  "paper_SPINK1_high"                
## [127] "paper_Mutations"                   "paper_Fraction_genome_altered"    
## [129] "paper_PTEN_CNA"                    "paper_TP53_CNA"                   
## [131] "paper_CHD1_CNA"                    "paper_BRCA1_CNA"                  
## [133] "paper_BRCA2_CNA"                   "paper_CDKN1B_CNA"                 
## [135] "paper_RB1_CNA"                     "paper_CDK12_CNA"                  
## [137] "paper_FANCD2_CNA"                  "paper_FAM175A_CNA"                
## [139] "paper_FANCC_CNA"                   "paper_RAD51C_CNA"                 
## [141] "paper_SPOPL_CNA"                   "paper_AR_score"                   
## [143] "paper_AR_protein"                  "paper_AR_mRNA"                    
## [145] "paper_AR_V7_reads"                 "paper_AR_V7_ratio"                
## [147] "paper_AR_V7_presence"

While going through the TCGA-PRAD data we came across 28 paper mutation columns (the `paper_*_mut` / `paper_*_MUT` columns used in the code below), each flagging a mutation that the published study was looking for.

# Find the row positions in the TCGA-PRAD sample annotations whose SPOP
# mutation flag equals 1. Those positions are used later to look up the
# matching patients in the clinical table.
has_spop <- tcga_data$paper_SPOP_mut == 1
spop <- which(has_spop)

print(spop)

##  [1]  25  33  63  68  76  92  95  98 126 161 164 183 220 230 245 248 249 254 259
## [20] 280 281 290 292 294 308 315 350 371 374 394 399 414 434 461 471 483 546 548

Tabulating based on column names to have a better sense of what is entailed under each subtype

Assigning the TCGA-PRAD sample annotations to a new variable named ‘clinical’ and checking the size of the data

# Take the sample annotation table from the TCGA object through its accessor
# (the same accessor used when listing the column names) and show its size.
clinical <- colData(tcga_data)

print(dim(clinical))

## [1] 551 147

Survival Analysis on Overall Data

Running an overall survival analysis on the clinical data, stratified by the clinical tumour stage (`ajcc_clinical_t`) recorded for each patient

# Overall survival, stratified by clinical T stage (ajcc_clinical_t).

# Keep primary-tumour samples and only the columns the survival model reads.
clin_overall <- clinical[clinical$definition == "Primary solid Tumor",
                         c("patient",
                           "vital_status",
                           "days_to_death",
                           "days_to_last_follow_up",
                           "ajcc_clinical_t")]

# clin_overall$ajcc_clinical_t <- clin_overall$ajcc_clinical_t[clin_overall$ajcc_clinical_t != "T2"]
# clin_overall$ajcc_clinical_t <- na.omit(clin_overall$ajcc_clinical_t)

# Event indicator: TRUE when the patient is recorded as dead.
clin_overall$deceased <- clin_overall$vital_status == "Dead"

# Time axis: days to death for events, days to last follow-up otherwise.
clin_overall$overall_survival <- ifelse(clin_overall$deceased,
                                        clin_overall$days_to_death,
                                        clin_overall$days_to_last_follow_up)

# One survival curve per stage code.
fit <- survfit(Surv(overall_survival, deceased) ~ ajcc_clinical_t, data = clin_overall)
# Kept for reference; the plot below derives its own p-value via pval = TRUE.
pval <- surv_pvalue(fit, data = clin_overall)$pval

# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
ggsurvplot(fit, data = clin_overall, legend = "right", pval = TRUE, title = "Survival Analysis of Overall Data: Codes")

# Second pass over the same primary-tumour subset, this time grouped by the
# gender column instead of the stage code.
# NOTE(review): presumably every patient in this prostate cohort shares one
# gender value, which would make this a single un-stratified curve - confirm.
overall_cols <- c("patient",
                  "vital_status",
                  "days_to_death",
                  "days_to_last_follow_up",
                  "gender")
overall_is_primary <- clinical$definition == "Primary solid Tumor"
clin_overall <- clinical[overall_is_primary, overall_cols]

# Event indicator and follow-up time, built the same way as for the stage plot.
clin_overall$deceased <- clin_overall$vital_status == "Dead"
clin_overall$overall_survival <- ifelse(clin_overall$deceased,
                                        clin_overall$days_to_death,
                                        clin_overall$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ gender, data = clin_overall)
# pval <- surv_pvalue(fit, data = clin_overall)$pval

ggsurvplot(fit,
           data = clin_overall,
           legend = "right",
           title = "Survival Analysis of Overall Data: Overall")

Assigning the list of patients that contain each studied mutation to unique variables so that they can be cross referenced later

# For every mutation flag column, record the row positions of the samples
# where the flag equals 1 (for example, chd1 holds the rows of the patients
# flagged for CHD1). The vectors are pooled afterwards to count how many of
# the studied mutations each patient carries.
paper_flags <- colData(tcga_data)

spop <- which(paper_flags$paper_SPOP_mut == 1)
foxa1 <- which(paper_flags$paper_FOXA1_mut == 1)
med12 <- which(paper_flags$paper_MED12_mut == 1)
idh1 <- which(paper_flags$paper_IDH1_mut == 1)
kmt2a <- which(paper_flags$paper_KMT2A_mut == 1)
kmt2c <- which(paper_flags$paper_KMT2C_mut == 1)
kmt2d <- which(paper_flags$paper_KMT2D_mut == 1)
kdm6a <- which(paper_flags$paper_KDM6A_mut == 1)
setd2 <- which(paper_flags$paper_SETD2_mut == 1)
chd1 <- which(paper_flags$paper_CHD1_mut == 1)
tp53 <- which(paper_flags$paper_TP53_mut == 1)
pten <- which(paper_flags$paper_PTEN_mut == 1)
pik3ca <- which(paper_flags$paper_PIK3CA_mut == 1)
braf <- which(paper_flags$paper_BRAF_mut == 1)
hras <- which(paper_flags$paper_HRAS_mut == 1)
ctnnb1 <- which(paper_flags$paper_CTNNB1_mut == 1)
akt1 <- which(paper_flags$paper_AKT1_mut == 1)
brc1 <- which(paper_flags$paper_BRCA1_mut == 1)
brc2 <- which(paper_flags$paper_BRCA2_mut == 1)
brca1_germline <- which(paper_flags$paper_BRCA1_germline_mut == 1)
brca2_germline <- which(paper_flags$paper_BRCA2_germline_mut == 1)
cdkn1b <- which(paper_flags$paper_CDKN1B_mut == 1)
rb1 <- which(paper_flags$paper_RB1_mut == 1)
zmym3 <- which(paper_flags$paper_ZMYM3_mut == 1)
atm <- which(paper_flags$paper_ATM_MUT == 1)
cdk12 <- which(paper_flags$paper_CDK12_MUT == 1)
fancc <- which(paper_flags$paper_FANCC_MUT == 1)
fancd2 <- which(paper_flags$paper_FANCD2_MUT == 1)

Creation of the mutation list

# Pool the carrier row positions of all 28 studied mutations. A patient's row
# position appears once for every mutation that patient carries, so counting
# repeats gives the number of studied mutations per patient.
mutlist <- c(spop, foxa1, med12, idh1, kmt2a, kmt2c, kmt2d, kdm6a, setd2, chd1, tp53, pten, pik3ca, braf, hras, ctnnb1, akt1, brc1, brc2, brca1_germline, brca2_germline, cdkn1b, rb1, zmym3, atm, cdk12, fancc, fancd2)

# Occurrences of each row position. The table's names are the row positions;
# which() on the table would instead return positions within the table, so the
# patient rows are read from the names.
mut_counts <- table(mutlist)

# one_mut: patient rows that appear exactly once (one studied mutation).
one_mut <- as.integer(names(mut_counts)[mut_counts == 1])

# more_mut: patient rows that appear more than once (several studied mutations).
more_mut <- as.integer(names(mut_counts)[mut_counts > 1])

Assigning the lists manually due to index errors pulling from table()

# which(table(...)) printed two numbers per entry: the patient row position
# (the name) and the position inside the table (the value). The patient row
# positions were therefore copied by hand into the vectors below.
# NOTE(review): these values are pasted, not computed; they will go stale if
# the input data changes.

# one_mut: row positions of patients found to have exactly one studied mutation
one_mut <- c(1,5,14,30,33,41,42,47,  50,  53,  63,  66,  69,  80,  82,  92,  95, 100, 103, 104, 106, 107, 113, 123,146, 148, 161, 179, 183, 190, 195, 203, 211, 220, 227, 242, 243, 248, 257, 258, 260, 281, 288, 290, 291, 300, 308, 314, 321,325, 328, 330, 339, 343, 348, 350, 355, 360, 361, 363, 369, 371, 372, 378, 382, 386, 394, 399, 405, 408, 410, 413, 425, 428,430, 434, 447, 460, 461, 468, 469, 472, 475, 479, 483, 489, 491, 498, 499, 504, 514, 515, 517, 520, 522, 525, 531, 532, 533,534, 541, 546 )

# more_mut: row positions of patients found to have more than one studied mutation
more_mut <- c(25,  35,  38,46,  59,  68,  76,  90,  98, 120, 126, 135, 164, 166, 169, 176, 230, 231, 235, 241, 245, 249, 250, 252, 254, 259, 266, 270, 280, 292, 294, 315, 332, 333, 362, 374, 398, 406, 411, 414, 433, 436, 437, 471, 509, 548  )

# Group sizes, as a sanity check on the pasted vectors.
length(more_mut)

## [1] 46

length(one_mut)

## [1] 102

Creating new datasets based on whether a patient has one studied mutation or more than one.

# Clinical rows of the patients carrying exactly one studied mutation.
# Direct row subsetting keeps the rows in the order given by one_mut and
# replaces the earlier approach of seeding the table with a duplicate first
# row, appending one row per loop pass with rbind(), and dropping the seed row.
single_mut <- clinical[one_mut, ]

# Clinical rows of the patients carrying more than one studied mutation.
multi_mut <- clinical[more_mut, ]

# Size check: one row per entry of one_mut is expected.
dim(single_mut)

## [1] 102 147

# List the vital status of every patient in the multiple-mutation group, then
# count how many are alive and how many are dead.
multi_mut$vital_status

##  [1] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
## [10] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
## [19] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
## [28] "Alive" "Alive" "Alive" "Alive" "Dead"  "Alive" "Alive" "Alive" "Alive"
## [37] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Dead"  "Alive" "Alive"
## [46] "Alive"

table(multi_mut$vital_status)

## 
## Alive  Dead 
##    44     2

# Same listing and count for the single-mutation group.
single_mut$vital_status

##   [1] "Alive" "Alive" "Alive" "Alive" "Alive" "Dead"  "Alive" "Alive" "Alive"
##  [10] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [19] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [28] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [37] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Dead" 
##  [46] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [55] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [64] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [73] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Dead"  "Alive"
##  [82] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
##  [91] "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
## [100] "Alive" "Alive" "Alive"

table(single_mut$vital_status)

## 
## Alive  Dead 
##    99     3

Survival Analysis On Single Mutation Data

# Survival by clinical T stage for the patients carrying exactly one studied
# mutation (primary-tumour samples only).
clin_single <- single_mut[single_mut$definition == "Primary solid Tumor",
                          c("patient",
                            "vital_status",
                            "days_to_death",
                            "days_to_last_follow_up",
                            "ajcc_clinical_t")]

# Event indicator: TRUE when the patient is recorded as dead.
clin_single$deceased <- clin_single$vital_status == "Dead"

# Time axis: days to death for events, days to last follow-up otherwise.
clin_single$overall_survival <- ifelse(clin_single$deceased,
                                       clin_single$days_to_death,
                                       clin_single$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ ajcc_clinical_t, data = clin_single)
# Kept for reference; the plot below derives its own p-value via pval = TRUE.
pval <- surv_pvalue(fit, data = clin_single)$pval

# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
ggsurvplot(fit, data = clin_single, pval = TRUE, legend = "right", title = "Survival Analysis of Single Mutation Data: Code")

# Single-mutation group again, grouped by the gender column rather than the
# stage code.
single_cols <- c("patient",
                 "vital_status",
                 "days_to_death",
                 "days_to_last_follow_up",
                 "gender")
single_is_primary <- single_mut$definition == "Primary solid Tumor"
clin_single <- single_mut[single_is_primary, single_cols]

# Event indicator, then follow-up time chosen according to it.
clin_single$deceased <- clin_single$vital_status == "Dead"
clin_single$overall_survival <- ifelse(clin_single$deceased,
                                       clin_single$days_to_death,
                                       clin_single$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ gender, data = clin_single)
# pval <- surv_pvalue(fit, data = clin_single)$pval

ggsurvplot(fit,
           data = clin_single,
           legend = "right",
           title = "Survival Analysis of Single Mutation Data: Overall")

Survival Analysis on Multiple Mutation Data

# Survival by clinical T stage for the patients carrying more than one studied
# mutation (primary-tumour samples only).
clin_multi <- multi_mut[multi_mut$definition == "Primary solid Tumor",
                        c("patient",
                          "vital_status",
                          "days_to_death",
                          "days_to_last_follow_up",
                          "ajcc_clinical_t")]

# Event indicator: TRUE when the patient is recorded as dead.
clin_multi$deceased <- clin_multi$vital_status == "Dead"

# Time axis: days to death for events, days to last follow-up otherwise.
clin_multi$overall_survival <- ifelse(clin_multi$deceased,
                                      clin_multi$days_to_death,
                                      clin_multi$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ ajcc_clinical_t, data = clin_multi)

# Kept for reference; the plot below derives its own p-value via pval = TRUE.
pval <- surv_pvalue(fit, data = clin_multi)$pval

# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
ggsurvplot(fit, data = clin_multi, pval = TRUE, legend = "right", title = "Survival Analysis of Multiple Mutation Data: Code")

# The multiple-mutation group is too small to relate death to both the stage
# code and the mutation count at once, so this plot drops the stage split and
# only relates the mutation group to death.
multi_cols <- c("patient",
                "vital_status",
                "days_to_death",
                "days_to_last_follow_up",
                "gender")
multi_is_primary <- multi_mut$definition == "Primary solid Tumor"
clin_multi <- multi_mut[multi_is_primary, multi_cols]

# Event indicator, then follow-up time chosen according to it.
clin_multi$deceased <- clin_multi$vital_status == "Dead"
clin_multi$overall_survival <- ifelse(clin_multi$deceased,
                                      clin_multi$days_to_death,
                                      clin_multi$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ gender, data = clin_multi)

ggsurvplot(fit,
           data = clin_multi,
           legend = "right",
           title = "Survival Analysis of Multiple Mutation Data: Overall")

Creating a Fourth Dataset Based on Patients with None of the Studied Mutations (No or Unknown Mutations)

# Row positions of every patient with at least one studied mutation.
# NOTE(review): this appears to be the one_mut values followed by the more_mut
# values, pasted as literals - confirm and consider deriving it from them.
combind_mut <-c(1,5,14,30,33,41,42,47,  50,  53,  63,  66,  69,  80,  82,  92,  95, 100, 103, 104, 106, 107, 113, 123,146, 148, 161, 179, 183, 190, 195, 203, 211, 220, 227, 242, 243, 248, 257, 258, 260, 281, 288, 290, 291, 300, 308, 314, 321,325, 328, 330, 339, 343, 348, 350, 355, 360, 361, 363, 369, 371, 372, 378, 382, 386, 394, 399, 405, 408, 410, 413, 425, 428,430, 434, 447, 460, 461, 468, 469, 472, 475, 479, 483, 489, 491, 498, 499, 504, 514, 515, 517, 520, 522, 525, 531, 532, 533,534, 541, 546 , 25,  35,  38,46,  59,  68,  76,  90,  98, 120, 126, 135, 164, 166, 169, 176, 230, 231, 235, 241, 245, 249, 250, 252, 254, 259, 266, 270, 280, 292, 294, 315, 332, 333, 362, 374, 398, 406, 411, 414, 433, 436, 437, 471, 509, 548  )

combind_mut <- sort(combind_mut)
combind_mut_num <- as.numeric(unlist(combind_mut))
unknown_mut <- c(1:551)
unknown_mut_num <- as.numeric(unlist(unknown_mut))

# Remove every mutated row position from the full 1..551 range, matching by
# value. The earlier loop compared the two vectors element-by-element at the
# same index, which is not a membership test, so it removed only the first
# position and left the other mutated patients in the "unknown" list.
unknown_mut_num <- setdiff(unknown_mut_num, combind_mut_num)

Building `unknown_mut`: the row positions of the patients that appear in neither mutation list from the earlier code

# Row positions of every patient with at least one studied mutation, built
# from the two existing group vectors instead of a third pasted copy of the
# same numbers.
combind_mut <- sort(c(one_mut, more_mut))

# Every clinical row position that is not in combind_mut. setdiff() matches by
# value and still returns the full range when combind_mut is empty, whereas
# negative indexing with an empty vector would return nothing. The row count
# comes from the clinical table rather than a hard-coded 551.
unknown_mut <- setdiff(seq_len(nrow(clinical)), combind_mut)

unknown_mut

##   [1]   2   3   4   6   7   8   9  10  11  12  13  15  16  17  18  19  20  21
##  [19]  22  23  24  26  27  28  29  31  32  34  36  37  39  40  43  44  45  48
##  [37]  49  51  52  54  55  56  57  58  60  61  62  64  65  67  70  71  72  73
##  [55]  74  75  77  78  79  81  83  84  85  86  87  88  89  91  93  94  96  97
##  [73]  99 101 102 105 108 109 110 111 112 114 115 116 117 118 119 121 122 124
##  [91] 125 127 128 129 130 131 132 133 134 136 137 138 139 140 141 142 143 144
## [109] 145 147 149 150 151 152 153 154 155 156 157 158 159 160 162 163 165 167
## [127] 168 170 171 172 173 174 175 177 178 180 181 182 184 185 186 187 188 189
## [145] 191 192 193 194 196 197 198 199 200 201 202 204 205 206 207 208 209 210
## [163] 212 213 214 215 216 217 218 219 221 222 223 224 225 226 228 229 232 233
## [181] 234 236 237 238 239 240 244 246 247 251 253 255 256 261 262 263 264 265
## [199] 267 268 269 271 272 273 274 275 276 277 278 279 282 283 284 285 286 287
## [217] 289 293 295 296 297 298 299 301 302 303 304 305 306 307 309 310 311 312
## [235] 313 316 317 318 319 320 322 323 324 326 327 329 331 334 335 336 337 338
## [253] 340 341 342 344 345 346 347 349 351 352 353 354 356 357 358 359 364 365
## [271] 366 367 368 370 373 375 376 377 379 380 381 383 384 385 387 388 389 390
## [289] 391 392 393 395 396 397 400 401 402 403 404 407 409 412 415 416 417 418
## [307] 419 420 421 422 423 424 426 427 429 431 432 435 438 439 440 441 442 443
## [325] 444 445 446 448 449 450 451 452 453 454 455 456 457 458 459 462 463 464
## [343] 465 466 467 470 473 474 476 477 478 480 481 482 484 485 486 487 488 490
## [361] 492 493 494 495 496 497 500 501 502 503 505 506 507 508 510 511 512 513
## [379] 516 518 519 521 523 524 526 527 528 529 530 535 536 537 538 539 540 542
## [397] 543 544 545 547 549 550 551

Filtering the clinical data down to the unknown-mutation patients, i.e. keeping only the rows that are in neither mutation group

# Clinical rows of the patients in unknown_mut. Direct row subsetting replaces
# the seed-row / rbind()-per-iteration / drop-seed-row pattern and keeps the
# rows in the order given by unknown_mut.
idk_mut <- clinical[unknown_mut, ]

Plotting the idk_mut, which is the data frame containing patient tags with no mutations or unknown mutations

# Survival by clinical T stage for the patients with none of the studied
# mutations recorded (primary-tumour samples only).
clin_idk <- idk_mut[idk_mut$definition == "Primary solid Tumor",
                    c("patient",
                      "vital_status",
                      "days_to_death",
                      "days_to_last_follow_up",
                      "ajcc_clinical_t")]

# Event indicator: TRUE when the patient is recorded as dead.
clin_idk$deceased <- clin_idk$vital_status == "Dead"

# Time axis: days to death for events, days to last follow-up otherwise.
clin_idk$overall_survival <- ifelse(clin_idk$deceased,
                                    clin_idk$days_to_death,
                                    clin_idk$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ ajcc_clinical_t, data = clin_idk)
# Kept for reference; the plot below derives its own p-value via pval = TRUE.
pval <- surv_pvalue(fit, data = clin_idk)$pval

# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
ggsurvplot(fit, data = clin_idk, pval = TRUE, legend = "right", title = "Unknown Mutation Survival Analysis: Code")

# Unknown-mutation group again, grouped by the gender column rather than the
# stage code.
idk_cols <- c("patient",
              "vital_status",
              "days_to_death",
              "days_to_last_follow_up",
              "gender")
idk_is_primary <- idk_mut$definition == "Primary solid Tumor"
clin_idk <- idk_mut[idk_is_primary, idk_cols]

# Event indicator, then follow-up time chosen according to it.
clin_idk$deceased <- clin_idk$vital_status == "Dead"
clin_idk$overall_survival <- ifelse(clin_idk$deceased,
                                    clin_idk$days_to_death,
                                    clin_idk$days_to_last_follow_up)

fit <- survfit(Surv(overall_survival, deceased) ~ gender, data = clin_idk)
# pval <- surv_pvalue(fit, data = clin_idk)$pval

ggsurvplot(fit,
           data = clin_idk,
           legend = "right",
           title = "Unknown Mutation Survival Analysis: Overall")

Creating lists of the patient IDs that are relevant for each set of data

# Build one identifier vector per group: the first 12 characters of the first
# column of each group's clinical table.
first_twelve <- function(group_table) {
  substr(group_table[[1]], 1, 12)
}
names_multi <- first_twelve(multi_mut)
names_sing <- first_twelve(single_mut)
names_idk <- first_twelve(idk_mut)

# rna_multi <- which(colnames(RNAseq_matrix) == names_multi)
rnanamevec <- logical(46)

# Discard the first two columns of the expression matrix.
# NOTE(review): this overwrites RNAseq_matrix in place, so running the chunk a
# second time drops two further columns.
RNAseq_matrix <- RNAseq_matrix[, -c(1, 2)]

Expressional Analysis using Heatmap - Single Mutation

# Heatmap for the single-mutation group.
# NOTE(review): one_mut holds row positions from the clinical table, but it is
# used here as a ROW index into RNAseq_matrix, and the results printed below
# are named with Ensembl gene IDs. That suggests rows are genes rather than
# patients - confirm whether a column (patient) subset was intended.
rna_one <- RNAseq_matrix[one_mut, ]

# Log scale with a pseudo-count of 1 so that zero values stay finite.
pheatmap(
  log(rna_one + 1),
  show_rownames = FALSE,
  show_colnames = FALSE
)

# Mean of every row of the subset, then the rows that attain the smallest and
# the largest mean.
mean_one <- apply(rna_one, MARGIN = 1, FUN = mean)
min_one <- min(mean_one)
max_one <- max(mean_one)
lowest_expressed_one <- which(mean_one == min_one)
highest_exressed_one <- which(mean_one == max_one)
print(highest_exressed_one)

## ENSG00000014257 
##              56

print(lowest_expressed_one)

## ENSG00000018607 
##              65

Expressional Analysis using Heatmap - Multiple Mutations

# Heatmap for the multiple-mutation group.
# NOTE(review): more_mut holds row positions from the clinical table, but it is
# used here as a ROW index into RNAseq_matrix, and the results printed below
# are named with Ensembl gene IDs. That suggests rows are genes rather than
# patients - confirm whether a column (patient) subset was intended.
rna_multi <- RNAseq_matrix[more_mut, ]

# Log scale with a pseudo-count of 1 so that zero values stay finite.
pheatmap(
  log(rna_multi + 1),
  show_rownames = FALSE,
  show_colnames = FALSE
)

# Mean of every row of the subset, then the rows that attain the smallest and
# the largest mean.
mean_multi <- apply(rna_multi, MARGIN = 1, FUN = mean)
min_multi <- min(mean_multi)
max_multi <- max(mean_multi)
lowest_expressed_multi <- which(mean_multi == min_multi)
highest_exressed_multi <- which(mean_multi == max_multi)
print(highest_exressed_multi)

## ENSG00000008988 
##              18

print(lowest_expressed_multi)

## ENSG00000021852 
##              40

Expressional Analysis using Heatmap - Unknown Mutations

# Heatmap for the unknown-mutation group.
# The subset uses unknown_mut. It previously used combind_mut, which holds the
# patients that DO carry a studied mutation, so any output captured before
# this change describes the mutated patients rather than the unknown group.
# NOTE(review): as in the other heatmaps, the index is applied to the ROWS of
# RNAseq_matrix, while the printed results carry Ensembl gene IDs as names.
# Confirm whether a column (patient) subset was intended.
rna_idk <- RNAseq_matrix[unknown_mut, ]

# Log scale with a pseudo-count of 1 so that zero values stay finite.
pheatmap(log(rna_idk + 1), show_rownames = FALSE, show_colnames = FALSE)

# Mean of every row of the subset, then the rows that attain the largest and
# the smallest mean.
mean_idk <- apply(rna_idk, 1, mean)
max_idk <- max(mean_idk)
min_idk <- min(mean_idk)
highest_exressed_idk <- which(mean_idk == max_idk)
lowest_expressed_idk <- which(mean_idk == min_idk)
print(highest_exressed_idk)

## ENSG00000014257 
##              90

print(lowest_expressed_idk)

## ENSG00000018607 
##             101

Relating separated TCGA data to Clinical Data in order to find the average stage of cancer for each subgroup

# For each group, find the rows of clinical.dataset whose patient identifier
# equals one of the group's names. Results follow the order of the names, and
# a name that matches several rows contributes all of them.
#
# The lookup iterates over the names themselves. The earlier loops ran from 1
# to the number of clinical.dataset rows, which indexed past the end of the
# shorter name vectors, would silently skip names if the clinical table were
# shorter than a name vector, and grew the result with c() on every pass.
match_clinical_rows <- function(patient_names) {
  clinical_ids <- clinical.dataset$X.Patient.Identifier
  hits <- lapply(patient_names, function(nm) which(clinical_ids == nm))
  # as.integer() keeps the result an integer vector even when nothing matches.
  as.integer(unlist(hits))
}

# Rows matching the multiple-mutation names
multi_list <- match_clinical_rows(names_multi)

# Rows matching the single-mutation names
single_list <- match_clinical_rows(names_sing)

# Rows matching the unknown-mutation names
idk_list <- match_clinical_rows(names_idk)

Creating a new dataset that can be edited and replacing all occurrences of the cancer stage codes with integers

# Working copy of the clinical dataset; the stage column is recoded in place.
cdata <- clinical.dataset

# Rows holding each stage code. All lookups are done before any value is
# replaced.
i_t2a <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T2A")
i_t2b <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T2B")
i_t2c <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T2C")
i_t3a <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T3A")
i_t3b <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T3B")
i_t4 <- which(cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code == "T4")

# Replace each stage code with a rank (T2A = 1 ... T4 = 6) so a mean can be
# taken later. Assigning to the whole index vector at once replaces the
# 1:length() loops, which step through indices 1 and 0 when a stage has no
# rows. Any other value in the column is left untouched.
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t2a] <- 1
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t2b] <- 2
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t2c] <- 3
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t3a] <- 4
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t3b] <- 5
cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[i_t4] <- 6

Mean cancer stage calculations

# Mean recoded stage (T2A = 1 ... T4 = 6) for each mutation group.
# strtoi() turns the recoded values into integers; anything that is not an
# integer string becomes NA and is dropped before averaging. The
# multiple-mutation group now drops NAs too, matching the other two groups, so
# a single unrecoded value cannot turn its mean into NA.
# NOTE(review): the stage named in each message is written into the text, not
# derived from the computed mean - re-check the wording if the data changes.

code_multi <- cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[multi_list]
code_multi <- strtoi(code_multi)
code_multi <- na.omit(code_multi)
multi_mean <- mean(code_multi)
sprintf("The mean value is %g therefore the average cancer level is T3A when the patient shows multiple studied mutations", multi_mean)

## [1] "The mean value is 3.93478 therefore the average cancer level is T3A when the patient shows multiple studied mutations"

code_single <- cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[single_list]
code_single <- strtoi(code_single)
code_single <- na.omit(code_single)
single_mean <- mean(code_single)
sprintf("The mean value is %g therefore the average cancer level is between T2C and T3A when the patient shows expression of a single studied mutation", single_mean)

## [1] "The mean value is 3.73 therefore the average cancer level is between T2C and T3A when the patient shows expression of a single studied mutation"

code_idk <- cdata$American.Joint.Committee.on.Cancer.Tumor.Stage.Code[idk_list]
code_idk <- strtoi(code_idk)
code_idk <- na.omit(code_idk)
idk_mean <- mean(code_idk)
sprintf("The mean value is %g therefore the average cancer level is between T2C and T3A when the patient shows expression of unstudied mutations", idk_mean)

## [1] "The mean value is 3.84518 therefore the average cancer level is between T2C and T3A when the patient shows expression of unstudied mutations"

Final Project