#library(arsenal) playing with descriptive stats tables
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.3
## Loading required package: usethis
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(survival)
library(survminer)
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.3.3
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.3.3
## 
## Attaching package: 'ggsurvfit'
## 
## The following object is masked from 'package:psych':
## 
##     %+%

Intro

This project uses CNS/Brain Cancer clinical data from the CPTAC CNS/Brain Cancer (GDC) study, generated in July 2024, to understand clinical predictors of overall survival. The first null hypothesis is that there is no difference in median overall survival time across the five biopsy-site groups; any observed difference would be due to chance or sampling error. The second null hypothesis is that there is no difference in median mutation count across the five biopsy-site groups; again, any observed difference would be due to chance or sampling error. I set alpha to 0.05. The Kruskal-Wallis test is a statistical test that determines whether the medians of independent groups differ significantly. It is the nonparametric equivalent of the one-way ANOVA test.

Methods

# Read the tab-separated clinical data file (first row holds the column names)
# into `raw`, then preview its first six rows.
clinical_tsv <- "C:/Users/maddoxp/OneDrive - Children's Hospital of Philadelphia/Documents/Biostatistics Class/brain_cptac_gdc_clinical_data.tsv"
raw <- read.delim(file = clinical_tsv, sep = "\t", header = TRUE)
#write.csv(raw, "final_project_p1.csv")
head(raw, n = 6L)
##          Study.ID Patient.ID    Sample.ID Diagnosis.Age AIDS.risk.factors
## 1 brain_cptac_gdc  C3L-00104 C3L-00104-01            58              <NA>
## 2 brain_cptac_gdc  C3L-00365 C3L-00365-01            59              <NA>
## 3 brain_cptac_gdc  C3L-00674 C3L-00674-71            45              <NA>
## 4 brain_cptac_gdc  C3L-00677 C3L-00677-72            69              <NA>
## 5 brain_cptac_gdc  C3L-01043 C3L-01043-01            59              <NA>
## 6 brain_cptac_gdc  C3L-01045 C3L-01045-03            73              <NA>
##   Alcohol.History.Documented    Alcohol.Intensity   Biopsy.Site Body.Mass.Index
## 1                       True   Occasional Drinker  Frontal lobe           32.54
## 2                       True   Occasional Drinker Parietal lobe           20.61
## 3                       <NA>                 <NA>  Frontal lobe           27.44
## 4                       <NA>                 <NA>  Frontal lobe           19.32
## 5                      False Lifelong Non-Drinker Parietal lobe           24.22
## 6                      False Lifelong Non-Drinker Temporal lobe           22.04
##        Cancer.Type Cancer.Type.Detailed Cause.of.Death Comorbidity Consent.Type
## 1 CNS/Brain Cancer            CNS/Brain           <NA>        <NA>           NA
## 2 CNS/Brain Cancer            CNS/Brain           <NA>  Depression           NA
## 3 CNS/Brain Cancer            CNS/Brain Cancer Related        <NA>           NA
## 4 CNS/Brain Cancer            CNS/Brain Cancer Related        <NA>           NA
## 5 CNS/Brain Cancer            CNS/Brain Cancer Related        <NA>           NA
## 6 CNS/Brain Cancer            CNS/Brain      Infection        <NA>           NA
##   Disease.Free..Months.   Disease.Free.Status Diabetes.Treatment.Type
## 1              3.318003         0:DiseaseFree                      NA
## 2             10.578187                  <NA>                      NA
## 3              8.048620 1:Recurred/Progressed                      NA
## 4              5.091984         0:DiseaseFree                      NA
## 5              2.595269                  <NA>                      NA
## 6              2.168200                  <NA>                      NA
##   Disease.Response       Disease.Status Disease.Type ECOG.Performance.Status
## 1             <NA>           With tumor      Gliomas                      NA
## 2             <NA> Unknown tumor status      Gliomas                      NA
## 3             <NA> Unknown tumor status      Gliomas                      NA
## 4             <NA> Unknown tumor status      Gliomas                       1
## 5             <NA>           With tumor      Gliomas                      NA
## 6             <NA>           With tumor      Gliomas                      NA
##       Ethnicity.Category Height..cms. Index.Date KPS
## 1 NOT HISPANIC OR LATINO       188.00  Diagnosis  NA
## 2 NOT HISPANIC OR LATINO       162.00  Diagnosis  NA
## 3 NOT HISPANIC OR LATINO       193.00  Diagnosis  NA
## 4 NOT HISPANIC OR LATINO       164.47  Diagnosis  70
## 5                   <NA>       170.00  Diagnosis   0
## 6                   <NA>       165.00  Diagnosis   0
##   Number.of.positive.lymph.nodes Morphology Mutation.Count Oncotree.Code
## 1                             NA     9440/3             59         BRAIN
## 2                             NA     9440/3             62         BRAIN
## 3                             NA     9440/3             38         BRAIN
## 4                             NA     9440/3           1190         BRAIN
## 5                             NA     9440/3             32         BRAIN
## 6                             NA     9440/3             58         BRAIN
##   Overall.Survival..Months. Overall.Survival.Status
## 1                  4.237845              1:DECEASED
## 2                 10.578187              1:DECEASED
## 3                 15.703022              1:DECEASED
## 4                  5.091984              1:DECEASED
## 5                  2.595269              1:DECEASED
## 6                  2.168200              1:DECEASED
##                       Other.Patient.ID                      Other.Sample.ID
## 1 d42636dd-728c-45a4-b953-db165a30c761 331b4168-af74-44c4-9c2c-235b295a32aa
## 2 0c38f553-6892-4f67-b306-40503b8842b5 c6cb701b-98e7-479b-858d-48725a02f55c
## 3 e1bd6dfa-0567-46e7-991c-63eefaf5bd7a 0d7a2e99-e3c0-4b06-8109-24f3b7b5cdcb
## 4 f5287051-d58d-4243-8a07-5fdfb5afa301 b8952cfa-68f7-42da-a6fb-aac4357e0813
## 5 206f20bb-c212-4a26-967a-5b3700f322be 68de0994-c58e-4042-9b8d-863f798171b3
## 6 b137313e-0200-4fb7-a151-3c2c76170dea 4c8f9392-2b64-4ef1-a6f2-db31e3f2b02b
##   Primary.Diagnosis Primary.Tumor.Site Patient.Primary.Tumor.Site
## 1      Glioblastoma              Brain                      Brain
## 2      Glioblastoma              Brain                      Brain
## 3      Glioblastoma              Brain                      Brain
## 4      Glioblastoma              Brain                      Brain
## 5      Glioblastoma              Brain                      Brain
## 6      Glioblastoma              Brain                      Brain
##   Progression.or.Recurrence Project.Identifier
## 1                     False            CPTAC-3
## 2                      <NA>            CPTAC-3
## 3                      True            CPTAC-3
## 4                     False            CPTAC-3
## 5                      <NA>            CPTAC-3
## 6                      <NA>            CPTAC-3
##                                                 Project.Name Project.State
## 1 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
## 2 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
## 3 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
## 4 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
## 5 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
## 6 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus      released
##   Race.Category Residual.Disease Number.of.Samples.Per.Patient   Sample.Type
## 1         WHITE             <NA>                             1 Primary Tumor
## 2         WHITE             <NA>                             1 Primary Tumor
## 3         WHITE             <NA>                             1 Primary Tumor
## 4         WHITE             <NA>                             1 Primary Tumor
## 5         OTHER             <NA>                             1 Primary Tumor
## 6         OTHER             <NA>                             1 Primary Tumor
##   Exposure.to.secondhand.smoke.as.a.child    Sex Years.Smoked
## 1                                    <NA>   Male           35
## 2                                    <NA> Female           30
## 3                                    <NA>   Male           NA
## 4                                    <NA> Female           36
## 5                                    True   Male           46
## 6                                    True Female           NA
##   Smoking.Onset..year. Person.Cigarette.Smoking.History.Pack.Year.Value
## 1                 1971                                             35.0
## 2                 1980                                             30.0
## 3                   NA                                               NA
## 4                 1986                                             29.0
## 5                 1976                                             61.5
## 6                   NA                                               NA
##   Smoking.Quit..year.                            Smoking.Status Tumor.Focality
## 1                2006 Current Reformed Smoker for < or = 15 yrs             NA
## 2                2010 Current Reformed Smoker for < or = 15 yrs             NA
## 3                  NA            Smoking history not documented             NA
## 4                  NA                            Current Smoker             NA
## 5                  NA                            Current Smoker             NA
## 6                  NA                       Lifelong Non-Smoker             NA
##   Tumor.Grade Tumor.s.largest.diameter.or.dimension Type.of.smoking.exposure
## 1          NA                                   1.0                     <NA>
## 2          NA                                   1.2                     <NA>
## 3          NA                                   4.9                     <NA>
## 4          NA                                   4.5                     <NA>
## 5          NA                                    NA      Smoke exposure, NOS
## 6          NA                                    NA      Smoke exposure, NOS
##   Patient.s.Vital.Status Weight..kgs. Year.of.Death Year.of.Diagnosis
## 1                   Dead       115.00          2016              2016
## 2                   Dead        54.10          2017              2016
## 3                   Dead       102.20          2018              2016
## 4                   Dead        52.27          2017              2016
## 5                   Dead        70.00          2017              2017
## 6                   Dead        60.00          2017              2017
# List every column name of the raw clinical table.
names(raw)
##  [1] "Study.ID"                                        
##  [2] "Patient.ID"                                      
##  [3] "Sample.ID"                                       
##  [4] "Diagnosis.Age"                                   
##  [5] "AIDS.risk.factors"                               
##  [6] "Alcohol.History.Documented"                      
##  [7] "Alcohol.Intensity"                               
##  [8] "Biopsy.Site"                                     
##  [9] "Body.Mass.Index"                                 
## [10] "Cancer.Type"                                     
## [11] "Cancer.Type.Detailed"                            
## [12] "Cause.of.Death"                                  
## [13] "Comorbidity"                                     
## [14] "Consent.Type"                                    
## [15] "Disease.Free..Months."                           
## [16] "Disease.Free.Status"                             
## [17] "Diabetes.Treatment.Type"                         
## [18] "Disease.Response"                                
## [19] "Disease.Status"                                  
## [20] "Disease.Type"                                    
## [21] "ECOG.Performance.Status"                         
## [22] "Ethnicity.Category"                              
## [23] "Height..cms."                                    
## [24] "Index.Date"                                      
## [25] "KPS"                                             
## [26] "Number.of.positive.lymph.nodes"                  
## [27] "Morphology"                                      
## [28] "Mutation.Count"                                  
## [29] "Oncotree.Code"                                   
## [30] "Overall.Survival..Months."                       
## [31] "Overall.Survival.Status"                         
## [32] "Other.Patient.ID"                                
## [33] "Other.Sample.ID"                                 
## [34] "Primary.Diagnosis"                               
## [35] "Primary.Tumor.Site"                              
## [36] "Patient.Primary.Tumor.Site"                      
## [37] "Progression.or.Recurrence"                       
## [38] "Project.Identifier"                              
## [39] "Project.Name"                                    
## [40] "Project.State"                                   
## [41] "Race.Category"                                   
## [42] "Residual.Disease"                                
## [43] "Number.of.Samples.Per.Patient"                   
## [44] "Sample.Type"                                     
## [45] "Exposure.to.secondhand.smoke.as.a.child"         
## [46] "Sex"                                             
## [47] "Years.Smoked"                                    
## [48] "Smoking.Onset..year."                            
## [49] "Person.Cigarette.Smoking.History.Pack.Year.Value"
## [50] "Smoking.Quit..year."                             
## [51] "Smoking.Status"                                  
## [52] "Tumor.Focality"                                  
## [53] "Tumor.Grade"                                     
## [54] "Tumor.s.largest.diameter.or.dimension"           
## [55] "Type.of.smoking.exposure"                        
## [56] "Patient.s.Vital.Status"                          
## [57] "Weight..kgs."                                    
## [58] "Year.of.Death"                                   
## [59] "Year.of.Diagnosis"
# Number of rows and columns in the raw clinical table.
c(nrow(raw), ncol(raw))
## [1] 74 59

Data Analysis

# data formatting: keep only the three analysis columns and print the result
analysis_cols <- c("Mutation.Count", "Overall.Survival..Months.", "Biopsy.Site")
a <- raw[, analysis_cols]
a
##    Mutation.Count Overall.Survival..Months.    Biopsy.Site
## 1              59                4.23784494   Frontal lobe
## 2              62               10.57818660  Parietal lobe
## 3              38               15.70302234   Frontal lobe
## 4            1190                5.09198423   Frontal lobe
## 5              32                2.59526938  Parietal lobe
## 6              58                2.16819974  Temporal lobe
## 7              67               26.47831800  Temporal lobe
## 8              47               21.87910644     Brain, NOS
## 9              39               32.32588699  Temporal lobe
## 10             68               21.22207622   Frontal lobe
## 11             17               16.32720105   Frontal lobe
## 12             47               55.94612352  Temporal lobe
## 13             56               33.83705650  Temporal lobe
## 14             40               19.41524310   Frontal lobe
## 15             53               28.67936925 Occipital lobe
## 16            122                5.91327201  Temporal lobe
## 17             50               27.20105125   Frontal lobe
## 18             53               34.59264126 Occipital lobe
## 19             22               12.41787122  Temporal lobe
## 20             66               12.51642576  Temporal lobe
## 21             37                1.41261498  Temporal lobe
## 22             46                9.23127464  Parietal lobe
## 23             66               13.23915900  Parietal lobe
## 24             47                2.85808147     Brain, NOS
## 25             29                0.22996058  Temporal lobe
## 26             65               23.68593955  Parietal lobe
## 27             54                4.17214192   Frontal lobe
## 28             62               13.17345598   Frontal lobe
## 29             75               45.79500657     Brain, NOS
## 30             58                9.00131406  Parietal lobe
## 31             40               36.36662286  Temporal lobe
## 32             55                3.81077530  Temporal lobe
## 33             28                0.03285151   Frontal lobe
## 34             48               52.06964520   Frontal lobe
## 35             65               39.71747700   Frontal lobe
## 36             49               21.45203679   Frontal lobe
## 37             47                0.22996058 Occipital lobe
## 38             17               64.78318003  Temporal lobe
## 39             49                0.19710907   Frontal lobe
## 40             37               11.46517740  Temporal lobe
## 41             38                8.04862024   Frontal lobe
## 42             43               19.90801577  Temporal lobe
## 43             63                6.30749014  Parietal lobe
## 44             74                0.62417871   Frontal lobe
## 45             39               36.30091984   Frontal lobe
## 46             61               26.44546649  Temporal lobe
## 47             45                3.97503285     Brain, NOS
## 48             12               27.62812089  Temporal lobe
## 49             58                8.86990802  Temporal lobe
## 50             32               11.92509855     Brain, NOS
## 51             53                0.09855453   Frontal lobe
## 52             55               12.54927727 Occipital lobe
## 53             48               12.74638633  Temporal lobe
## 54             31               11.16951380  Temporal lobe
## 55             48               12.68068331   Frontal lobe
## 56             55                8.67279895   Frontal lobe
## 57             42               13.50197109  Temporal lobe
## 58             48                8.70565046 Occipital lobe
## 59             53                8.63994744  Temporal lobe
## 60             36               36.62943495   Frontal lobe
## 61             50               43.98817346  Parietal lobe
## 62             11               46.51773982     Brain, NOS
## 63             47               17.93692510     Brain, NOS
## 64             22               34.39553219     Brain, NOS
## 65             53               27.75952694  Temporal lobe
## 66             60               27.75952694  Temporal lobe
## 67             66               10.90670171  Temporal lobe
## 68             10                4.66491459 Occipital lobe
## 69             23               16.78712221     Brain, NOS
## 70            145               25.06570302     Brain, NOS
## 71             29               12.81208936     Brain, NOS
## 72             56               21.22207622     Brain, NOS
## 73             40               33.57424442     Brain, NOS
## 74             29                9.75689882     Brain, NOS
# Coerce column types: biopsy site becomes a factor (the grouping variable),
# and the two measurements become numeric.
a[["Biopsy.Site"]] <- as.factor(a[["Biopsy.Site"]])
numeric_cols <- c("Overall.Survival..Months.", "Mutation.Count")
a[numeric_cols] <- lapply(a[numeric_cols], as.numeric)

# Per-column summary of the analysis data frame.
summary(a)
##  Mutation.Count    Overall.Survival..Months.         Biopsy.Site
##  Min.   :  10.00   Min.   : 0.03285          Brain, NOS    :14  
##  1st Qu.:  38.00   1st Qu.: 8.19645          Frontal lobe  :21  
##  Median :  48.00   Median :12.99277          Occipital lobe: 6  
##  Mean   :  63.99   Mean   :18.25168          Parietal lobe : 8  
##  3rd Qu.:  58.00   3rd Qu.:27.52135          Temporal lobe :25  
##  Max.   :1190.00   Max.   :64.78318
# Number of rows and columns in the analysis data frame.
c(nrow(a), ncol(a))
## [1] 74  3
# Box plot of overall survival (months) for each biopsy site.
# TODO: not many data points, so change to just the points and not the box.
survival_by_site <- ggplot(
  data = a,
  mapping = aes(x = Biopsy.Site, y = Overall.Survival..Months.)
) +
  geom_boxplot(color = "black", fill = "grey") +
  theme_minimal() +
  # Tilt the site labels so they do not overlap.
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  ggtitle("Biopsy Site vs. Overall.Survival..Months.") +
  xlab("Biopsy Site") +
  ylab("Overall.Survival..Months.")
survival_by_site

# Box plot of mutation count for each biopsy site.
mutations_by_site <- ggplot(
  data = a,
  mapping = aes(x = Biopsy.Site, y = Mutation.Count)
) +
  geom_boxplot(color = "black", fill = "grey") +
  theme_minimal() +
  # Tilt the site labels so they do not overlap.
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  ggtitle("Biopsy Site vs. Mutation Count") +
  xlab("Biopsy Site") +
  ylab("Mutation Count")
mutations_by_site

There seem to be outliers affecting the distribution of mutation count. I removed them using the 1.5 × IQR rule and then drew a scatter plot for a better representation of the data.

# Remove Mutation.Count outliers with the 1.5 * IQR rule.
# Find Q1, Q3, and the interquartile range of Mutation.Count; missing values
# are ignored so that an NA in the column does not abort the calculation.
Q1 <- quantile(a$Mutation.Count, 0.25, na.rm = TRUE)
Q3 <- quantile(a$Mutation.Count, 0.75, na.rm = TRUE)
# Stored as mutation_iqr so the variable does not reuse the name of the
# stats::IQR() function.
mutation_iqr <- IQR(a$Mutation.Count, na.rm = TRUE)

# Only keep rows whose Mutation.Count lies strictly between the two fences,
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. subset() also drops rows where the
# condition is NA, i.e. rows with a missing Mutation.Count.
lower_fence <- Q1 - 1.5 * mutation_iqr
upper_fence <- Q3 + 1.5 * mutation_iqr
b <- subset(a, Mutation.Count > lower_fence & Mutation.Count < upper_fence)

# View summary statistics of the filtered data frame.
summary(b)
##  Mutation.Count  Overall.Survival..Months.         Biopsy.Site
##  Min.   :10.00   Min.   : 0.03285          Brain, NOS    :13  
##  1st Qu.:37.50   1st Qu.: 8.65637          Frontal lobe  :20  
##  Median :48.00   Median :13.17346          Occipital lobe: 6  
##  Mean   :46.17   Mean   :18.51483          Parietal lobe : 8  
##  3rd Qu.:57.00   3rd Qu.:27.69382          Temporal lobe :24  
##  Max.   :75.00   Max.   :64.78318
# Number of rows and columns left after outlier removal.
c(nrow(b), ncol(b))
## [1] 71  3
# Scatter plot of overall survival against mutation count, with points
# coloured by biopsy site.
mutation_survival_scatter <- ggplot(
  data = b,
  mapping = aes(
    x = Mutation.Count,
    y = Overall.Survival..Months.,
    color = Biopsy.Site
  )
) +
  geom_point() +
  labs(
    title = "Mutations and Overall Survival",
    x = "Mutations",
    y = "Overall Survival Months"
  ) +
  theme_bw()
mutation_survival_scatter

Test normality

# Split the data into mutations and overall survival.
# mc: mutation count with its biopsy site (columns 1 and 3 of b), printed.
mc <- b[, c("Mutation.Count", "Biopsy.Site")]
mc
##    Mutation.Count    Biopsy.Site
## 1              59   Frontal lobe
## 2              62  Parietal lobe
## 3              38   Frontal lobe
## 5              32  Parietal lobe
## 6              58  Temporal lobe
## 7              67  Temporal lobe
## 8              47     Brain, NOS
## 9              39  Temporal lobe
## 10             68   Frontal lobe
## 11             17   Frontal lobe
## 12             47  Temporal lobe
## 13             56  Temporal lobe
## 14             40   Frontal lobe
## 15             53 Occipital lobe
## 17             50   Frontal lobe
## 18             53 Occipital lobe
## 19             22  Temporal lobe
## 20             66  Temporal lobe
## 21             37  Temporal lobe
## 22             46  Parietal lobe
## 23             66  Parietal lobe
## 24             47     Brain, NOS
## 25             29  Temporal lobe
## 26             65  Parietal lobe
## 27             54   Frontal lobe
## 28             62   Frontal lobe
## 29             75     Brain, NOS
## 30             58  Parietal lobe
## 31             40  Temporal lobe
## 32             55  Temporal lobe
## 33             28   Frontal lobe
## 34             48   Frontal lobe
## 35             65   Frontal lobe
## 36             49   Frontal lobe
## 37             47 Occipital lobe
## 38             17  Temporal lobe
## 39             49   Frontal lobe
## 40             37  Temporal lobe
## 41             38   Frontal lobe
## 42             43  Temporal lobe
## 43             63  Parietal lobe
## 44             74   Frontal lobe
## 45             39   Frontal lobe
## 46             61  Temporal lobe
## 47             45     Brain, NOS
## 48             12  Temporal lobe
## 49             58  Temporal lobe
## 50             32     Brain, NOS
## 51             53   Frontal lobe
## 52             55 Occipital lobe
## 53             48  Temporal lobe
## 54             31  Temporal lobe
## 55             48   Frontal lobe
## 56             55   Frontal lobe
## 57             42  Temporal lobe
## 58             48 Occipital lobe
## 59             53  Temporal lobe
## 60             36   Frontal lobe
## 61             50  Parietal lobe
## 62             11     Brain, NOS
## 63             47     Brain, NOS
## 64             22     Brain, NOS
## 65             53  Temporal lobe
## 66             60  Temporal lobe
## 67             66  Temporal lobe
## 68             10 Occipital lobe
## 69             23     Brain, NOS
## 71             29     Brain, NOS
## 72             56     Brain, NOS
## 73             40     Brain, NOS
## 74             29     Brain, NOS
# Shapiro-Wilk normality tests on mutation count, one biopsy-site group at a
# time. First split mc into one data frame per site.
bn <- mc[mc$Biopsy.Site %in% "Brain, NOS", ]
fl <- mc[mc$Biopsy.Site %in% "Frontal lobe", ]
ol <- mc[mc$Biopsy.Site %in% "Occipital lobe", ]
pl <- mc[mc$Biopsy.Site %in% "Parietal lobe", ]
tl <- mc[mc$Biopsy.Site %in% "Temporal lobe", ]
# Column 1 of each group holds the mutation counts.
shapiro.test(bn[, 1])
## 
##  Shapiro-Wilk normality test
## 
## data:  bn[, 1]
## W = 0.96299, p-value = 0.799
# Shapiro-Wilk normality test: mutation count (column 1 of fl), frontal lobe group
shapiro.test(fl[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  fl[, 1]
## W = 0.9818, p-value = 0.9552
# Shapiro-Wilk normality test: mutation count (column 1 of ol), occipital lobe group
shapiro.test(ol[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  ol[, 1]
## W = 0.65923, p-value = 0.002234
# Shapiro-Wilk normality test: mutation count (column 1 of pl), parietal lobe group
shapiro.test(pl[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  pl[, 1]
## W = 0.86536, p-value = 0.1357
# Shapiro-Wilk normality test: mutation count (column 1 of tl), temporal lobe group
shapiro.test(tl[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  tl[, 1]
## W = 0.94849, p-value = 0.2512
# If not normal, use non-parametric tests.
# os: overall survival with its biopsy site (columns 2 and 3 of b), printed.
os <- b[, c("Overall.Survival..Months.", "Biopsy.Site")]
os
##    Overall.Survival..Months.    Biopsy.Site
## 1                 4.23784494   Frontal lobe
## 2                10.57818660  Parietal lobe
## 3                15.70302234   Frontal lobe
## 5                 2.59526938  Parietal lobe
## 6                 2.16819974  Temporal lobe
## 7                26.47831800  Temporal lobe
## 8                21.87910644     Brain, NOS
## 9                32.32588699  Temporal lobe
## 10               21.22207622   Frontal lobe
## 11               16.32720105   Frontal lobe
## 12               55.94612352  Temporal lobe
## 13               33.83705650  Temporal lobe
## 14               19.41524310   Frontal lobe
## 15               28.67936925 Occipital lobe
## 17               27.20105125   Frontal lobe
## 18               34.59264126 Occipital lobe
## 19               12.41787122  Temporal lobe
## 20               12.51642576  Temporal lobe
## 21                1.41261498  Temporal lobe
## 22                9.23127464  Parietal lobe
## 23               13.23915900  Parietal lobe
## 24                2.85808147     Brain, NOS
## 25                0.22996058  Temporal lobe
## 26               23.68593955  Parietal lobe
## 27                4.17214192   Frontal lobe
## 28               13.17345598   Frontal lobe
## 29               45.79500657     Brain, NOS
## 30                9.00131406  Parietal lobe
## 31               36.36662286  Temporal lobe
## 32                3.81077530  Temporal lobe
## 33                0.03285151   Frontal lobe
## 34               52.06964520   Frontal lobe
## 35               39.71747700   Frontal lobe
## 36               21.45203679   Frontal lobe
## 37                0.22996058 Occipital lobe
## 38               64.78318003  Temporal lobe
## 39                0.19710907   Frontal lobe
## 40               11.46517740  Temporal lobe
## 41                8.04862024   Frontal lobe
## 42               19.90801577  Temporal lobe
## 43                6.30749014  Parietal lobe
## 44                0.62417871   Frontal lobe
## 45               36.30091984   Frontal lobe
## 46               26.44546649  Temporal lobe
## 47                3.97503285     Brain, NOS
## 48               27.62812089  Temporal lobe
## 49                8.86990802  Temporal lobe
## 50               11.92509855     Brain, NOS
## 51                0.09855453   Frontal lobe
## 52               12.54927727 Occipital lobe
## 53               12.74638633  Temporal lobe
## 54               11.16951380  Temporal lobe
## 55               12.68068331   Frontal lobe
## 56                8.67279895   Frontal lobe
## 57               13.50197109  Temporal lobe
## 58                8.70565046 Occipital lobe
## 59                8.63994744  Temporal lobe
## 60               36.62943495   Frontal lobe
## 61               43.98817346  Parietal lobe
## 62               46.51773982     Brain, NOS
## 63               17.93692510     Brain, NOS
## 64               34.39553219     Brain, NOS
## 65               27.75952694  Temporal lobe
## 66               27.75952694  Temporal lobe
## 67               10.90670171  Temporal lobe
## 68                4.66491459 Occipital lobe
## 69               16.78712221     Brain, NOS
## 71               12.81208936     Brain, NOS
## 72               21.22207622     Brain, NOS
## 73               33.57424442     Brain, NOS
## 74                9.75689882     Brain, NOS
# Shapiro-Wilk normality tests on overall survival, one biopsy-site group at a
# time. First split os into one data frame per site.
bn2 <- os[os$Biopsy.Site %in% "Brain, NOS", ]
fl2 <- os[os$Biopsy.Site %in% "Frontal lobe", ]
ol2 <- os[os$Biopsy.Site %in% "Occipital lobe", ]
pl2 <- os[os$Biopsy.Site %in% "Parietal lobe", ]
tl2 <- os[os$Biopsy.Site %in% "Temporal lobe", ]
# Column 1 of each group holds the overall survival in months.
shapiro.test(bn2[, 1])
## 
##  Shapiro-Wilk normality test
## 
## data:  bn2[, 1]
## W = 0.91991, p-value = 0.2502
# Shapiro-Wilk normality test: overall survival (column 1 of fl2), frontal lobe group
shapiro.test(fl2[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  fl2[, 1]
## W = 0.91402, p-value = 0.07607
# Shapiro-Wilk normality test: overall survival (column 1 of ol2), occipital lobe group
shapiro.test(ol2[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  ol2[, 1]
## W = 0.90401, p-value = 0.3982
# Shapiro-Wilk normality test: overall survival (column 1 of pl2), parietal lobe group
shapiro.test(pl2[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  pl2[, 1]
## W = 0.79512, p-value = 0.02538
# Shapiro-Wilk normality test: overall survival (column 1 of tl2), temporal lobe group
shapiro.test(tl2[,1])
## 
##  Shapiro-Wilk normality test
## 
## data:  tl2[, 1]
## W = 0.8895, p-value = 0.01298

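The data do not follow a normal distribution in every group: the Shapiro-Wilk test rejects normality (p < 0.05) for mutation count in the occipital lobe group and for overall survival in the parietal and temporal lobe groups. Because the normality assumption is not met for all groups, I use a non-parametric test.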

Kruskal Wallis

Perform the Kruskal Wallis test to compare the median values.

# Kruskal-Wallis rank sum test: overall survival (months) compared across biopsy sites in b
kruskal.test(Overall.Survival..Months. ~ Biopsy.Site, data = b)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Overall.Survival..Months. by Biopsy.Site
## Kruskal-Wallis chi-squared = 2.2114, df = 4, p-value = 0.6969
# Kruskal-Wallis rank sum test: mutation count compared across biopsy sites in b
kruskal.test(Mutation.Count ~ Biopsy.Site, data = b)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Mutation.Count by Biopsy.Site
## Kruskal-Wallis chi-squared = 7.2197, df = 4, p-value = 0.1247

Both p-values (0.6969 for overall survival and 0.1247 for mutation count) are greater than alpha (0.05), so I failed to reject H0 for both overall survival and mutation count across the biopsy groups.

Results

Below are the plotted results of my analysis.

#install.packages("ggpubr")
library(ggpubr)
# Standalone call: builds the comparison layer without adding it to a plot,
# so it only prints the layer's settings.
stat_compare_means(
  label = NULL, label.x = NULL, label.y = NULL,
  hide.ns = FALSE, comparisons = NULL, mapping = NULL
)
## geom_text: na.rm = FALSE
## stat_compare_means: label.x.npc = left, label.y.npc = top, label.x = NULL, label.y = NULL, label.sep = , , method = NULL, method.args = list(), paired = FALSE, ref.group = NULL, symnum.args = list(), hide.ns = FALSE, na.rm = FALSE
## position_identity
#compare_means(Overall.Survival..Months. ~ Biopsy.Site,  data = b, method = "anova")
# Box plot with jittered points of overall survival per biopsy site.
survival_box <- ggboxplot(
  data = b,
  x = "Biopsy.Site",
  y = "Overall.Survival..Months.",
  add = "jitter",
  palette = "jco",
  color = "Biopsy.Site"
)
# Default method = "kruskal.test" for multiple groups
survival_box + stat_compare_means()

# Same plot for mutation count vs. biopsy site: box plot with jittered points,
# with the stat_compare_means() layer added on top.
mutation_box <- ggboxplot(
  data = b,
  x = "Biopsy.Site",
  y = "Mutation.Count",
  add = "jitter",
  palette = "jco",
  color = "Biopsy.Site"
)
mutation_box + stat_compare_means()

Conclusions

My analysis did not show a statistically significant difference in overall survival or mutation count across the biopsy sites in this cohort. I chose the Kruskal-Wallis test because the data did not meet the normality assumption of the ANOVA test. In conclusion, I failed to reject null hypotheses 1 and 2.

Resources

https://www.cbioportal.org/study/summary?id=brain_cptac_gdc

https://www.statology.org/kruskal-wallis-test/