#library(arsenal) playing with descriptive stats tables
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.3
## Loading required package: usethis
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(survival)
library(survminer)
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.3.3
##
## Attaching package: 'survminer'
##
## The following object is masked from 'package:survival':
##
## myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.3.3
##
## Attaching package: 'ggsurvfit'
##
## The following object is masked from 'package:psych':
##
## %+%
This project uses CNS/Brain Cancer data from the CPTAC CNS/Brain Cancer GDC study, generated in July 2024, to understand clinical predictors of overall survival. The first null hypothesis is that there is no difference in median overall survival time across the five biopsy-site groups; any observed difference would be due to chance or sampling error. The second null hypothesis is that there is no difference in median mutation count across the five biopsy-site groups; again, any observed difference would be due to chance or sampling error. I set alpha to 0.05. The Kruskal-Wallis test is a statistical test that determines whether the medians of independent groups differ significantly. It is the nonparametric equivalent of the one-way ANOVA test.
# Read the tab-delimited clinical export (with a header row) into `raw`.
raw <- read.delim(
  file = "C:/Users/maddoxp/OneDrive - Children's Hospital of Philadelphia/Documents/Biostatistics Class/brain_cptac_gdc_clinical_data.tsv",
  sep = "\t",
  header = TRUE
)
#write.csv(raw, "final_project_p1.csv")
# Preview the first six rows to sanity-check the import.
head(raw, n = 6L)
## Study.ID Patient.ID Sample.ID Diagnosis.Age AIDS.risk.factors
## 1 brain_cptac_gdc C3L-00104 C3L-00104-01 58 <NA>
## 2 brain_cptac_gdc C3L-00365 C3L-00365-01 59 <NA>
## 3 brain_cptac_gdc C3L-00674 C3L-00674-71 45 <NA>
## 4 brain_cptac_gdc C3L-00677 C3L-00677-72 69 <NA>
## 5 brain_cptac_gdc C3L-01043 C3L-01043-01 59 <NA>
## 6 brain_cptac_gdc C3L-01045 C3L-01045-03 73 <NA>
## Alcohol.History.Documented Alcohol.Intensity Biopsy.Site Body.Mass.Index
## 1 True Occasional Drinker Frontal lobe 32.54
## 2 True Occasional Drinker Parietal lobe 20.61
## 3 <NA> <NA> Frontal lobe 27.44
## 4 <NA> <NA> Frontal lobe 19.32
## 5 False Lifelong Non-Drinker Parietal lobe 24.22
## 6 False Lifelong Non-Drinker Temporal lobe 22.04
## Cancer.Type Cancer.Type.Detailed Cause.of.Death Comorbidity Consent.Type
## 1 CNS/Brain Cancer CNS/Brain <NA> <NA> NA
## 2 CNS/Brain Cancer CNS/Brain <NA> Depression NA
## 3 CNS/Brain Cancer CNS/Brain Cancer Related <NA> NA
## 4 CNS/Brain Cancer CNS/Brain Cancer Related <NA> NA
## 5 CNS/Brain Cancer CNS/Brain Cancer Related <NA> NA
## 6 CNS/Brain Cancer CNS/Brain Infection <NA> NA
## Disease.Free..Months. Disease.Free.Status Diabetes.Treatment.Type
## 1 3.318003 0:DiseaseFree NA
## 2 10.578187 <NA> NA
## 3 8.048620 1:Recurred/Progressed NA
## 4 5.091984 0:DiseaseFree NA
## 5 2.595269 <NA> NA
## 6 2.168200 <NA> NA
## Disease.Response Disease.Status Disease.Type ECOG.Performance.Status
## 1 <NA> With tumor Gliomas NA
## 2 <NA> Unknown tumor status Gliomas NA
## 3 <NA> Unknown tumor status Gliomas NA
## 4 <NA> Unknown tumor status Gliomas 1
## 5 <NA> With tumor Gliomas NA
## 6 <NA> With tumor Gliomas NA
## Ethnicity.Category Height..cms. Index.Date KPS
## 1 NOT HISPANIC OR LATINO 188.00 Diagnosis NA
## 2 NOT HISPANIC OR LATINO 162.00 Diagnosis NA
## 3 NOT HISPANIC OR LATINO 193.00 Diagnosis NA
## 4 NOT HISPANIC OR LATINO 164.47 Diagnosis 70
## 5 <NA> 170.00 Diagnosis 0
## 6 <NA> 165.00 Diagnosis 0
## Number.of.positive.lymph.nodes Morphology Mutation.Count Oncotree.Code
## 1 NA 9440/3 59 BRAIN
## 2 NA 9440/3 62 BRAIN
## 3 NA 9440/3 38 BRAIN
## 4 NA 9440/3 1190 BRAIN
## 5 NA 9440/3 32 BRAIN
## 6 NA 9440/3 58 BRAIN
## Overall.Survival..Months. Overall.Survival.Status
## 1 4.237845 1:DECEASED
## 2 10.578187 1:DECEASED
## 3 15.703022 1:DECEASED
## 4 5.091984 1:DECEASED
## 5 2.595269 1:DECEASED
## 6 2.168200 1:DECEASED
## Other.Patient.ID Other.Sample.ID
## 1 d42636dd-728c-45a4-b953-db165a30c761 331b4168-af74-44c4-9c2c-235b295a32aa
## 2 0c38f553-6892-4f67-b306-40503b8842b5 c6cb701b-98e7-479b-858d-48725a02f55c
## 3 e1bd6dfa-0567-46e7-991c-63eefaf5bd7a 0d7a2e99-e3c0-4b06-8109-24f3b7b5cdcb
## 4 f5287051-d58d-4243-8a07-5fdfb5afa301 b8952cfa-68f7-42da-a6fb-aac4357e0813
## 5 206f20bb-c212-4a26-967a-5b3700f322be 68de0994-c58e-4042-9b8d-863f798171b3
## 6 b137313e-0200-4fb7-a151-3c2c76170dea 4c8f9392-2b64-4ef1-a6f2-db31e3f2b02b
## Primary.Diagnosis Primary.Tumor.Site Patient.Primary.Tumor.Site
## 1 Glioblastoma Brain Brain
## 2 Glioblastoma Brain Brain
## 3 Glioblastoma Brain Brain
## 4 Glioblastoma Brain Brain
## 5 Glioblastoma Brain Brain
## 6 Glioblastoma Brain Brain
## Progression.or.Recurrence Project.Identifier
## 1 False CPTAC-3
## 2 <NA> CPTAC-3
## 3 True CPTAC-3
## 4 False CPTAC-3
## 5 <NA> CPTAC-3
## 6 <NA> CPTAC-3
## Project.Name Project.State
## 1 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## 2 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## 3 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## 4 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## 5 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## 6 CPTAC-Brain, Head and Neck, Kidney, Lung, Pancreas, Uterus released
## Race.Category Residual.Disease Number.of.Samples.Per.Patient Sample.Type
## 1 WHITE <NA> 1 Primary Tumor
## 2 WHITE <NA> 1 Primary Tumor
## 3 WHITE <NA> 1 Primary Tumor
## 4 WHITE <NA> 1 Primary Tumor
## 5 OTHER <NA> 1 Primary Tumor
## 6 OTHER <NA> 1 Primary Tumor
## Exposure.to.secondhand.smoke.as.a.child Sex Years.Smoked
## 1 <NA> Male 35
## 2 <NA> Female 30
## 3 <NA> Male NA
## 4 <NA> Female 36
## 5 True Male 46
## 6 True Female NA
## Smoking.Onset..year. Person.Cigarette.Smoking.History.Pack.Year.Value
## 1 1971 35.0
## 2 1980 30.0
## 3 NA NA
## 4 1986 29.0
## 5 1976 61.5
## 6 NA NA
## Smoking.Quit..year. Smoking.Status Tumor.Focality
## 1 2006 Current Reformed Smoker for < or = 15 yrs NA
## 2 2010 Current Reformed Smoker for < or = 15 yrs NA
## 3 NA Smoking history not documented NA
## 4 NA Current Smoker NA
## 5 NA Current Smoker NA
## 6 NA Lifelong Non-Smoker NA
## Tumor.Grade Tumor.s.largest.diameter.or.dimension Type.of.smoking.exposure
## 1 NA 1.0 <NA>
## 2 NA 1.2 <NA>
## 3 NA 4.9 <NA>
## 4 NA 4.5 <NA>
## 5 NA NA Smoke exposure, NOS
## 6 NA NA Smoke exposure, NOS
## Patient.s.Vital.Status Weight..kgs. Year.of.Death Year.of.Diagnosis
## 1 Dead 115.00 2016 2016
## 2 Dead 54.10 2017 2016
## 3 Dead 102.20 2018 2016
## 4 Dead 52.27 2017 2016
## 5 Dead 70.00 2017 2017
## 6 Dead 60.00 2017 2017
# List every column name available in the clinical table.
names(raw)
## [1] "Study.ID"
## [2] "Patient.ID"
## [3] "Sample.ID"
## [4] "Diagnosis.Age"
## [5] "AIDS.risk.factors"
## [6] "Alcohol.History.Documented"
## [7] "Alcohol.Intensity"
## [8] "Biopsy.Site"
## [9] "Body.Mass.Index"
## [10] "Cancer.Type"
## [11] "Cancer.Type.Detailed"
## [12] "Cause.of.Death"
## [13] "Comorbidity"
## [14] "Consent.Type"
## [15] "Disease.Free..Months."
## [16] "Disease.Free.Status"
## [17] "Diabetes.Treatment.Type"
## [18] "Disease.Response"
## [19] "Disease.Status"
## [20] "Disease.Type"
## [21] "ECOG.Performance.Status"
## [22] "Ethnicity.Category"
## [23] "Height..cms."
## [24] "Index.Date"
## [25] "KPS"
## [26] "Number.of.positive.lymph.nodes"
## [27] "Morphology"
## [28] "Mutation.Count"
## [29] "Oncotree.Code"
## [30] "Overall.Survival..Months."
## [31] "Overall.Survival.Status"
## [32] "Other.Patient.ID"
## [33] "Other.Sample.ID"
## [34] "Primary.Diagnosis"
## [35] "Primary.Tumor.Site"
## [36] "Patient.Primary.Tumor.Site"
## [37] "Progression.or.Recurrence"
## [38] "Project.Identifier"
## [39] "Project.Name"
## [40] "Project.State"
## [41] "Race.Category"
## [42] "Residual.Disease"
## [43] "Number.of.Samples.Per.Patient"
## [44] "Sample.Type"
## [45] "Exposure.to.secondhand.smoke.as.a.child"
## [46] "Sex"
## [47] "Years.Smoked"
## [48] "Smoking.Onset..year."
## [49] "Person.Cigarette.Smoking.History.Pack.Year.Value"
## [50] "Smoking.Quit..year."
## [51] "Smoking.Status"
## [52] "Tumor.Focality"
## [53] "Tumor.Grade"
## [54] "Tumor.s.largest.diameter.or.dimension"
## [55] "Type.of.smoking.exposure"
## [56] "Patient.s.Vital.Status"
## [57] "Weight..kgs."
## [58] "Year.of.Death"
## [59] "Year.of.Diagnosis"
# Row count followed by column count of the imported table.
c(nrow(raw), ncol(raw))
## [1] 74 59
# Data formatting: keep only the two outcome columns and the grouping column.
# The outer parentheses print the selection as well as assigning it.
(a <- raw[c("Mutation.Count", "Overall.Survival..Months.", "Biopsy.Site")])
## Mutation.Count Overall.Survival..Months. Biopsy.Site
## 1 59 4.23784494 Frontal lobe
## 2 62 10.57818660 Parietal lobe
## 3 38 15.70302234 Frontal lobe
## 4 1190 5.09198423 Frontal lobe
## 5 32 2.59526938 Parietal lobe
## 6 58 2.16819974 Temporal lobe
## 7 67 26.47831800 Temporal lobe
## 8 47 21.87910644 Brain, NOS
## 9 39 32.32588699 Temporal lobe
## 10 68 21.22207622 Frontal lobe
## 11 17 16.32720105 Frontal lobe
## 12 47 55.94612352 Temporal lobe
## 13 56 33.83705650 Temporal lobe
## 14 40 19.41524310 Frontal lobe
## 15 53 28.67936925 Occipital lobe
## 16 122 5.91327201 Temporal lobe
## 17 50 27.20105125 Frontal lobe
## 18 53 34.59264126 Occipital lobe
## 19 22 12.41787122 Temporal lobe
## 20 66 12.51642576 Temporal lobe
## 21 37 1.41261498 Temporal lobe
## 22 46 9.23127464 Parietal lobe
## 23 66 13.23915900 Parietal lobe
## 24 47 2.85808147 Brain, NOS
## 25 29 0.22996058 Temporal lobe
## 26 65 23.68593955 Parietal lobe
## 27 54 4.17214192 Frontal lobe
## 28 62 13.17345598 Frontal lobe
## 29 75 45.79500657 Brain, NOS
## 30 58 9.00131406 Parietal lobe
## 31 40 36.36662286 Temporal lobe
## 32 55 3.81077530 Temporal lobe
## 33 28 0.03285151 Frontal lobe
## 34 48 52.06964520 Frontal lobe
## 35 65 39.71747700 Frontal lobe
## 36 49 21.45203679 Frontal lobe
## 37 47 0.22996058 Occipital lobe
## 38 17 64.78318003 Temporal lobe
## 39 49 0.19710907 Frontal lobe
## 40 37 11.46517740 Temporal lobe
## 41 38 8.04862024 Frontal lobe
## 42 43 19.90801577 Temporal lobe
## 43 63 6.30749014 Parietal lobe
## 44 74 0.62417871 Frontal lobe
## 45 39 36.30091984 Frontal lobe
## 46 61 26.44546649 Temporal lobe
## 47 45 3.97503285 Brain, NOS
## 48 12 27.62812089 Temporal lobe
## 49 58 8.86990802 Temporal lobe
## 50 32 11.92509855 Brain, NOS
## 51 53 0.09855453 Frontal lobe
## 52 55 12.54927727 Occipital lobe
## 53 48 12.74638633 Temporal lobe
## 54 31 11.16951380 Temporal lobe
## 55 48 12.68068331 Frontal lobe
## 56 55 8.67279895 Frontal lobe
## 57 42 13.50197109 Temporal lobe
## 58 48 8.70565046 Occipital lobe
## 59 53 8.63994744 Temporal lobe
## 60 36 36.62943495 Frontal lobe
## 61 50 43.98817346 Parietal lobe
## 62 11 46.51773982 Brain, NOS
## 63 47 17.93692510 Brain, NOS
## 64 22 34.39553219 Brain, NOS
## 65 53 27.75952694 Temporal lobe
## 66 60 27.75952694 Temporal lobe
## 67 66 10.90670171 Temporal lobe
## 68 10 4.66491459 Occipital lobe
## 69 23 16.78712221 Brain, NOS
## 70 145 25.06570302 Brain, NOS
## 71 29 12.81208936 Brain, NOS
## 72 56 21.22207622 Brain, NOS
## 73 40 33.57424442 Brain, NOS
## 74 29 9.75689882 Brain, NOS
# Coerce the columns: both outcomes become numeric, the grouping
# variable becomes a factor.
a[["Mutation.Count"]] <- as.numeric(a[["Mutation.Count"]])
a[["Overall.Survival..Months."]] <- as.numeric(a[["Overall.Survival..Months."]])
a[["Biopsy.Site"]] <- as.factor(a[["Biopsy.Site"]])
# Per-column summary: quartiles for the outcomes, counts per biopsy site.
summary(a)
## Mutation.Count Overall.Survival..Months. Biopsy.Site
## Min. : 10.00 Min. : 0.03285 Brain, NOS :14
## 1st Qu.: 38.00 1st Qu.: 8.19645 Frontal lobe :21
## Median : 48.00 Median :12.99277 Occipital lobe: 6
## Mean : 63.99 Mean :18.25168 Parietal lobe : 8
## 3rd Qu.: 58.00 3rd Qu.:27.52135 Temporal lobe :25
## Max. :1190.00 Max. :64.78318
# Row count followed by column count of the analysis data frame.
c(nrow(a), ncol(a))
## [1] 74 3
# Box plot of overall survival (months) for each biopsy site.
# Note: not many data points, so change to just the points and not the box.
ggplot(
  data = a,
  mapping = aes(x = Biopsy.Site, y = Overall.Survival..Months.)
) +
  geom_boxplot(fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Biopsy Site vs. Overall.Survival..Months.") +
  xlab("Biopsy Site") +
  ylab("Overall.Survival..Months.")

# Box plot of mutation count for each biopsy site.
ggplot(
  data = a,
  mapping = aes(x = Biopsy.Site, y = Mutation.Count)
) +
  geom_boxplot(fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Biopsy Site vs. Mutation Count") +
  xlab("Biopsy Site") +
  ylab("Mutation Count")
There seem to be outliers affecting the distribution. I removed them and then drew a scatter plot for a better representation of the data.
# Remove mutation-count outliers using 1.5 * IQR fences around the quartiles.
# Q1 and Q3 are the first and third quartiles of Mutation.Count.
Q1 <- quantile(a$Mutation.Count, 0.25)
Q3 <- quantile(a$Mutation.Count, 0.75)
# Stored under its own name so the numeric result does not mask stats::IQR().
mutation_iqr <- IQR(a$Mutation.Count)
lower_fence <- Q1 - 1.5 * mutation_iqr
upper_fence <- Q3 + 1.5 * mutation_iqr
# Keep only rows whose mutation count lies strictly inside the fences.
# The filter looks at Mutation.Count alone, but whole rows are dropped, so the
# survival values of those patients are removed from `b` as well.
b <- subset(a, Mutation.Count > lower_fence & Mutation.Count < upper_fence)
# Summarise the filtered data frame.
summary(b)
## Mutation.Count Overall.Survival..Months. Biopsy.Site
## Min. :10.00 Min. : 0.03285 Brain, NOS :13
## 1st Qu.:37.50 1st Qu.: 8.65637 Frontal lobe :20
## Median :48.00 Median :13.17346 Occipital lobe: 6
## Mean :46.17 Mean :18.51483 Parietal lobe : 8
## 3rd Qu.:57.00 3rd Qu.:27.69382 Temporal lobe :24
## Max. :75.00 Max. :64.78318
# Row count followed by column count after outlier removal.
c(nrow(b), ncol(b))
## [1] 71 3
# Scatter plot of mutation count against overall survival, with points
# coloured by biopsy site.
ggplot(
  data = b,
  mapping = aes(
    x = Mutation.Count,
    y = Overall.Survival..Months.,
    color = Biopsy.Site
  )
) +
  geom_point() +
  labs(
    title = "Mutations and Overall Survival",
    x = "Mutations",
    y = "Overall Survival Months"
  ) +
  theme_bw()
# Split the data into one frame per outcome; this one pairs mutation count
# with the biopsy-site grouping. The outer parentheses print the result.
(mc <- b[c("Mutation.Count", "Biopsy.Site")]) # mutation count
## Mutation.Count Biopsy.Site
## 1 59 Frontal lobe
## 2 62 Parietal lobe
## 3 38 Frontal lobe
## 5 32 Parietal lobe
## 6 58 Temporal lobe
## 7 67 Temporal lobe
## 8 47 Brain, NOS
## 9 39 Temporal lobe
## 10 68 Frontal lobe
## 11 17 Frontal lobe
## 12 47 Temporal lobe
## 13 56 Temporal lobe
## 14 40 Frontal lobe
## 15 53 Occipital lobe
## 17 50 Frontal lobe
## 18 53 Occipital lobe
## 19 22 Temporal lobe
## 20 66 Temporal lobe
## 21 37 Temporal lobe
## 22 46 Parietal lobe
## 23 66 Parietal lobe
## 24 47 Brain, NOS
## 25 29 Temporal lobe
## 26 65 Parietal lobe
## 27 54 Frontal lobe
## 28 62 Frontal lobe
## 29 75 Brain, NOS
## 30 58 Parietal lobe
## 31 40 Temporal lobe
## 32 55 Temporal lobe
## 33 28 Frontal lobe
## 34 48 Frontal lobe
## 35 65 Frontal lobe
## 36 49 Frontal lobe
## 37 47 Occipital lobe
## 38 17 Temporal lobe
## 39 49 Frontal lobe
## 40 37 Temporal lobe
## 41 38 Frontal lobe
## 42 43 Temporal lobe
## 43 63 Parietal lobe
## 44 74 Frontal lobe
## 45 39 Frontal lobe
## 46 61 Temporal lobe
## 47 45 Brain, NOS
## 48 12 Temporal lobe
## 49 58 Temporal lobe
## 50 32 Brain, NOS
## 51 53 Frontal lobe
## 52 55 Occipital lobe
## 53 48 Temporal lobe
## 54 31 Temporal lobe
## 55 48 Frontal lobe
## 56 55 Frontal lobe
## 57 42 Temporal lobe
## 58 48 Occipital lobe
## 59 53 Temporal lobe
## 60 36 Frontal lobe
## 61 50 Parietal lobe
## 62 11 Brain, NOS
## 63 47 Brain, NOS
## 64 22 Brain, NOS
## 65 53 Temporal lobe
## 66 60 Temporal lobe
## 67 66 Temporal lobe
## 68 10 Occipital lobe
## 69 23 Brain, NOS
## 71 29 Brain, NOS
## 72 56 Brain, NOS
## 73 40 Brain, NOS
## 74 29 Brain, NOS
# Shapiro-Wilk normality check for mutation count, one biopsy site at a time.
# which() discards NA comparisons, so the same rows are kept as with subset().
bn <- mc[which(mc$Biopsy.Site == "Brain, NOS"), , drop = FALSE]
fl <- mc[which(mc$Biopsy.Site == "Frontal lobe"), , drop = FALSE]
ol <- mc[which(mc$Biopsy.Site == "Occipital lobe"), , drop = FALSE]
pl <- mc[which(mc$Biopsy.Site == "Parietal lobe"), , drop = FALSE]
tl <- mc[which(mc$Biopsy.Site == "Temporal lobe"), , drop = FALSE]
# Column 1 of each subset is the mutation count.
shapiro.test(bn[, 1])
##
## Shapiro-Wilk normality test
##
## data: bn[, 1]
## W = 0.96299, p-value = 0.799
# Shapiro-Wilk normality test on mutation count for the "Frontal lobe" group.
shapiro.test(fl[,1])
##
## Shapiro-Wilk normality test
##
## data: fl[, 1]
## W = 0.9818, p-value = 0.9552
# Shapiro-Wilk normality test on mutation count for the "Occipital lobe" group.
shapiro.test(ol[,1])
##
## Shapiro-Wilk normality test
##
## data: ol[, 1]
## W = 0.65923, p-value = 0.002234
# Shapiro-Wilk normality test on mutation count for the "Parietal lobe" group.
shapiro.test(pl[,1])
##
## Shapiro-Wilk normality test
##
## data: pl[, 1]
## W = 0.86536, p-value = 0.1357
# Shapiro-Wilk normality test on mutation count for the "Temporal lobe" group.
shapiro.test(tl[,1])
##
## Shapiro-Wilk normality test
##
## data: tl[, 1]
## W = 0.94849, p-value = 0.2512
# If the data are not normal, use non-parametric tests.
# This frame pairs overall survival with the biopsy-site grouping; the outer
# parentheses print the result.
(os <- b[c("Overall.Survival..Months.", "Biopsy.Site")]) # overall survival
## Overall.Survival..Months. Biopsy.Site
## 1 4.23784494 Frontal lobe
## 2 10.57818660 Parietal lobe
## 3 15.70302234 Frontal lobe
## 5 2.59526938 Parietal lobe
## 6 2.16819974 Temporal lobe
## 7 26.47831800 Temporal lobe
## 8 21.87910644 Brain, NOS
## 9 32.32588699 Temporal lobe
## 10 21.22207622 Frontal lobe
## 11 16.32720105 Frontal lobe
## 12 55.94612352 Temporal lobe
## 13 33.83705650 Temporal lobe
## 14 19.41524310 Frontal lobe
## 15 28.67936925 Occipital lobe
## 17 27.20105125 Frontal lobe
## 18 34.59264126 Occipital lobe
## 19 12.41787122 Temporal lobe
## 20 12.51642576 Temporal lobe
## 21 1.41261498 Temporal lobe
## 22 9.23127464 Parietal lobe
## 23 13.23915900 Parietal lobe
## 24 2.85808147 Brain, NOS
## 25 0.22996058 Temporal lobe
## 26 23.68593955 Parietal lobe
## 27 4.17214192 Frontal lobe
## 28 13.17345598 Frontal lobe
## 29 45.79500657 Brain, NOS
## 30 9.00131406 Parietal lobe
## 31 36.36662286 Temporal lobe
## 32 3.81077530 Temporal lobe
## 33 0.03285151 Frontal lobe
## 34 52.06964520 Frontal lobe
## 35 39.71747700 Frontal lobe
## 36 21.45203679 Frontal lobe
## 37 0.22996058 Occipital lobe
## 38 64.78318003 Temporal lobe
## 39 0.19710907 Frontal lobe
## 40 11.46517740 Temporal lobe
## 41 8.04862024 Frontal lobe
## 42 19.90801577 Temporal lobe
## 43 6.30749014 Parietal lobe
## 44 0.62417871 Frontal lobe
## 45 36.30091984 Frontal lobe
## 46 26.44546649 Temporal lobe
## 47 3.97503285 Brain, NOS
## 48 27.62812089 Temporal lobe
## 49 8.86990802 Temporal lobe
## 50 11.92509855 Brain, NOS
## 51 0.09855453 Frontal lobe
## 52 12.54927727 Occipital lobe
## 53 12.74638633 Temporal lobe
## 54 11.16951380 Temporal lobe
## 55 12.68068331 Frontal lobe
## 56 8.67279895 Frontal lobe
## 57 13.50197109 Temporal lobe
## 58 8.70565046 Occipital lobe
## 59 8.63994744 Temporal lobe
## 60 36.62943495 Frontal lobe
## 61 43.98817346 Parietal lobe
## 62 46.51773982 Brain, NOS
## 63 17.93692510 Brain, NOS
## 64 34.39553219 Brain, NOS
## 65 27.75952694 Temporal lobe
## 66 27.75952694 Temporal lobe
## 67 10.90670171 Temporal lobe
## 68 4.66491459 Occipital lobe
## 69 16.78712221 Brain, NOS
## 71 12.81208936 Brain, NOS
## 72 21.22207622 Brain, NOS
## 73 33.57424442 Brain, NOS
## 74 9.75689882 Brain, NOS
# Shapiro-Wilk normality check for overall survival, one biopsy site at a time.
# which() discards NA comparisons, so the same rows are kept as with subset().
bn2 <- os[which(os$Biopsy.Site == "Brain, NOS"), , drop = FALSE]
fl2 <- os[which(os$Biopsy.Site == "Frontal lobe"), , drop = FALSE]
ol2 <- os[which(os$Biopsy.Site == "Occipital lobe"), , drop = FALSE]
pl2 <- os[which(os$Biopsy.Site == "Parietal lobe"), , drop = FALSE]
tl2 <- os[which(os$Biopsy.Site == "Temporal lobe"), , drop = FALSE]
# Column 1 of each subset is the overall survival time in months.
shapiro.test(bn2[, 1])
##
## Shapiro-Wilk normality test
##
## data: bn2[, 1]
## W = 0.91991, p-value = 0.2502
# Shapiro-Wilk normality test on overall survival for the "Frontal lobe" group.
shapiro.test(fl2[,1])
##
## Shapiro-Wilk normality test
##
## data: fl2[, 1]
## W = 0.91402, p-value = 0.07607
# Shapiro-Wilk normality test on overall survival for the "Occipital lobe" group.
shapiro.test(ol2[,1])
##
## Shapiro-Wilk normality test
##
## data: ol2[, 1]
## W = 0.90401, p-value = 0.3982
# Shapiro-Wilk normality test on overall survival for the "Parietal lobe" group.
shapiro.test(pl2[,1])
##
## Shapiro-Wilk normality test
##
## data: pl2[, 1]
## W = 0.79512, p-value = 0.02538
# Shapiro-Wilk normality test on overall survival for the "Temporal lobe" group.
shapiro.test(tl2[,1])
##
## Shapiro-Wilk normality test
##
## data: tl2[, 1]
## W = 0.8895, p-value = 0.01298
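The data do not follow a normal distribution in every group (the Shapiro-Wilk p-value is below 0.05 for at least one biopsy site for each outcome).
Perform the Kruskal-Wallis test to compare the median values.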
# Kruskal-Wallis rank sum test of overall survival (months) across biopsy
# sites, run on the outlier-filtered data frame `b`.
# NOTE(review): `b` has no survival-status column, so censoring is not
# accounted for in this comparison. Confirm that is intended.
kruskal.test(Overall.Survival..Months. ~ Biopsy.Site, data = b)
##
## Kruskal-Wallis rank sum test
##
## data: Overall.Survival..Months. by Biopsy.Site
## Kruskal-Wallis chi-squared = 2.2114, df = 4, p-value = 0.6969
# Kruskal-Wallis rank sum test of mutation count across biopsy sites, run on
# the outlier-filtered data frame `b`.
kruskal.test(Mutation.Count ~ Biopsy.Site, data = b)
##
## Kruskal-Wallis rank sum test
##
## data: Mutation.Count by Biopsy.Site
## Kruskal-Wallis chi-squared = 7.2197, df = 4, p-value = 0.1247
Both p-values are greater than alpha (0.05), so I failed to reject H0 for both overall survival and mutation count across the biopsy groups.
Below are the plotted results of my analysis.
#install.packages("ggpubr")
library(ggpubr)
# Build the comparison layer with its default settings. Called on its own
# (not added to a plot), this only prints the layer's configuration.
stat_compare_means()
## geom_text: na.rm = FALSE
## stat_compare_means: label.x.npc = left, label.y.npc = top, label.x = NULL, label.y = NULL, label.sep = , , method = NULL, method.args = list(), paired = FALSE, ref.group = NULL, symnum.args = list(), hide.ns = FALSE, na.rm = FALSE
## position_identity
#compare_means(Overall.Survival..Months. ~ Biopsy.Site, data = b, method = "anova")
# Default method = "kruskal.test" for multiple groups
# Overall survival by biopsy site, with jittered points and the test label.
ggboxplot(
  data = b,
  x = "Biopsy.Site",
  y = "Overall.Survival..Months.",
  add = "jitter",
  color = "Biopsy.Site",
  palette = "jco"
) +
  stat_compare_means()
# Same plot for mutation count vs. biopsy site.
ggboxplot(
  data = b,
  x = "Biopsy.Site",
  y = "Mutation.Count",
  add = "jitter",
  color = "Biopsy.Site",
  palette = "jco"
) +
  stat_compare_means()
My analysis did not show a statistically significant difference in overall survival or mutation count across the biopsy sites in this cohort. I chose the Kruskal-Wallis test because the data did not meet the normality assumption of the ANOVA test. In conclusion, I failed to reject null hypotheses 1 and 2.