#library
library(readxl)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Problem set 1 ##read data
library(readxl)
data <- read_table("/Volumes/SSD/1.University/0.Master_course/Lectures/26_spring/advanced_medical_data_science/framingham.txt",)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Cholesterol = col_double(),
## Age = col_double(),
## Sex = col_character(),
## SBP = col_double(),
## DBP = col_double(),
## CIG = col_double()
## )
data %>% view()
colnames(data)
## [1] "Cholesterol" "Age" "Sex" "SBP" "DBP"
## [6] "CIG"
spec(data)
## cols(
## Cholesterol = col_double(),
## Age = col_double(),
## Sex = col_character(),
## SBP = col_double(),
## DBP = col_double(),
## CIG = col_double()
## )
##data wrangling
data_2 <- data
data_2$DIFF <- data$SBP - data$DBP
data_2 %>% view()
##Shapiro-Wilk test
?shapiro.test
shapiro_SBP<- shapiro.test(data$SBP)
shapiro_DBP<- shapiro.test(data$DBP)
shapiro_SBP
##
## Shapiro-Wilk normality test
##
## data: data$SBP
## W = 0.91602, p-value < 2.2e-16
shapiro_DBP
##
## Shapiro-Wilk normality test
##
## data: data$DBP
## W = 0.9573, p-value < 2.2e-16
shapiro_DIFF <- shapiro.test(data_2$DIFF)
shapiro_DIFF
##
## Shapiro-Wilk normality test
##
## data: data_2$DIFF
## W = 0.90876, p-value < 2.2e-16
##Wilcoxon signed rank test
?wilcox.test
wilcoxon_SBP_DBP <- wilcox.test(data_2$SBP, data_2$DBP, paired = TRUE, alternative = "two.sided")
wilcoxon_DIFF <- wilcox.test(data_2$DIFF, alternative = "two.sided")
#Probelm set 2 ##load data
## Warning: package 'survival' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## pair time cens treat
## 1 1 1 1 control
## 2 1 10 1 6-MP
## 3 2 22 1 control
## 4 2 7 1 6-MP
## 5 3 3 1 control
## 6 3 32 0 6-MP
#survival formula for whole group
??survfit
?Surv
fit_entire <- survfit(data = gehan, Surv(time, cens) ~ 1)
fit_entire %>% summary()
## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 42 2 0.952 0.0329 0.8901 1.000
## 2 40 2 0.905 0.0453 0.8202 0.998
## 3 38 1 0.881 0.0500 0.7883 0.985
## 4 37 2 0.833 0.0575 0.7279 0.954
## 5 35 2 0.786 0.0633 0.6709 0.920
## 6 33 3 0.714 0.0697 0.5899 0.865
## 7 29 1 0.690 0.0715 0.5628 0.845
## 8 28 4 0.591 0.0764 0.4588 0.762
## 10 23 1 0.565 0.0773 0.4325 0.739
## 11 21 2 0.512 0.0788 0.3783 0.692
## 12 18 2 0.455 0.0796 0.3227 0.641
## 13 16 1 0.426 0.0795 0.2958 0.615
## 15 15 1 0.398 0.0791 0.2694 0.588
## 16 14 1 0.369 0.0784 0.2437 0.560
## 17 13 1 0.341 0.0774 0.2186 0.532
## 22 9 2 0.265 0.0765 0.1507 0.467
## 23 7 2 0.189 0.0710 0.0909 0.395
print(fit_entire, print.rmean = TRUE)
## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
##
## n events rmean* se(rmean) median 0.95LCL 0.95UCL
## [1,] 42 30 15.3 1.86 12 8 22
## * restricted mean with upper limit = 35
fit_group <- survfit(data = gehan, Surv(time, cens) ~ treat)
fit_group %>% summary()
## Call: survfit(formula = Surv(time, cens) ~ treat, data = gehan)
##
## treat=6-MP
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 6 21 3 0.857 0.0764 0.720 1.000
## 7 17 1 0.807 0.0869 0.653 0.996
## 10 15 1 0.753 0.0963 0.586 0.968
## 13 12 1 0.690 0.1068 0.510 0.935
## 16 11 1 0.627 0.1141 0.439 0.896
## 22 7 1 0.538 0.1282 0.337 0.858
## 23 6 1 0.448 0.1346 0.249 0.807
##
## treat=control
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 21 2 0.9048 0.0641 0.78754 1.000
## 2 19 2 0.8095 0.0857 0.65785 0.996
## 3 17 1 0.7619 0.0929 0.59988 0.968
## 4 16 2 0.6667 0.1029 0.49268 0.902
## 5 14 2 0.5714 0.1080 0.39455 0.828
## 8 12 4 0.3810 0.1060 0.22085 0.657
## 11 8 2 0.2857 0.0986 0.14529 0.562
## 12 6 2 0.1905 0.0857 0.07887 0.460
## 15 4 1 0.1429 0.0764 0.05011 0.407
## 17 3 1 0.0952 0.0641 0.02549 0.356
## 22 2 1 0.0476 0.0465 0.00703 0.322
## 23 1 1 0.0000 NaN NA NA
print(fit_entire, print.rmean = TRUE)
## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
##
## n events rmean* se(rmean) median 0.95LCL 0.95UCL
## [1,] 42 30 15.3 1.86 12 8 22
## * restricted mean with upper limit = 35
#plot KM survival curve
plot(fit_entire, lty = 1, lwd = 2, conf.int = 'none', mark.time = T, col = '#a6020f',
xlab = "Remission Time (weeks)", ylab = "Survival Probability",
main = "Kaplan-Meier Curves for All Patients"
)
plot(fit_group, lty = 1, lwd = 2, conf.int = 'none', mark.time = T, col = c('#a6020f','#1f78b4'),
xlab = "Remission Time (weeks)", ylab = "Survival Probability",
main = "Kaplan-Meier Curves for Each Group"
)
legend("topright",
legend = levels(gehan$treat),
col = c("#a6020f", "#1f78b4"),
lwd = 3,
lty = 1,
bty = "n")