advanced_medical_data_science_midterm

#library

library(readxl)
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Problem set 1 ##read data

library(readxl)
data <- read_table("/Volumes/SSD/1.University/0.Master_course/Lectures/26_spring/advanced_medical_data_science/framingham.txt",)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cholesterol = col_double(),
##   Age = col_double(),
##   Sex = col_character(),
##   SBP = col_double(),
##   DBP = col_double(),
##   CIG = col_double()
## )

data %>% view()
colnames(data)

## [1] "Cholesterol" "Age"         "Sex"         "SBP"         "DBP"        
## [6] "CIG"

spec(data)

## cols(
##   Cholesterol = col_double(),
##   Age = col_double(),
##   Sex = col_character(),
##   SBP = col_double(),
##   DBP = col_double(),
##   CIG = col_double()
## )

##data wrangling

data_2 <- data
data_2$DIFF <- data$SBP - data$DBP
data_2 %>% view()

##Shapiro-Wilk test

?shapiro.test
shapiro_SBP<- shapiro.test(data$SBP)
shapiro_DBP<- shapiro.test(data$DBP)

shapiro_SBP

## 
##  Shapiro-Wilk normality test
## 
## data:  data$SBP
## W = 0.91602, p-value < 2.2e-16

shapiro_DBP

## 
##  Shapiro-Wilk normality test
## 
## data:  data$DBP
## W = 0.9573, p-value < 2.2e-16

shapiro_DIFF <- shapiro.test(data_2$DIFF)
shapiro_DIFF

## 
##  Shapiro-Wilk normality test
## 
## data:  data_2$DIFF
## W = 0.90876, p-value < 2.2e-16

##Wilcoxon signed rank test

?wilcox.test
wilcoxon_SBP_DBP <- wilcox.test(data_2$SBP, data_2$DBP, paired = TRUE, alternative = "two.sided")
wilcoxon_DIFF <- wilcox.test(data_2$DIFF, alternative = "two.sided")

#Probelm set 2 ##load data

## Warning: package 'survival' was built under R version 4.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

##   pair time cens   treat
## 1    1    1    1 control
## 2    1   10    1    6-MP
## 3    2   22    1 control
## 4    2    7    1    6-MP
## 5    3    3    1 control
## 6    3   32    0    6-MP

#survival formula for whole group

??survfit
?Surv

fit_entire <- survfit(data = gehan, Surv(time, cens) ~ 1)
fit_entire %>% summary()

## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     42       2    0.952  0.0329       0.8901        1.000
##     2     40       2    0.905  0.0453       0.8202        0.998
##     3     38       1    0.881  0.0500       0.7883        0.985
##     4     37       2    0.833  0.0575       0.7279        0.954
##     5     35       2    0.786  0.0633       0.6709        0.920
##     6     33       3    0.714  0.0697       0.5899        0.865
##     7     29       1    0.690  0.0715       0.5628        0.845
##     8     28       4    0.591  0.0764       0.4588        0.762
##    10     23       1    0.565  0.0773       0.4325        0.739
##    11     21       2    0.512  0.0788       0.3783        0.692
##    12     18       2    0.455  0.0796       0.3227        0.641
##    13     16       1    0.426  0.0795       0.2958        0.615
##    15     15       1    0.398  0.0791       0.2694        0.588
##    16     14       1    0.369  0.0784       0.2437        0.560
##    17     13       1    0.341  0.0774       0.2186        0.532
##    22      9       2    0.265  0.0765       0.1507        0.467
##    23      7       2    0.189  0.0710       0.0909        0.395

print(fit_entire, print.rmean = TRUE)

## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
## 
##       n events rmean* se(rmean) median 0.95LCL 0.95UCL
## [1,] 42     30   15.3      1.86     12       8      22
##     * restricted mean with upper limit =  35

fit_group <- survfit(data = gehan, Surv(time, cens) ~ treat)
fit_group %>% summary()

## Call: survfit(formula = Surv(time, cens) ~ treat, data = gehan)
## 
##                 treat=6-MP 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     6     21       3    0.857  0.0764        0.720        1.000
##     7     17       1    0.807  0.0869        0.653        0.996
##    10     15       1    0.753  0.0963        0.586        0.968
##    13     12       1    0.690  0.1068        0.510        0.935
##    16     11       1    0.627  0.1141        0.439        0.896
##    22      7       1    0.538  0.1282        0.337        0.858
##    23      6       1    0.448  0.1346        0.249        0.807
## 
##                 treat=control 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     21       2   0.9048  0.0641      0.78754        1.000
##     2     19       2   0.8095  0.0857      0.65785        0.996
##     3     17       1   0.7619  0.0929      0.59988        0.968
##     4     16       2   0.6667  0.1029      0.49268        0.902
##     5     14       2   0.5714  0.1080      0.39455        0.828
##     8     12       4   0.3810  0.1060      0.22085        0.657
##    11      8       2   0.2857  0.0986      0.14529        0.562
##    12      6       2   0.1905  0.0857      0.07887        0.460
##    15      4       1   0.1429  0.0764      0.05011        0.407
##    17      3       1   0.0952  0.0641      0.02549        0.356
##    22      2       1   0.0476  0.0465      0.00703        0.322
##    23      1       1   0.0000     NaN           NA           NA

print(fit_entire, print.rmean = TRUE)

## Call: survfit(formula = Surv(time, cens) ~ 1, data = gehan)
## 
##       n events rmean* se(rmean) median 0.95LCL 0.95UCL
## [1,] 42     30   15.3      1.86     12       8      22
##     * restricted mean with upper limit =  35

#plot KM survival curve

plot(fit_entire, lty = 1, lwd = 2, conf.int = 'none', mark.time = T, col = '#a6020f',
     xlab = "Remission Time (weeks)", ylab = "Survival Probability",
     main = "Kaplan-Meier Curves for All Patients"
     )

plot(fit_group, lty = 1, lwd = 2, conf.int = 'none', mark.time = T, col = c('#a6020f','#1f78b4'),
     xlab = "Remission Time (weeks)", ylab = "Survival Probability",
     main = "Kaplan-Meier Curves for Each Group"
     )
legend("topright",
       legend = levels(gehan$treat),
       col = c("#a6020f", "#1f78b4"),
       lwd = 3,
       lty = 1,
       bty = "n")

advanced_medical_data_science_midterm_hw

Junha Park

2026-04-21