## [1] "id" "version" "V99RNTCNT" "V99ERKDATE" "V99ERKFLDT"
## [6] "V99ERKRPCF" "V99ERKTLPR" "V99ERKTPPR" "V99ERKBLRP" "V99ERKVSRP"
## [11] "V99ERKPODX" "V99ERKDAYS" "V99ERKVSPR" "V99ERKVSAF" "V99ERKRPSN"
## [16] "V99ERKXRPR" "V99ERKXRAF" "V99ELKDATE" "V99ELKFLDT" "V99ELKRPCF"
## [21] "V99ELKTLPR" "V99ELKTPPR" "V99ELKBLRP" "V99ELKVSRP" "V99ELKPODX"
## [26] "V99ELKDAYS" "V99ELKVSPR" "V99ELKVSAF" "V99ELKRPSN" "V99ELKXRPR"
## [31] "V99ELKXRAF" "V99ERHDATE" "V99ERHFLDT" "V99ERHRPCF" "V99ERHPODX"
## [36] "V99ERHDAYS" "V99ERHVSPR" "V99ERHVSAF" "V99ERHRPSN" "V99ERHXRPR"
## [41] "V99ERHXRAF" "V99ERHVSRP" "V99ERHBLRP" "V99ELHDATE" "V99ELHFLDT"
## [46] "V99ELHRPCF" "V99ELHPODX" "V99ELHDAYS" "V99ELHVSPR" "V99ELHVSAF"
## [51] "V99ELHRPSN" "V99ELHXRPR" "V99ELHXRAF" "V99ELHVSRP" "V99ELHBLRP"
## [56] "V99EXLVSQD" "V99ERXIOA" "V99ERXIOAN" "V99ERXNOA" "V99ERXNOAN"
## [61] "V99ERKLOA" "V99ERKLOAN" "V99ELXIOA" "V99ELXIOAN" "V99ELXNOA"
## [66] "V99ELXNOAN" "V99ELKLOA" "V99ELKLOAN" "V99ERXJSNM" "V99ERXJSNL"
## [71] "V99ELXJSNM" "V99ELXJSNL" "V99ERJSFP" "V99ERJSLP" "V99ERNJSLP"
## [76] "V99ERJSTFP" "V99ERJSFW" "V99ERJSLW" "V99ERNJSLW" "V99ERJSTFW"
## [81] "V99ELJSFP" "V99ELJSLP" "V99ELNJSLP" "V99ELJSTFP" "V99ELJSFW"
## [86] "V99ELJSLW" "V99ELNJSLW" "V99ELJSTFW" "V99EDDCF" "V99EDDDATE"
## [91] "V99EDDFLDT" "V99EDDVSPR"
## [1] 4796 92
We will be considering the following variables:
V99RNTCNT: Most recent OAI contactV99ERKVSPR: Closest OAI contact prior to knee replacement (right knee)V99ELKVSPR: Closest OAI contact prior to knee replacement (left knee)V99ERKDATE: Date of right knee replacementV99ELKDATE: Date of left knee replacementSelect variables of interest from the outcomes.csv file for knee replacement and exclude rows with NaN’s for V99RNTCNT
summary(outcomes$V99RNTCNT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 10.000 11.000 9.817 11.000 11.000 146
For some patients (146) there is no information for the most recent OAI contact: we will delete these rows
nrow(subset(outcomes, (is.na(outcomes$V99RNTCNT)) & (!is.na(outcomes$V99ELKVSPR))))
## [1] 1
subset(outcomes, (is.na(outcomes$V99RNTCNT)) & (!is.na(outcomes$V99ELKVSPR)), c(id, V99ELKVSPR))
## id V99ELKVSPR
## 3962 9825520 0
One patient that replaced the left knee will be removed (V99ELKVSPR = 0, id = 9825520)
nrow(subset(outcomes, (is.na(outcomes$V99RNTCNT)) & (!is.na(outcomes$V99ERKVSPR))))
## [1] 0
No patients that replaced the right knee will be removed
temp <- subset(outcomes, !is.na(outcomes$V99RNTCNT),
select = c(id, V99RNTCNT, V99ERKVSPR, V99ELKVSPR, V99ERKDATE, V99ELKDATE))
dim(temp)
## [1] 4650 6
head(temp, 20)
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE
## 1 9000099 11 NA NA
## 2 9000296 11 NA NA
## 3 9000622 1 NA NA
## 4 9000798 11 NA NA
## 5 9001104 3 NA NA
## 6 9001400 11 NA NA
## 7 9001695 9 NA NA
## 8 9001897 11 NA NA
## 9 9002116 11 NA NA
## 10 9002316 11 NA NA
## 11 9002411 6 NA NA
## 12 9002430 11 6 NA 2010-02-15
## 14 9002817 11 NA NA
## 15 9003126 11 NA NA
## 16 9003175 11 NA NA
## 17 9003316 11 NA NA
## 18 9003380 11 NA NA
## 19 9003406 11 NA NA
## 20 9003430 9 NA NA
## 21 9003658 11 NA NA
This column is the time of last visit before the event for those patients who replaced the knee and the time of last known visit for those patients who didn’t
temp <- transform(temp, time = ifelse(!is.na(temp$V99ERKVSPR), temp$V99ERKVSPR,
ifelse(!is.na(temp$V99ELKVSPR), temp$V99ELKVSPR,
temp$V99RNTCNT)))
head(temp, 30)
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time
## 1 9000099 11 NA NA 11
## 2 9000296 11 NA NA 11
## 3 9000622 1 NA NA 1
## 4 9000798 11 NA NA 11
## 5 9001104 3 NA NA 3
## 6 9001400 11 NA NA 11
## 7 9001695 9 NA NA 9
## 8 9001897 11 NA NA 11
## 9 9002116 11 NA NA 11
## 10 9002316 11 NA NA 11
## 11 9002411 6 NA NA 6
## 12 9002430 11 6 NA 2010-02-15 6
## 14 9002817 11 NA NA 11
## 15 9003126 11 NA NA 11
## 16 9003175 11 NA NA 11
## 17 9003316 11 NA NA 11
## 18 9003380 11 NA NA 11
## 19 9003406 11 NA NA 11
## 20 9003430 9 NA NA 9
## 21 9003658 11 NA NA 11
## 22 9003815 11 NA NA 11
## 23 9003895 11 NA NA 11
## 24 9004175 11 NA NA 11
## 25 9004184 11 NA NA 11
## 26 9004315 11 NA NA 11
## 27 9004462 1 NA NA 1
## 28 9004669 11 NA NA 11
## 29 9004905 11 NA NA 11
## 30 9005075 5 NA NA 5
## 31 9005132 11 NA 3 2007-09-25 3
temp$time[temp$time == 1] <- 12
temp$time[temp$time == 2] <- 18
temp$time[temp$time == 3] <- 24
temp$time[temp$time == 4] <- 30
temp$time[temp$time == 5] <- 36
temp$time[temp$time == 6] <- 48
temp$time[temp$time == 7] <- 60
temp$time[temp$time == 8] <- 72
temp$time[temp$time == 9] <- 84
temp$time[temp$time == 10] <- 96
temp$time[temp$time == 11] <- 108
head(temp, 30)
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time
## 1 9000099 11 NA NA 108
## 2 9000296 11 NA NA 108
## 3 9000622 1 NA NA 12
## 4 9000798 11 NA NA 108
## 5 9001104 3 NA NA 24
## 6 9001400 11 NA NA 108
## 7 9001695 9 NA NA 84
## 8 9001897 11 NA NA 108
## 9 9002116 11 NA NA 108
## 10 9002316 11 NA NA 108
## 11 9002411 6 NA NA 48
## 12 9002430 11 6 NA 2010-02-15 48
## 14 9002817 11 NA NA 108
## 15 9003126 11 NA NA 108
## 16 9003175 11 NA NA 108
## 17 9003316 11 NA NA 108
## 18 9003380 11 NA NA 108
## 19 9003406 11 NA NA 108
## 20 9003430 9 NA NA 84
## 21 9003658 11 NA NA 108
## 22 9003815 11 NA NA 108
## 23 9003895 11 NA NA 108
## 24 9004175 11 NA NA 108
## 25 9004184 11 NA NA 108
## 26 9004315 11 NA NA 108
## 27 9004462 1 NA NA 12
## 28 9004669 11 NA NA 108
## 29 9004905 11 NA NA 108
## 30 9005075 5 NA NA 36
## 31 9005132 11 NA 3 2007-09-25 24
Replace empty values for V99ERKDATE and V99ELKDATE with NaN’s
temp$V99ERKDATE[temp$V99ERKDATE == ""] <- NA
temp$V99ELKDATE[temp$V99ELKDATE == ""] <- NA
temp <- transform(temp, cens = ifelse(!is.na(temp$V99ERKDATE), 1,
ifelse(!is.na(temp$V99ELKDATE), 1, 0)))
head(temp, 30)
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time cens
## 1 9000099 11 NA NA <NA> <NA> 108 0
## 2 9000296 11 NA NA <NA> <NA> 108 0
## 3 9000622 1 NA NA <NA> <NA> 12 0
## 4 9000798 11 NA NA <NA> <NA> 108 0
## 5 9001104 3 NA NA <NA> <NA> 24 0
## 6 9001400 11 NA NA <NA> <NA> 108 0
## 7 9001695 9 NA NA <NA> <NA> 84 0
## 8 9001897 11 NA NA <NA> <NA> 108 0
## 9 9002116 11 NA NA <NA> <NA> 108 0
## 10 9002316 11 NA NA <NA> <NA> 108 0
## 11 9002411 6 NA NA <NA> <NA> 48 0
## 12 9002430 11 6 NA 2010-02-15 <NA> 48 1
## 14 9002817 11 NA NA <NA> <NA> 108 0
## 15 9003126 11 NA NA <NA> <NA> 108 0
## 16 9003175 11 NA NA <NA> <NA> 108 0
## 17 9003316 11 NA NA <NA> <NA> 108 0
## 18 9003380 11 NA NA <NA> <NA> 108 0
## 19 9003406 11 NA NA <NA> <NA> 108 0
## 20 9003430 9 NA NA <NA> <NA> 84 0
## 21 9003658 11 NA NA <NA> <NA> 108 0
## 22 9003815 11 NA NA <NA> <NA> 108 0
## 23 9003895 11 NA NA <NA> <NA> 108 0
## 24 9004175 11 NA NA <NA> <NA> 108 0
## 25 9004184 11 NA NA <NA> <NA> 108 0
## 26 9004315 11 NA NA <NA> <NA> 108 0
## 27 9004462 1 NA NA <NA> <NA> 12 0
## 28 9004669 11 NA NA <NA> <NA> 108 0
## 29 9004905 11 NA NA <NA> <NA> 108 0
## 30 9005075 5 NA NA <NA> <NA> 36 0
## 31 9005132 11 NA 3 <NA> 2007-09-25 24 1
dim(temp)
## [1] 4650 8
base <- subset(temp, select = c(id, time, cens))
head(base, 30)
## id time cens
## 1 9000099 108 0
## 2 9000296 108 0
## 3 9000622 12 0
## 4 9000798 108 0
## 5 9001104 24 0
## 6 9001400 108 0
## 7 9001695 84 0
## 8 9001897 108 0
## 9 9002116 108 0
## 10 9002316 108 0
## 11 9002411 48 0
## 12 9002430 48 1
## 14 9002817 108 0
## 15 9003126 108 0
## 16 9003175 108 0
## 17 9003316 108 0
## 18 9003380 108 0
## 19 9003406 108 0
## 20 9003430 84 0
## 21 9003658 108 0
## 22 9003815 108 0
## 23 9003895 108 0
## 24 9004175 108 0
## 25 9004184 108 0
## 26 9004315 108 0
## 27 9004462 12 0
## 28 9004669 108 0
## 29 9004905 108 0
## 30 9005075 36 0
## 31 9005132 24 1
dim(base)
## [1] 4650 3
P02SEX: sexP02RACE: raceV00AGE: ageP01BMI: body mass indexV00WLKAID: 20-meter walk using walking aid such as caneV00STROKE: patient had or did not have a stroke (or similar medical problem)The covariates P02SEX and P02RACE are stored in the enrollees.csv file
## [1] 4796 60
race_sex <- subset(enrollees, select = c(ID, P02RACE, P02SEX))
colnames(race_sex) <- c("id", "race", "sex")
head(race_sex)
## id race sex
## 1 9000099 1 1
## 2 9000296 1 1
## 3 9000622 1 2
## 4 9000798 1 1
## 5 9001104 1 2
## 6 9001400 1 2
OAI_sa <- merge(x = base, y = race_sex, by = "id")
head(OAI_sa)
## id time cens race sex
## 1 9000099 108 0 1 1
## 2 9000296 108 0 1 1
## 3 9000622 12 0 1 2
## 4 9000798 108 0 1 1
## 5 9001104 24 0 1 2
## 6 9001400 108 0 1 2
dim(OAI_sa)
## [1] 4650 5
The covariate V00AGE is stored in the subjectchar00.csv file
## [1] 4796 106
age <- subset(subjectchar, select = c(ID, V00AGE))
colnames(age) <- c("id", "age")
head(age)
## id age
## 1 9000099 59
## 2 9000296 69
## 3 9000622 71
## 4 9000798 56
## 5 9001104 72
## 6 9001400 75
OAI_sa <- merge(x = OAI_sa, y = age, by = "id")
head(OAI_sa)
## id time cens race sex age
## 1 9000099 108 0 1 1 59
## 2 9000296 108 0 1 1 69
## 3 9000622 12 0 1 2 71
## 4 9000798 108 0 1 1 56
## 5 9001104 24 0 1 2 72
## 6 9001400 108 0 1 2 75
dim(OAI_sa)
## [1] 4650 6
The covariates P01BMI and V00WLKAID are stored in the physexam00.csv file
## [1] 4796 227
bmi_wlkaid <- subset(physexam, select = c(ID, P01BMI, V00WLKAID))
colnames(bmi_wlkaid) <- c("id", "bmi", "wlkaid")
head(bmi_wlkaid)
## id bmi wlkaid
## 1 9000099 23.8 0
## 2 9000296 29.8 0
## 3 9000622 22.7 0
## 4 9000798 32.4 0
## 5 9001104 30.7 0
## 6 9001400 23.5 0
OAI_sa <- merge(x = OAI_sa, y = bmi_wlkaid, by = "id")
head(OAI_sa)
## id time cens race sex age bmi wlkaid
## 1 9000099 108 0 1 1 59 23.8 0
## 2 9000296 108 0 1 1 69 29.8 0
## 3 9000622 12 0 1 2 71 22.7 0
## 4 9000798 108 0 1 1 56 32.4 0
## 5 9001104 24 0 1 2 72 30.7 0
## 6 9001400 108 0 1 2 75 23.5 0
dim(OAI_sa)
## [1] 4650 8
The covariate V00STROKE is stored in the medhist00.csv file
## [1] 4796 312
stroke <- subset(medhist, select = c(ID, V00STROKE))
colnames(stroke) <- c("id", "stroke")
head(stroke)
## id stroke
## 1 9000099 0
## 2 9000296 0
## 3 9000622 1
## 4 9000798 0
## 5 9001104 0
## 6 9001400 0
OAI_sa <- merge(x = OAI_sa, y = stroke, by = "id")
write.csv(OAI_sa, "./data/data_COX_PH_t0.csv")
head(OAI_sa, 20)
## id time cens race sex age bmi wlkaid stroke
## 1 9000099 108 0 1 1 59 23.8 0 0
## 2 9000296 108 0 1 1 69 29.8 0 0
## 3 9000622 12 0 1 2 71 22.7 0 1
## 4 9000798 108 0 1 1 56 32.4 0 0
## 5 9001104 24 0 1 2 72 30.7 0 0
## 6 9001400 108 0 1 2 75 23.5 0 0
## 7 9001695 84 0 1 2 52 28.6 0 0
## 8 9001897 108 0 1 1 72 25.9 0 0
## 9 9002116 108 0 2 1 61 36.5 0 0
## 10 9002316 108 0 1 1 76 25.1 0 0
## 11 9002411 48 0 1 1 79 31.8 0 0
## 12 9002430 48 1 1 1 68 27.4 0 0
## 13 9002817 108 0 1 2 77 22.9 0 0
## 14 9003126 108 0 1 2 63 35.5 0 0
## 15 9003175 108 0 2 2 56 22.5 0 0
## 16 9003316 108 0 2 1 64 36.0 0 0
## 17 9003380 108 0 1 1 64 33.1 0 0
## 18 9003406 108 0 2 1 45 41.3 0 0
## 19 9003430 84 0 1 1 72 30.0 0 0
## 20 9003658 108 0 1 1 50 30.7 0 0
dim(OAI_sa)
## [1] 4650 9
sobj <- Surv(OAI_sa$time, OAI_sa$cens)
glimpse(sobj)
## 'Surv' num [1:4650, 1:2] 108+ 108+ 12+ 108+ 24+ 108+ 84+ 108+ 108+ 108+ ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "time" "status"
## - attr(*, "type")= chr "right"
summary(sobj)
## time status
## Min. : 0.00 Min. :0.00000
## 1st Qu.: 84.00 1st Qu.:0.00000
## Median :108.00 Median :0.00000
## Mean : 90.31 Mean :0.09419
## 3rd Qu.:108.00 3rd Qu.:0.00000
## Max. :108.00 Max. :1.00000
sobj[1:20]
## [1] 108+ 108+ 12+ 108+ 24+ 108+ 84+ 108+ 108+ 108+ 48+ 48 108+ 108+
## [15] 108+ 108+ 108+ 108+ 84+ 108+
Survival times for censored patients (cens = 0) are followed by a “+” sign in sobj.
table(sobj)
## sobj
## 0 12+ 12 18+ 18 24+ 24 30+ 30 36+ 36 48+ 48 60+ 60
## 26 120 33 3 4 93 44 7 8 109 46 310 58 81 52
## 72+ 72 84+ 84 96+ 96 108+
## 52 58 111 53 273 56 3053
table(OAI_sa$cens)
##
## 0 1
## 4212 438
438 patients had a surgery for knee replacement.
km <- survfit(Surv(time, cens) ~ 1, data = OAI_sa)
ggsurvplot(km, ylim = c(0.85, 1), risk.table = TRUE)
surv_summary(km)
## time n.risk n.event n.censor surv std.err upper lower
## 1 0 4650 26 0 0.9944086 0.001099642 0.9965541 0.9922677
## 2 12 4624 33 120 0.9873118 0.001662440 0.9905341 0.9841001
## 3 18 4471 4 3 0.9864285 0.001721623 0.9897627 0.9831056
## 4 24 4464 44 93 0.9767057 0.002279033 0.9810782 0.9723526
## 5 30 4327 8 7 0.9748999 0.002371090 0.9794410 0.9703798
## 6 36 4312 46 109 0.9644997 0.002850044 0.9699025 0.9591271
## 7 48 4157 58 310 0.9510427 0.003395084 0.9573923 0.9447352
## 8 60 3789 52 81 0.9379906 0.003898595 0.9451854 0.9308507
## 9 72 3656 58 52 0.9231100 0.004428121 0.9311565 0.9151331
## 10 84 3546 53 111 0.9093128 0.004887455 0.9180652 0.9006439
## 11 96 3382 56 273 0.8942562 0.005372676 0.9037227 0.8848889
## 12 108 3053 0 3053 0.8942562 0.005372676 0.9037227 0.8848889
age, race and bmiIn order to do this, we need to transform these variables into categorical variables.
race into a categorical variable race_catOAI_sa$race_cat <- factor(OAI_sa$race,
levels = c("0", "1", "2", "3"),
labels = c("Other Non-white", "White or Caucasian",
"Black or African American", "Asian"))
table(OAI_sa$race_cat)
##
## Other Non-white White or Caucasian
## 75 3708
## Black or African American Asian
## 820 43
levels(OAI_sa$race_cat)
## [1] "Other Non-white" "White or Caucasian"
## [3] "Black or African American" "Asian"
The most common value is “White or Caucasian” and the first level is “Other Non-white”. We will change this so that “White or Caucasian” is taken as reference when computing the hazard ratios.
OAI_sa$race_cat <- relevel(OAI_sa$race_cat, "White or Caucasian")
levels(OAI_sa$race_cat)
## [1] "White or Caucasian" "Other Non-white"
## [3] "Black or African American" "Asian"
age into a categorical variable age_groupsummary(OAI_sa$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45.00 53.00 61.00 61.13 69.00 79.00
We will create three groups: - 45-59 - 60-74 - >= 75
OAI_sa <- OAI_sa %>% mutate(age_group = ifelse(OAI_sa$age < 60, "45-59",
ifelse(OAI_sa$age < 75, "60-74", "75+")))
OAI_sa$age_group <- factor(OAI_sa$age_group)
table(OAI_sa$age_group)
##
## 45-59 60-74 75+
## 2152 2081 417
levels(OAI_sa$age_group)
## [1] "45-59" "60-74" "75+"
bmi into a categorical variable bmi_catOAI_sa <- OAI_sa %>% mutate(bmi_cat = ifelse(bmi <= 18.5, "Underweight",
ifelse(bmi <= 24.9, "Healthy",
ifelse(bmi <= 29.9, "Overweight",
ifelse(bmi <= 39.9, "Obese",
"Morbidly obese")))))
OAI_sa$bmi_cat <- factor(OAI_sa$bmi_cat)
table(OAI_sa$bmi_cat)
##
## Healthy Morbidly obese Obese Overweight Underweight
## 1104 75 1625 1827 15
levels(OAI_sa$bmi_cat)
## [1] "Healthy" "Morbidly obese" "Obese" "Overweight"
## [5] "Underweight"
race_catkm <- survfit(Surv(time, cens) ~ race_cat, data = OAI_sa)
ggsurvplot(km,
ylim = c(0.85, 1),
font.x = c(12, "bold"),
font.y = c(12, "bold"),
conf.int = FALSE,
pval = TRUE,
pval.coord = c(5, 0.87),
pval.method = TRUE,
pval.method.coord = c(5, 0.89),
test.for.trend = FALSE,
risk.table = "nrisk_cumevents",
cumevents = FALSE,
cumcensor = FALSE,
tables.height = 0.3,
fontsize = 2.5,
linetype = "strata",
legend = "top",
legend.title = "Race",
legend.labs = c("White/Cauc.", "O. Non-white",
"Black/African A.", "Asian")
)
To evaluate whether or not KM curves for two or more groups are statistically equivalent the most popular testing method is the log-rank test.
Overall, the curves are significantly different. Since we are comparing more than two curves, we must use a pairwise comparison to draw any conclusions.
res <- pairwise_survdiff(Surv(time, cens) ~ race_cat, data = OAI_sa)
res
##
## Pairwise comparisons using Log-Rank test
##
## data: OAI_sa and race_cat
##
## White or Caucasian Other Non-white
## Other Non-white 0.0751 -
## Black or African American 0.0223 0.0082
## Asian 0.5921 0.5962
## Black or African American
## Other Non-white -
## Black or African American -
## Asian 0.1944
##
## P value adjustment method: BH
surv_summary(km, data = OAI_sa)
## time n.risk n.event n.censor surv std.err upper lower
## 1 0 3708 19 0 0.9948759 0.001178562 0.9971767 0.9925805
## 2 12 3689 28 77 0.9873247 0.001860712 0.9909320 0.9837306
## 3 18 3584 3 2 0.9864983 0.001922498 0.9902224 0.9827881
## 4 24 3579 37 66 0.9762998 0.002571908 0.9812336 0.9713908
## 5 30 3476 5 4 0.9748954 0.002651250 0.9799745 0.9698427
## 6 36 3467 39 62 0.9639289 0.003211014 0.9700145 0.9578815
## 7 48 3366 51 243 0.9493239 0.003857616 0.9565288 0.9421733
## 8 60 3072 44 61 0.9357268 0.004428471 0.9438840 0.9276402
## 9 72 2967 47 37 0.9209041 0.005003632 0.9299797 0.9119169
## 10 84 2883 43 90 0.9071688 0.005503462 0.9170070 0.8974361
## 11 96 2750 52 213 0.8900150 0.006107099 0.9007322 0.8794253
## 12 108 2485 0 2485 0.8900150 0.006107099 0.9007322 0.8794253
## 13 0 75 2 0 0.9733333 0.019112739 1.0000000 0.9375465
## 14 12 73 0 3 0.9733333 0.019112739 1.0000000 0.9375465
## 15 24 70 0 1 0.9733333 0.019112739 1.0000000 0.9375465
## 16 30 69 1 0 0.9592271 0.024050479 1.0000000 0.9150601
## 17 36 68 2 3 0.9310145 0.032000911 0.9912785 0.8744142
## 18 48 63 1 8 0.9162365 0.035778132 0.9827928 0.8541874
## 19 60 54 3 1 0.8653345 0.048676476 0.9519573 0.7865938
## 20 72 50 1 2 0.8480278 0.052702586 0.9403090 0.7648029
## 21 84 47 2 3 0.8119415 0.061017941 0.9150889 0.7204207
## 22 96 42 0 4 0.8119415 0.061017941 0.9150889 0.7204207
## 23 108 38 0 38 0.8119415 0.061017941 0.9150889 0.7204207
## 24 0 820 5 0 0.9939024 0.002735264 0.9992451 0.9885884
## 25 12 815 4 37 0.9890244 0.003678781 0.9961813 0.9819189
## 26 18 774 1 1 0.9877466 0.003899336 0.9953244 0.9802264
## 27 24 772 5 26 0.9813493 0.004863024 0.9907476 0.9720401
## 28 30 741 2 3 0.9787005 0.005225065 0.9887749 0.9687289
## 29 36 736 5 41 0.9720518 0.006049356 0.9836455 0.9605947
## 30 48 690 5 53 0.9650079 0.006868287 0.9780863 0.9521044
## 31 60 632 5 18 0.9573734 0.007732478 0.9719932 0.9429734
## 32 72 609 9 13 0.9432250 0.009188131 0.9603648 0.9263910
## 33 84 587 7 18 0.9319770 0.010246081 0.9508821 0.9134477
## 34 96 562 4 51 0.9253437 0.010850688 0.9452337 0.9058722
## 35 108 507 0 507 0.9253437 0.010850688 0.9452337 0.9058722
## 36 12 43 1 3 0.9767442 0.023531040 1.0000000 0.9327198
## 37 24 39 1 0 0.9516995 0.035049589 1.0000000 0.8885166
## 38 36 38 0 3 0.9516995 0.035049589 1.0000000 0.8885166
## 39 48 35 1 6 0.9245081 0.045484171 1.0000000 0.8456575
## 40 60 28 0 1 0.9245081 0.045484171 1.0000000 0.8456575
## 41 72 27 1 0 0.8902670 0.059104241 0.9996084 0.7928858
## 42 84 26 1 0 0.8560260 0.070934990 0.9837096 0.7449154
## 43 96 25 0 3 0.8560260 0.070934990 0.9837096 0.7449154
## 44 108 22 0 22 0.8560260 0.070934990 0.9837096 0.7449154
## strata race_cat
## 1 race_cat=White or Caucasian White or Caucasian
## 2 race_cat=White or Caucasian White or Caucasian
## 3 race_cat=White or Caucasian White or Caucasian
## 4 race_cat=White or Caucasian White or Caucasian
## 5 race_cat=White or Caucasian White or Caucasian
## 6 race_cat=White or Caucasian White or Caucasian
## 7 race_cat=White or Caucasian White or Caucasian
## 8 race_cat=White or Caucasian White or Caucasian
## 9 race_cat=White or Caucasian White or Caucasian
## 10 race_cat=White or Caucasian White or Caucasian
## 11 race_cat=White or Caucasian White or Caucasian
## 12 race_cat=White or Caucasian White or Caucasian
## 13 race_cat=Other Non-white Other Non-white
## 14 race_cat=Other Non-white Other Non-white
## 15 race_cat=Other Non-white Other Non-white
## 16 race_cat=Other Non-white Other Non-white
## 17 race_cat=Other Non-white Other Non-white
## 18 race_cat=Other Non-white Other Non-white
## 19 race_cat=Other Non-white Other Non-white
## 20 race_cat=Other Non-white Other Non-white
## 21 race_cat=Other Non-white Other Non-white
## 22 race_cat=Other Non-white Other Non-white
## 23 race_cat=Other Non-white Other Non-white
## 24 race_cat=Black or African American Black or African American
## 25 race_cat=Black or African American Black or African American
## 26 race_cat=Black or African American Black or African American
## 27 race_cat=Black or African American Black or African American
## 28 race_cat=Black or African American Black or African American
## 29 race_cat=Black or African American Black or African American
## 30 race_cat=Black or African American Black or African American
## 31 race_cat=Black or African American Black or African American
## 32 race_cat=Black or African American Black or African American
## 33 race_cat=Black or African American Black or African American
## 34 race_cat=Black or African American Black or African American
## 35 race_cat=Black or African American Black or African American
## 36 race_cat=Asian Asian
## 37 race_cat=Asian Asian
## 38 race_cat=Asian Asian
## 39 race_cat=Asian Asian
## 40 race_cat=Asian Asian
## 41 race_cat=Asian Asian
## 42 race_cat=Asian Asian
## 43 race_cat=Asian Asian
## 44 race_cat=Asian Asian
age_groupkm <- survfit(Surv(time, cens) ~ age_group, data = OAI_sa)
ggsurvplot(km,
ylim = c(0.85, 1),
font.x = c(12, "bold"),
font.y = c(12, "bold"),
conf.int = FALSE,
pval = TRUE,
test.for.trend = TRUE,
pval.coord = c(5, 0.87),
pval.method = TRUE,
pval.method.coord = c(5, 0.89),
risk.table = "nrisk_cumevents",
tables.height = 0.3,
fontsize = 2.5,
linetype = "strata",
legend = "top",
legend.title = "Age",
legend.labs = c("45-59", "60-74", ">= 75")
)
Thr log-rank for trend is statistically different. Since we are comparing more than two curves, let’s use a pairwise comparison to draw any further conclusions.
res <- pairwise_survdiff(Surv(time, cens) ~ age_group, data = OAI_sa)
res
##
## Pairwise comparisons using Log-Rank test
##
## data: OAI_sa and age_group
##
## 45-59 60-74
## 60-74 2.9e-09 -
## 75+ 3.5e-07 0.25
##
## P value adjustment method: BH
surv_summary(km, data = OAI_sa)
## time n.risk n.event n.censor surv std.err upper lower
## 1 0 2152 7 0 0.9967472 0.001231443 0.9991559 0.9943444
## 2 12 2145 9 54 0.9925651 0.001865685 0.9962012 0.9889422
## 3 18 2082 3 1 0.9911348 0.002043004 0.9951115 0.9871741
## 4 24 2078 11 48 0.9858882 0.002595158 0.9909156 0.9808863
## 5 30 2019 4 1 0.9839350 0.002778140 0.9893072 0.9785920
## 6 36 2014 11 62 0.9785610 0.003231850 0.9847792 0.9723821
## 7 48 1941 19 123 0.9689821 0.003941810 0.9764972 0.9615247
## 8 60 1799 17 29 0.9598255 0.004565164 0.9684521 0.9512757
## 9 72 1753 22 16 0.9477798 0.005300077 0.9576766 0.9379852
## 10 84 1715 17 31 0.9383849 0.005824826 0.9491593 0.9277328
## 11 96 1667 19 121 0.9276894 0.006390985 0.9393828 0.9161416
## 12 108 1527 0 1527 0.9276894 0.006390985 0.9393828 0.9161416
## 13 0 2081 15 0 0.9927919 0.001867860 0.9964331 0.9891640
## 14 12 2066 19 50 0.9836617 0.002825169 0.9891236 0.9782300
## 15 18 1997 1 2 0.9831691 0.002869226 0.9887136 0.9776557
## 16 24 1994 25 41 0.9708425 0.003820990 0.9781405 0.9635991
## 17 30 1928 4 4 0.9688283 0.003959581 0.9763763 0.9613387
## 18 36 1920 27 40 0.9552042 0.004806971 0.9642462 0.9462470
## 19 48 1853 30 141 0.9397395 0.005655786 0.9502146 0.9293799
## 20 60 1682 27 36 0.9246545 0.006456562 0.9364300 0.9130270
## 21 72 1619 33 24 0.9058073 0.007385052 0.9190137 0.8927907
## 22 84 1562 31 57 0.8878303 0.008215960 0.9022428 0.8736481
## 23 96 1474 33 113 0.8679535 0.009112545 0.8835947 0.8525893
## 24 108 1328 0 1328 0.8679535 0.009112545 0.8835947 0.8525893
## 25 0 417 4 0 0.9904077 0.004819333 0.9998071 0.9810966
## 26 12 413 5 16 0.9784173 0.007273160 0.9924646 0.9645687
## 27 24 392 8 4 0.9584496 0.010297821 0.9779908 0.9392988
## 28 30 380 0 2 0.9584496 0.010297821 0.9779908 0.9392988
## 29 36 378 8 7 0.9381649 0.012776743 0.9619550 0.9149632
## 30 48 363 9 46 0.9149046 0.015273604 0.9427070 0.8879222
## 31 60 308 8 16 0.8911409 0.017884716 0.9229323 0.8604445
## 32 72 284 3 12 0.8817274 0.018906485 0.9150137 0.8496521
## 33 84 269 5 23 0.8653384 0.020684822 0.9011414 0.8309579
## 34 96 241 4 39 0.8509760 0.022313528 0.8890181 0.8145617
## 35 108 198 0 198 0.8509760 0.022313528 0.8890181 0.8145617
## strata age_group
## 1 age_group=45-59 45-59
## 2 age_group=45-59 45-59
## 3 age_group=45-59 45-59
## 4 age_group=45-59 45-59
## 5 age_group=45-59 45-59
## 6 age_group=45-59 45-59
## 7 age_group=45-59 45-59
## 8 age_group=45-59 45-59
## 9 age_group=45-59 45-59
## 10 age_group=45-59 45-59
## 11 age_group=45-59 45-59
## 12 age_group=45-59 45-59
## 13 age_group=60-74 60-74
## 14 age_group=60-74 60-74
## 15 age_group=60-74 60-74
## 16 age_group=60-74 60-74
## 17 age_group=60-74 60-74
## 18 age_group=60-74 60-74
## 19 age_group=60-74 60-74
## 20 age_group=60-74 60-74
## 21 age_group=60-74 60-74
## 22 age_group=60-74 60-74
## 23 age_group=60-74 60-74
## 24 age_group=60-74 60-74
## 25 age_group=75+ 75+
## 26 age_group=75+ 75+
## 27 age_group=75+ 75+
## 28 age_group=75+ 75+
## 29 age_group=75+ 75+
## 30 age_group=75+ 75+
## 31 age_group=75+ 75+
## 32 age_group=75+ 75+
## 33 age_group=75+ 75+
## 34 age_group=75+ 75+
## 35 age_group=75+ 75+
bmi_catkm <- survfit(Surv(time, cens) ~ bmi_cat, data = OAI_sa)
ggsurvplot(km,
ylim = c(0.85, 1),
font.x = c(12, "bold"),
font.y = c(12, "bold"),
conf.int = FALSE,
pval = TRUE,
pval.coord = c(5, 0.87),
pval.method = TRUE,
pval.method.coord = c(5, 0.89),
risk.table = "nrisk_cumevents",
tables.height = 0.4,
fontsize = 2.5,
linetype = "strata",
legend = "top",
legend.title = "BMI",
legend.labs = c("Healthy", "Morb. obese", "Obese",
"Overweight", "Underweight")
)
Overall, the curves are significantly different. Since we are comparing more than two curves, we must use a pairwise comparison to draw any conclusions.
res <- pairwise_survdiff(Surv(time, cens) ~ bmi_cat, data = OAI_sa)
res
##
## Pairwise comparisons using Log-Rank test
##
## data: OAI_sa and bmi_cat
##
## Healthy Morbidly obese Obese Overweight
## Morbidly obese 0.04856 - - -
## Obese 3.1e-08 0.92535 - -
## Overweight 0.00073 0.50655 0.01953 -
## Underweight 0.42589 0.25124 0.25124 0.29317
##
## P value adjustment method: BH
surv_summary(km, data = OAI_sa)
## time n.risk n.event n.censor surv std.err upper lower
## 1 0 1104 2 0 0.9981884 0.001282152 1.0000000 0.9956831
## 2 12 1102 4 23 0.9945652 0.002224795 0.9989115 0.9902378
## 3 24 1075 5 20 0.9899393 0.003049031 0.9958729 0.9840411
## 4 30 1050 1 1 0.9889965 0.003194446 0.9952081 0.9828238
## 5 36 1048 7 29 0.9823906 0.004076862 0.9902719 0.9745721
## 6 48 1012 13 74 0.9697710 0.005429504 0.9801461 0.9595058
## 7 60 925 8 16 0.9613838 0.006237866 0.9732098 0.9497015
## 8 72 901 5 12 0.9560487 0.006715988 0.9687164 0.9435466
## 9 84 884 11 23 0.9441522 0.007704424 0.9585174 0.9300022
## 10 96 850 5 68 0.9385983 0.008143679 0.9536998 0.9237360
## 11 108 777 0 777 0.9385983 0.008143679 0.9536998 0.9237360
## 12 0 75 1 0 0.9866667 0.013423121 1.0000000 0.9610471
## 13 12 74 1 2 0.9733333 0.019112739 1.0000000 0.9375465
## 14 24 71 0 1 0.9733333 0.019112739 1.0000000 0.9375465
## 15 36 70 2 2 0.9455238 0.028026146 0.9989146 0.8949867
## 16 48 66 0 5 0.9455238 0.028026146 0.9989146 0.8949867
## 17 60 61 1 1 0.9300234 0.032537500 0.9912652 0.8725653
## 18 72 59 1 1 0.9142603 0.036754805 0.9825522 0.8507151
## 19 84 57 1 1 0.8982207 0.040794594 0.9729879 0.8291987
## 20 96 55 2 7 0.8655581 0.048479947 0.9518367 0.7871002
## 21 108 46 0 46 0.8655581 0.048479947 0.9518367 0.7871002
## 22 0 1625 16 0 0.9901538 0.002473747 0.9949662 0.9853647
## 23 12 1609 17 38 0.9796923 0.003571567 0.9865744 0.9728583
## 24 18 1554 1 2 0.9790619 0.003629111 0.9860507 0.9721226
## 25 24 1551 21 38 0.9658057 0.004692536 0.9747294 0.9569637
## 26 30 1492 4 4 0.9632164 0.004880739 0.9724748 0.9540462
## 27 36 1484 19 49 0.9508841 0.005706227 0.9615785 0.9403087
## 28 48 1416 23 118 0.9354390 0.006649919 0.9477109 0.9233260
## 29 60 1275 17 33 0.9229665 0.007404071 0.9364579 0.9096694
## 30 72 1225 33 17 0.8981029 0.008798858 0.9137254 0.8827475
## 31 84 1175 23 37 0.8805230 0.009716565 0.8974525 0.8639129
## 32 96 1115 22 88 0.8631494 0.010604892 0.8812779 0.8453939
## 33 108 1005 0 1005 0.8631494 0.010604892 0.8812779 0.8453939
## 34 0 1827 7 0 0.9961686 0.001450922 0.9990055 0.9933397
## 35 12 1820 11 57 0.9901478 0.002333714 0.9946871 0.9856292
## 36 18 1752 3 1 0.9884523 0.002534809 0.9933753 0.9835537
## 37 24 1748 18 33 0.9782738 0.003518175 0.9850427 0.9715513
## 38 30 1697 3 2 0.9765443 0.003663487 0.9835815 0.9695576
## 39 36 1692 18 28 0.9661556 0.004447039 0.9746134 0.9577711
## 40 48 1646 22 113 0.9532422 0.005292097 0.9631810 0.9434060
## 41 60 1511 26 31 0.9368396 0.006292344 0.9484650 0.9253567
## 42 72 1454 19 22 0.9245976 0.006978524 0.9373308 0.9120373
## 43 84 1413 18 50 0.9128192 0.007604708 0.9265267 0.8993146
## 44 96 1345 27 109 0.8944950 0.008547661 0.9096068 0.8796342
## 45 108 1209 0 1209 0.8944950 0.008547661 0.9096068 0.8796342
## 46 36 15 0 1 1.0000000 0.000000000 1.0000000 1.0000000
## 47 96 14 0 1 1.0000000 0.000000000 1.0000000 1.0000000
## 48 108 13 0 13 1.0000000 0.000000000 1.0000000 1.0000000
## strata bmi_cat
## 1 bmi_cat=Healthy Healthy
## 2 bmi_cat=Healthy Healthy
## 3 bmi_cat=Healthy Healthy
## 4 bmi_cat=Healthy Healthy
## 5 bmi_cat=Healthy Healthy
## 6 bmi_cat=Healthy Healthy
## 7 bmi_cat=Healthy Healthy
## 8 bmi_cat=Healthy Healthy
## 9 bmi_cat=Healthy Healthy
## 10 bmi_cat=Healthy Healthy
## 11 bmi_cat=Healthy Healthy
## 12 bmi_cat=Morbidly obese Morbidly obese
## 13 bmi_cat=Morbidly obese Morbidly obese
## 14 bmi_cat=Morbidly obese Morbidly obese
## 15 bmi_cat=Morbidly obese Morbidly obese
## 16 bmi_cat=Morbidly obese Morbidly obese
## 17 bmi_cat=Morbidly obese Morbidly obese
## 18 bmi_cat=Morbidly obese Morbidly obese
## 19 bmi_cat=Morbidly obese Morbidly obese
## 20 bmi_cat=Morbidly obese Morbidly obese
## 21 bmi_cat=Morbidly obese Morbidly obese
## 22 bmi_cat=Obese Obese
## 23 bmi_cat=Obese Obese
## 24 bmi_cat=Obese Obese
## 25 bmi_cat=Obese Obese
## 26 bmi_cat=Obese Obese
## 27 bmi_cat=Obese Obese
## 28 bmi_cat=Obese Obese
## 29 bmi_cat=Obese Obese
## 30 bmi_cat=Obese Obese
## 31 bmi_cat=Obese Obese
## 32 bmi_cat=Obese Obese
## 33 bmi_cat=Obese Obese
## 34 bmi_cat=Overweight Overweight
## 35 bmi_cat=Overweight Overweight
## 36 bmi_cat=Overweight Overweight
## 37 bmi_cat=Overweight Overweight
## 38 bmi_cat=Overweight Overweight
## 39 bmi_cat=Overweight Overweight
## 40 bmi_cat=Overweight Overweight
## 41 bmi_cat=Overweight Overweight
## 42 bmi_cat=Overweight Overweight
## 43 bmi_cat=Overweight Overweight
## 44 bmi_cat=Overweight Overweight
## 45 bmi_cat=Overweight Overweight
## 46 bmi_cat=Underweight Underweight
## 47 bmi_cat=Underweight Underweight
## 48 bmi_cat=Underweight Underweight
Whereas the log-rank test compares two Kaplan-Meier survival curves, which might be derived from splitting a patient population into subgroups, Cox proportional hazards models are derived from the underlying baseline hazard functions of the patient populations in question and an arbitrary number of covariates. While the log-rank test and Kaplan-Meier plots require categorical variables, Cox regression can deal with continuous variables.
sex, race, age, bmi, stroke and wlkaid as covariatescxmod <- coxph(Surv(time, cens) ~ sex + age + race + bmi + stroke + wlkaid, data = OAI_sa)
summary(cxmod)
## Call:
## coxph(formula = Surv(time, cens) ~ sex + age + race + bmi + stroke +
## wlkaid, data = OAI_sa)
##
## n= 4535, number of events= 431
## (115 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.094300 1.098889 0.098808 0.954 0.339897
## age 0.040913 1.041762 0.005493 7.448 9.46e-14 ***
## race -0.461094 0.630594 0.128756 -3.581 0.000342 ***
## bmi 0.074016 1.076824 0.009853 7.512 5.83e-14 ***
## stroke 0.401883 1.494637 0.220317 1.824 0.068135 .
## wlkaid 0.047472 1.048617 0.411977 0.115 0.908263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.0989 0.9100 0.9054 1.3337
## age 1.0418 0.9599 1.0306 1.0530
## race 0.6306 1.5858 0.4900 0.8116
## bmi 1.0768 0.9287 1.0562 1.0978
## stroke 1.4946 0.6691 0.9705 2.3018
## wlkaid 1.0486 0.9536 0.4677 2.3512
##
## Concordance= 0.651 (se = 0.014 )
## Rsquare= 0.026 (max possible= 0.792 )
## Likelihood ratio test= 118.3 on 6 df, p=<2e-16
## Wald test = 115.7 on 6 df, p=<2e-16
## Score (logrank) test = 117.5 on 6 df, p=<2e-16
coef(cxmod)
## sex age race bmi stroke wlkaid
## 0.09429951 0.04091311 -0.46109358 0.07401585 0.40188304 0.04747201
race_catggadjustedcurves(cxmod,
ylim = c(0.85, 1),
variable = "race_cat",
legend = "bottom",
legend.title = "Race",
legend.labs = c("White/Cauc.", "O. Non-white",
"Black/African A.", "Asian"),
data = OAI_sa)
age_groupggadjustedcurves(cxmod,
ylim = c(0.85, 1),
variable = "age_group",
legend = "bottom",
legend.title = "Age",
legend.labs = c("45-59", "60-74", ">= 75"),
data = OAI_sa)
bmi_catggadjustedcurves(cxmod,
ylim = c(0.85, 1),
variable = "bmi_cat",
legend = "bottom",
legend.title = "BMI",
legend.labs = c("Healthy", "Morb. obese", "Obese",
"Overweight", "Underweight"),
data = OAI_sa)
ggforest(cxmod, data = OAI_sa)
The ggforest() function creates a forest plot for a Cox regression model. This gives us hazard ratio estimates along with confidence intervals and p-values for each variable.
test.ph <- cox.zph(cxmod)
test.ph
## rho chisq p
## sex -0.0188 0.151 0.6975
## age -0.0706 1.797 0.1800
## race -0.0319 0.504 0.4776
## bmi -0.0165 0.101 0.7502
## stroke -0.0209 0.187 0.6652
## wlkaid -0.0974 4.071 0.0436
## GLOBAL NA 7.105 0.3112
If the p-value is not statistically significant for each of the covariates and for the GLOBAL row, we can assume that the PH assumption holds true.
In our case, the p-value for the variable wlkaid is the only one that is statistically significant and so maybe we should remove it from our analysis.
Let’s remove the variable wlkaid from the model and run the test again.
sex, race, age, bmi, and stroke as covariatescxmod <- coxph(Surv(time, cens) ~ sex + age + race + bmi + stroke, data = OAI_sa)
summary(cxmod)
## Call:
## coxph(formula = Surv(time, cens) ~ sex + age + race + bmi + stroke,
## data = OAI_sa)
##
## n= 4556, number of events= 432
## (94 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.092963 1.097421 0.098676 0.942 0.34614
## age 0.040477 1.041307 0.005482 7.384 1.54e-13 ***
## race -0.455676 0.634019 0.127973 -3.561 0.00037 ***
## bmi 0.074056 1.076867 0.009840 7.526 5.23e-14 ***
## stroke 0.401393 1.493905 0.220291 1.822 0.06844 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.097 0.9112 0.9044 1.3316
## age 1.041 0.9603 1.0302 1.0526
## race 0.634 1.5772 0.4934 0.8148
## bmi 1.077 0.9286 1.0563 1.0978
## stroke 1.494 0.6694 0.9701 2.3006
##
## Concordance= 0.65 (se = 0.014 )
## Rsquare= 0.025 (max possible= 0.791 )
## Likelihood ratio test= 117.2 on 5 df, p=<2e-16
## Wald test = 114.7 on 5 df, p=<2e-16
## Score (logrank) test = 116.5 on 5 df, p=<2e-16
test.ph <- cox.zph(cxmod)
test.ph
## rho chisq p
## sex -0.0124 0.0666 0.796
## age -0.0771 2.1381 0.144
## race -0.0348 0.5968 0.440
## bmi -0.0197 0.1447 0.704
## stroke -0.0219 0.2057 0.650
## GLOBAL NA 3.1648 0.675
The PH assumption now holds true for all our variables and our model as a whole.
summary(OAI_sa[6:7])
## age bmi
## Min. :45.00 Min. :16.90
## 1st Qu.:53.00 1st Qu.:25.10
## Median :61.00 Median :28.20
## Mean :61.13 Mean :28.59
## 3rd Qu.:69.00 3rd Qu.:31.70
## Max. :79.00 Max. :48.70
## NA's :4
table(OAI_sa[,4:5])
## sex
## race 1 2
## 0 31 44
## 1 1633 2075
## 2 257 563
## 3 12 31
table(OAI_sa[9])
##
## 0 1
## 4431 133
Let’s create 3 “imaginary” subjects:
subject1 <- data.frame(race = 1, sex = 2, age = 61, bmi = 28, stroke = 0)
subject2 <- data.frame(race = 2, sex = 1, age = 45, bmi = 17, stroke = 0)
subject3 <- data.frame(race = 0, sex = 2, age = 79, bmi = 48, stroke = 1)
pred1 <- survfit(cxmod, subject1, data = OAI_sa)
ggsurvplot(pred1, ylab = "No surgery probability")
pred2 <- survfit(cxmod, subject2, data = OAI_sa)
ggsurvplot(pred2, ylab = "No surgery probability")
pred3 <- survfit(cxmod, subject3, data = OAI_sa)
ggsurvplot(pred3, surv.median.line = "hv", ylab = "No surgery probability")
preds <- list(subject1 = pred1, subject2 = pred2, subject3 = pred3)
ggsurvplot_combine(preds, surv.median.line = "hv", ylab = "No surgery probability", OAI_sa)
As expected, subject3 is the one more likely to undergo surgery and subject2 is the one less likely to undergo surgery.
We can also plot the cumulative hazard function. Let’s do that for subject3.
ggsurvplot(pred3, fun="cumhaz")
As expected, the risk of surgery increases quite considerably with time for subject3.
In survival analysis, a common way to evaluate a model is to consider the relative risk of an event for different instances instead of the absolute survival times for each instance. This can be done by computing the concordance probability or the concordance index (C-index).
The idea is to consider all possible pairs of observations and sort them into concordant and discordant groups based on their outcomes and the model’s predictions.
This value is directly available through the summary() function of the survival package.
summary(cxmod)
## Call:
## coxph(formula = Surv(time, cens) ~ sex + age + race + bmi + stroke,
## data = OAI_sa)
##
## n= 4556, number of events= 432
## (94 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.092963 1.097421 0.098676 0.942 0.34614
## age 0.040477 1.041307 0.005482 7.384 1.54e-13 ***
## race -0.455676 0.634019 0.127973 -3.561 0.00037 ***
## bmi 0.074056 1.076867 0.009840 7.526 5.23e-14 ***
## stroke 0.401393 1.493905 0.220291 1.822 0.06844 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.097 0.9112 0.9044 1.3316
## age 1.041 0.9603 1.0302 1.0526
## race 0.634 1.5772 0.4934 0.8148
## bmi 1.077 0.9286 1.0563 1.0978
## stroke 1.494 0.6694 0.9701 2.3006
##
## Concordance= 0.65 (se = 0.014 )
## Rsquare= 0.025 (max possible= 0.791 )
## Likelihood ratio test= 117.2 on 5 df, p=<2e-16
## Wald test = 114.7 on 5 df, p=<2e-16
## Score (logrank) test = 116.5 on 5 df, p=<2e-16
We have a concordance value of 0.65. This value must be between 0 and 1, with 1 representing perfect agreement between model and observation and 0.5 representing random guesses.
To have a better understanding of what this values means, we can also take a closer look to how this value is calculated:
\[C = \frac{n_{c}+ 0.5n_{t}}{n_{c}+ n_{d} + n_{t}}\]
where:
names(cxmod)
## [1] "coefficients" "var" "loglik"
## [4] "score" "iter" "linear.predictors"
## [7] "residuals" "means" "method"
## [10] "concordance" "n" "nevent"
## [13] "terms" "assign" "wald.test"
## [16] "na.action" "y" "formula"
## [19] "call"
cxmod$concordance
## concordant discordant tied.risk tied.time std(c-d)
## 1085625.00 585797.00 99.00 10098.00 47274.56
## [1] 4796 193
new_pain_vars <- subset(pain, select = c(ID, P02KPN, V00SF5))
new_medhist_vars <- subset(medhist, select = c(ID, P02KSURG))
colnames(new_pain_vars)[1] <- "id"
colnames(new_medhist_vars)[1] <- "id"
head(new_pain_vars)
## id P02KPN V00SF5
## 1 9000099 1 5
## 2 9000296 0 5
## 3 9000622 1 3
## 4 9000798 1 4
## 5 9001104 1 4
## 6 9001400 1 5
head(new_medhist_vars)
## id P02KSURG
## 1 9000099 1
## 2 9000296 1
## 3 9000622 0
## 4 9000798 1
## 5 9001104 0
## 6 9001400 0
OAI_sa_new <- merge(x = OAI_sa, y = new_pain_vars, by = "id")
OAI_sa_new <- merge(x = OAI_sa_new, y = new_medhist_vars, by = "id")
write.csv(OAI_sa_new, "./data/final_data_COX_PH_t0.csv")
head(OAI_sa_new, 20)
## id time cens race sex age bmi wlkaid stroke
## 1 9000099 108 0 1 1 59 23.8 0 0
## 2 9000296 108 0 1 1 69 29.8 0 0
## 3 9000622 12 0 1 2 71 22.7 0 1
## 4 9000798 108 0 1 1 56 32.4 0 0
## 5 9001104 24 0 1 2 72 30.7 0 0
## 6 9001400 108 0 1 2 75 23.5 0 0
## 7 9001695 84 0 1 2 52 28.6 0 0
## 8 9001897 108 0 1 1 72 25.9 0 0
## 9 9002116 108 0 2 1 61 36.5 0 0
## 10 9002316 108 0 1 1 76 25.1 0 0
## 11 9002411 48 0 1 1 79 31.8 0 0
## 12 9002430 48 1 1 1 68 27.4 0 0
## 13 9002817 108 0 1 2 77 22.9 0 0
## 14 9003126 108 0 1 2 63 35.5 0 0
## 15 9003175 108 0 2 2 56 22.5 0 0
## 16 9003316 108 0 2 1 64 36.0 0 0
## 17 9003380 108 0 1 1 64 33.1 0 0
## 18 9003406 108 0 2 1 45 41.3 0 0
## 19 9003430 84 0 1 1 72 30.0 0 0
## 20 9003658 108 0 1 1 50 30.7 0 0
## race_cat age_group bmi_cat P02KPN V00SF5
## 1 White or Caucasian 45-59 Healthy 1 5
## 2 White or Caucasian 60-74 Overweight 0 5
## 3 White or Caucasian 60-74 Healthy 1 3
## 4 White or Caucasian 45-59 Obese 1 4
## 5 White or Caucasian 60-74 Obese 1 4
## 6 White or Caucasian 75+ Healthy 1 5
## 7 White or Caucasian 45-59 Overweight 1 5
## 8 White or Caucasian 60-74 Overweight 1 5
## 9 Black or African American 60-74 Obese 1 3
## 10 White or Caucasian 75+ Overweight 1 5
## 11 White or Caucasian 75+ Obese 0 4
## 12 White or Caucasian 60-74 Overweight 1 4
## 13 White or Caucasian 75+ Healthy 1 5
## 14 White or Caucasian 60-74 Obese 0 3
## 15 Black or African American 45-59 Healthy 1 4
## 16 Black or African American 60-74 Obese 1 5
## 17 White or Caucasian 60-74 Obese 1 3
## 18 Black or African American 45-59 Morbidly obese 1 3
## 19 White or Caucasian 60-74 Obese 1 4
## 20 White or Caucasian 45-59 Obese 1 5
## P02KSURG
## 1 1
## 2 1
## 3 0
## 4 1
## 5 0
## 6 0
## 7 1
## 8 1
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## 18 1
## 19 0
## 20 0
dim(OAI_sa_new)
## [1] 4650 15
new_cxmod <- coxph(Surv(time, cens) ~ sex + age + race + bmi +
+ P02KSURG + P02KPN + V00SF5,
data = OAI_sa_new)
summary(new_cxmod)
## Call:
## coxph(formula = Surv(time, cens) ~ sex + age + race + bmi + +P02KSURG +
## P02KPN + V00SF5, data = OAI_sa_new)
##
## n= 4602, number of events= 435
## (48 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.266703 1.305652 0.101080 2.639 0.008327 **
## age 0.047293 1.048429 0.005474 8.639 < 2e-16 ***
## race -0.434968 0.647285 0.123811 -3.513 0.000443 ***
## bmi 0.057300 1.058974 0.009915 5.779 7.51e-09 ***
## P02KSURG 1.018740 2.769704 0.100722 10.114 < 2e-16 ***
## P02KPN 0.942573 2.566577 0.217070 4.342 1.41e-05 ***
## V00SF5 -0.228088 0.796054 0.045040 -5.064 4.10e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.3057 0.7659 1.0710 1.5917
## age 1.0484 0.9538 1.0372 1.0597
## race 0.6473 1.5449 0.5078 0.8251
## bmi 1.0590 0.9443 1.0386 1.0798
## P02KSURG 2.7697 0.3610 2.2735 3.3742
## P02KPN 2.5666 0.3896 1.6772 3.9276
## V00SF5 0.7961 1.2562 0.7288 0.8695
##
## Concordance= 0.731 (se = 0.014 )
## Rsquare= 0.059 (max possible= 0.791 )
## Likelihood ratio test= 278.7 on 7 df, p=<2e-16
## Wald test = 281.4 on 7 df, p=<2e-16
## Score (logrank) test = 293 on 7 df, p=<2e-16
test.ph <- cox.zph(new_cxmod)
test.ph
## rho chisq p
## sex -0.0132 0.0779 0.7802
## age -0.0905 2.8456 0.0916
## race -0.0388 0.7334 0.3918
## bmi 0.0138 0.0744 0.7851
## P02KSURG -0.0641 1.8264 0.1766
## P02KPN -0.0709 2.1427 0.1433
## V00SF5 0.0735 2.2859 0.1306
## GLOBAL NA 10.3784 0.1681
The PH assumption holds true for all our variables and our model as a whole.
We have improved our model: all our variables are statistically significant and the concordance value is now 0.73.
Let’s create 3 “imaginary” subjects:
subject1 <- data.frame(race = 1, sex = 2, age = 61, bmi = 28, P02KSURG = 0,
P02KPN = 1, V00SF5 = 3)
subject2 <- data.frame(race = 2, sex = 1, age = 45, bmi = 17, P02KSURG = 0,
P02KPN = 0, V00SF5 = 5)
subject3 <- data.frame(race = 0, sex = 2, age = 79, bmi = 48, P02KSURG = 1,
P02KPN = 1, V00SF5 = 1)
pred1 <- survfit(new_cxmod, subject1, data = OAI_sa_new)
ggsurvplot(pred1, ylab = "No surgery probability")
pred2 <- survfit(new_cxmod, subject2, data = OAI_sa_new)
ggsurvplot(pred2, ylab = "No surgery probability")
pred3 <- survfit(new_cxmod, subject3, data = OAI_sa_new)
ggsurvplot(pred3, surv.median.line = "hv", ylab = "No surgery probability")
preds <- list(subject1 = pred1, subject2 = pred2, subject3 = pred3)
ggsurvplot_combine(preds, surv.median.line = "hv", ylab = "No surgery probability", OAI_sa_new)
Like before, subject3 is the one more likely to undergo surgery and subject2 is the one less likely to undergo surgery.
Once again, we can also plot the cumulative hazard function. Let’s do that for subject3.
ggsurvplot(pred3, fun="cumhaz")
As expected, the risk of surgery increases quite considerably with time for subject3.
Even though the variables stroke and wlkaid are not statistically significant and do not increase the concordance value of the model, we will include them for illustration purposes in our dashboard.
dash_cxmod <- coxph(Surv(time, cens) ~ sex + age + race + bmi +
+ P02KSURG + P02KPN + V00SF5 + stroke + wlkaid,
data = OAI_sa_new)
summary(dash_cxmod)
## Call:
## coxph(formula = Surv(time, cens) ~ sex + age + race + bmi + +P02KSURG +
## P02KPN + V00SF5 + stroke + wlkaid, data = OAI_sa_new)
##
## n= 4530, number of events= 431
## (120 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.249904 1.283902 0.101265 2.468 0.013594 *
## age 0.048257 1.049440 0.005541 8.709 < 2e-16 ***
## race -0.465422 0.627870 0.126517 -3.679 0.000234 ***
## bmi 0.058001 1.059716 0.009966 5.820 5.90e-09 ***
## P02KSURG 1.024060 2.784478 0.101014 10.138 < 2e-16 ***
## P02KPN 0.925509 2.523151 0.217142 4.262 2.02e-05 ***
## V00SF5 -0.239530 0.786997 0.045511 -5.263 1.42e-07 ***
## stroke 0.252502 1.287242 0.221470 1.140 0.254238
## wlkaid -0.323792 0.723401 0.414142 -0.782 0.434309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.2839 0.7789 1.0528 1.5658
## age 1.0494 0.9529 1.0381 1.0609
## race 0.6279 1.5927 0.4900 0.8046
## bmi 1.0597 0.9436 1.0392 1.0806
## P02KSURG 2.7845 0.3591 2.2843 3.3941
## P02KPN 2.5232 0.3963 1.6486 3.8617
## V00SF5 0.7870 1.2707 0.7198 0.8604
## stroke 1.2872 0.7769 0.8340 1.9869
## wlkaid 0.7234 1.3824 0.3213 1.6289
##
## Concordance= 0.733 (se = 0.014 )
## Rsquare= 0.061 (max possible= 0.792 )
## Likelihood ratio test= 287.1 on 9 df, p=<2e-16
## Wald test = 291.4 on 9 df, p=<2e-16
## Score (logrank) test = 303.2 on 9 df, p=<2e-16
test.ph <- cox.zph(dash_cxmod)
test.ph
## rho chisq p
## sex -0.02004 0.17692 0.6740
## age -0.08244 2.34309 0.1258
## race -0.03028 0.43824 0.5080
## bmi 0.00437 0.00725 0.9321
## P02KSURG -0.05498 1.32302 0.2501
## P02KPN -0.07167 2.16983 0.1407
## V00SF5 0.06634 1.84177 0.1747
## stroke -0.00484 0.00999 0.9204
## wlkaid -0.08458 3.10383 0.0781
## GLOBAL NA 12.96950 0.1640
The PH assumption holds true for all our variables and our model as a whole.