#Load Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(survival)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tibble)
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.5.2
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(broom.helpers)
## Warning: package 'broom.helpers' was built under R version 4.5.2
##
## Attaching package: 'broom.helpers'
## The following objects are masked from 'package:gtsummary':
##
## all_categorical, all_continuous, all_contrasts, all_dichotomous,
## all_interaction, all_intercepts
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.5.2
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
data(cancer, package="survival")
cancer_data <- rotterdam
head(cancer_data)
## pid year age meno size grade nodes pgr er hormon chemo rtime recur dtime
## 1393 1 1992 74 1 <=20 3 0 35 291 0 0 1799 0 1799
## 1416 2 1984 79 1 20-50 3 0 36 611 0 0 2828 0 2828
## 2962 3 1983 44 0 <=20 2 0 138 0 0 0 6012 0 6012
## 1455 4 1985 70 1 20-50 3 0 0 12 0 0 2624 0 2624
## 977 5 1983 75 1 <=20 3 0 260 409 0 0 4915 0 4915
## 617 6 1983 52 0 <=20 3 0 139 303 0 0 5888 0 5888
## death
## 1393 0
## 1416 0
## 2962 0
## 1455 0
## 977 0
## 617 0
pid patient identifier year year of surgery age age at surgery meno menopausal status (0= premenopausal, 1= postmenopausal) size tumor size, a factor with levels <=20 20-50 >50 grade differentiation grade nodes number of positive lymph nodes pgr progesterone receptors (fmol/l) er estrogen receptors (fmol/l) hormon hormonal treatment (0=no, 1=yes) chemo chemotherapy rtime days to relapse or last follow-up recur 0= no relapse, 1= relapse dtime days to death or last follow-up death 0= alive, 1= dead
corrplot_data <- cancer_data %>% dplyr::select(2:4, 6:15)
corr_data <- cor(corrplot_data, use = "complete.obs")
corrplot::corrplot(corr_data, method = "circle")
cancer_hist_data <- cancer_data %>% dplyr::select(3, 6:9, 12, 14)
ggplot(gather(cancer_hist_data), aes(value)) +
geom_histogram(bins = 30, fill = "green") +
facet_wrap(~key, scales = 'free_x')+
theme_dark() + labs(title = "Distribution of Variables")
Surv(cancer_data$dtime, cancer_data$death)[1:11]
## [1] 1799+ 2828+ 6012+ 2624+ 4915+ 5888+ 2491+ 4150+ 3919+ 3647+ 2133+
scurve1 <- survfit(Surv(dtime, death) ~ 1, data = cancer_data)
str(scurve1)
## List of 17
## $ n : int 2982
## $ time : num [1:2215] 36 45 64 74 97 101 126 129 141 151 ...
## $ n.risk : num [1:2215] 2982 2981 2980 2979 2978 ...
## $ n.event : num [1:2215] 0 1 1 1 1 2 0 0 1 2 ...
## $ n.censor : num [1:2215] 1 0 0 0 0 1 1 1 0 0 ...
## $ surv : num [1:2215] 1 1 0.999 0.999 0.999 ...
## $ std.err : num [1:2215] 0 0.000336 0.000475 0.000581 0.000671 ...
## $ cumhaz : num [1:2215] 0 0.000335 0.000671 0.001007 0.001343 ...
## $ std.chaz : num [1:2215] 0 0.000335 0.000474 0.000581 0.000671 ...
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:2215] 1 0.999 0.998 0.998 0.997 ...
## $ upper : num [1:2215] 1 1 1 1 1 ...
## $ t0 : num 0
## $ call : language survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
## - attr(*, "class")= chr "survfit"
#a Overall Kaplan Meier Plot
survfit2(Surv(dtime, death) ~ 1, data = cancer_data) %>%
ggsurvfit(color = "blue") +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
survfit2(Surv(dtime, death) ~ hormon, data = cancer_data) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
#Chemotherapy
survfit2(Surv(dtime, death) ~ chemo, data = cancer_data) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
#Menopause
survfit2(Surv(dtime, death) ~ meno, data = cancer_data) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
#Tumor Size
survfit2(Surv(dtime, death) ~ size, data = cancer_data) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
survfit2(Surv(dtime, death) ~ age, data = cancer_data) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability") +
add_confidence_interval()
#quantify the effect of age as a continuous predictor
cox_age <- coxph(Surv(dtime, death) ~ age, data = cancer_data)
summary(cox_age)
## Call:
## coxph(formula = Surv(dtime, death) ~ age, data = cancer_data)
##
## n= 2982, number of events= 1272
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.018803 1.018981 0.002248 8.366 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.019 0.9814 1.015 1.023
##
## Concordance= 0.553 (se = 0.009 )
## Likelihood ratio test= 69.99 on 1 df, p=<2e-16
## Wald test = 69.98 on 1 df, p=<2e-16
## Score (logrank) test = 70.5 on 1 df, p=<2e-16
summary(survfit(Surv(dtime, death) ~ 1, data = cancer_data), times = 365.25)
## Call: survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 365 2914 59 0.98 0.00256 0.975 0.985
Survival of 98%!
#nice output table
survfit(Surv(dtime, death) ~ 1, data = cancer_data) |>
tbl_survfit(
times = 365.25,
label_header = "**1-year survival (95% CI)**"
)
| Characteristic | 1-year survival (95% CI) |
|---|---|
| Overall | 98% (98%, 99%) |
summary(survfit(Surv(dtime, death) ~ 1, data = cancer_data), times = (365.25*2))
## Call: survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 730 2743 218 0.927 0.00479 0.917 0.936
Decline to about 92% which isnt bad
cancer_data %>%
filter(death == 1) %>%
summarize(median_surv = median(dtime))
## median_surv
## 1 1537.5
#median survival for those using hormonal therapy
cancer_data %>%
filter(death == 1) %>%
group_by(hormon) %>%
summarize(median_surv = median(dtime))
## # A tibble: 2 × 2
## hormon median_surv
## <int> <dbl>
## 1 0 1575
## 2 1 1265
#median survival for those using chemo
cancer_data %>%
filter(death == 1) %>%
group_by(chemo) %>%
summarize(median_surv = median(dtime))
## # A tibble: 2 × 2
## chemo median_surv
## <int> <dbl>
## 1 0 1542.
## 2 1 1494
#comparing survival times between groups - is the difference between the hormone therapy groups statistically significant?
survdiff(Surv(dtime, death) ~ hormon, data = cancer_data)
## Call:
## survdiff(formula = Surv(dtime, death) ~ hormon, data = cancer_data)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## hormon=0 2643 1113 1162 2.04 23.7
## hormon=1 339 159 110 21.43 23.7
##
## Chisq= 23.7 on 1 degrees of freedom, p= 1e-06
#survival times for chemo
survdiff(Surv(dtime, death) ~ chemo, data = cancer_data)
## Call:
## survdiff(formula = Surv(dtime, death) ~ chemo, data = cancer_data)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## chemo=0 2402 1014 1024 0.0963 0.495
## chemo=1 580 258 248 0.3977 0.495
##
## Chisq= 0.5 on 1 degrees of freedom, p= 0.5
#run a cox model
coxph(Surv(dtime, death) ~ hormon, data = cancer_data) %>%
#exponentiate coefficients
tbl_regression(exp = TRUE)
| Characteristic | HR | 95% CI | p-value |
|---|---|---|---|
| hormon | 1.51 | 1.28, 1.79 | <0.001 |
| Abbreviations: CI = Confidence Interval, HR = Hazard Ratio | |||
A HR < 1 indicates a reduced hazard of death, while a HR > 1 indicates an increased hazard of death. So, 1.5 as many patients with hormonal therapy are dying than patients without out (e.g. if 100 patients with no hormonal therapy die, only 69 with the therapy would have died)
cox_fit <- coxph(Surv(dtime, death) ~ age + size + grade + nodes + hormon + chemo,
data = cancer_data)
tbl_regression(cox_fit, exponentiate = TRUE)
| Characteristic | HR | 95% CI | p-value |
|---|---|---|---|
| age | 1.02 | 1.01, 1.02 | <0.001 |
| size | |||
|     <=20 | — | — | |
| Â Â Â Â 20-50 | 1.57 | 1.38, 1.78 | <0.001 |
| Â Â Â Â >50 | 2.28 | 1.91, 2.72 | <0.001 |
| grade | 1.42 | 1.24, 1.63 | <0.001 |
| nodes | 1.08 | 1.07, 1.09 | <0.001 |
| hormon | 0.96 | 0.80, 1.14 | 0.6 |
| chemo | 1.03 | 0.88, 1.20 | 0.7 |
| Abbreviations: CI = Confidence Interval, HR = Hazard Ratio | |||
#assess the fit of the cox model
mv_fit <- coxph(Surv(dtime, death) ~ hormon, data = cancer_data)
cz <- cox.zph(mv_fit)
cz
## chisq df p
## hormon 0.125 1 0.72
## GLOBAL 0.125 1 0.72
Signficant p value means assumption is violated
plot(cz)
#try it again with the menopause variable
mv_fit <- coxph(Surv(dtime, death) ~ meno, data = cancer_data)
cmz <- cox.zph(mv_fit)
cmz
## chisq df p
## meno 2.77 1 0.096
## GLOBAL 2.77 1 0.096
plot(cmz)
#try it again with the chemo variable
mvc_fit <- coxph(Surv(dtime, death) ~ chemo, data = cancer_data)
ccz <- cox.zph(mvc_fit)
ccz
## chisq df p
## chemo 1.16 1 0.28
## GLOBAL 1.16 1 0.28
plot(ccz)
#Propensity Score Matching
m1 <- lm(dtime ~ hormon + age + size + grade + nodes, data = cancer_data)
summary(m1)
##
## Call:
## lm(formula = dtime ~ hormon + age + size + grade + nodes, data = cancer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3118.3 -911.0 -22.9 805.1 4366.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4095.580 167.846 24.401 < 2e-16 ***
## hormon -267.972 73.684 -3.637 0.000281 ***
## age -7.180 1.763 -4.073 4.76e-05 ***
## size20-50 -170.410 48.575 -3.508 0.000458 ***
## size>50 -486.251 83.032 -5.856 5.25e-09 ***
## grade -276.789 50.973 -5.430 6.09e-08 ***
## nodes -68.000 5.598 -12.147 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1213 on 2975 degrees of freedom
## Multiple R-squared: 0.1282, Adjusted R-squared: 0.1264
## F-statistic: 72.88 on 6 and 2975 DF, p-value: < 2.2e-16
plot(m1)
matched_data <- matchit(hormon ~ age + size + grade + nodes,
data = cancer_data,
method = "nearest",
distance = "mahalanobis",
replace = FALSE)
#compare matches
summary(matched_data)
##
## Call:
## matchit(formula = hormon ~ age + size + grade + nodes, data = cancer_data,
## method = "nearest", distance = "mahalanobis", replace = FALSE)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age 62.5487 54.0976 0.8519 0.5838 0.1262
## size<=20 0.3068 0.4854 -0.3874 . 0.1786
## size20-50 0.5074 0.4234 0.1680 . 0.0840
## size>50 0.1858 0.0912 0.2433 . 0.0947
## grade 2.8260 2.7219 0.2740 0.7179 0.0520
## nodes 5.7198 2.3265 0.7416 1.1831 0.1228
## eCDF Max
## age 0.3665
## size<=20 0.1786
## size20-50 0.0840
## size>50 0.0947
## grade 0.1041
## nodes 0.5433
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age 62.5487 62.3569 0.0193 1.0009 0.0055
## size<=20 0.3068 0.3068 0.0000 . 0.0000
## size20-50 0.5074 0.5074 0.0000 . 0.0000
## size>50 0.1858 0.1858 0.0000 . 0.0000
## grade 2.8260 2.8260 0.0000 1.0000 0.0000
## nodes 5.7198 5.5870 0.0290 1.0574 0.0053
## eCDF Max Std. Pair Dist.
## age 0.0295 0.1264
## size<=20 0.0000 0.0000
## size20-50 0.0000 0.0000
## size>50 0.0000 0.0000
## grade 0.0000 0.0000
## nodes 0.0265 0.0638
##
## Sample Sizes:
## Control Treated
## All 2643 339
## Matched 339 339
## Unmatched 2304 0
## Discarded 0 0
matched_data <- matchit(hormon ~ age + size + grade + nodes,
data = cancer_data,
method = "nearest",
distance = "mahalanobis",
replace = TRUE)
summary(matched_data)
##
## Call:
## matchit(formula = hormon ~ age + size + grade + nodes, data = cancer_data,
## method = "nearest", distance = "mahalanobis", replace = TRUE)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age 62.5487 54.0976 0.8519 0.5838 0.1262
## size<=20 0.3068 0.4854 -0.3874 . 0.1786
## size20-50 0.5074 0.4234 0.1680 . 0.0840
## size>50 0.1858 0.0912 0.2433 . 0.0947
## grade 2.8260 2.7219 0.2740 0.7179 0.0520
## nodes 5.7198 2.3265 0.7416 1.1831 0.1228
## eCDF Max
## age 0.3665
## size<=20 0.1786
## size20-50 0.0840
## size>50 0.0947
## grade 0.1041
## nodes 0.5433
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age 62.5487 62.5074 0.0042 1.0009 0.0041
## size<=20 0.3068 0.3068 0.0000 . 0.0000
## size20-50 0.5074 0.5074 0.0000 . 0.0000
## size>50 0.1858 0.1858 0.0000 . 0.0000
## grade 2.8260 2.8260 0.0000 0.9982 0.0000
## nodes 5.7198 5.6549 0.0142 1.0409 0.0031
## eCDF Max Std. Pair Dist.
## age 0.0177 0.1059
## size<=20 0.0000 0.0000
## size20-50 0.0000 0.0000
## size>50 0.0000 0.0000
## grade 0.0000 0.0000
## nodes 0.0118 0.0348
##
## Sample Sizes:
## Control Treated
## All 2643. 339
## Matched (ESS) 212.42 339
## Matched 256. 339
## Unmatched 2387. 0
## Discarded 0. 0
matched_data <- match.data(matched_data)
model_matched <- lm(dtime ~ hormon,
data = matched_data)
summary(model_matched)
##
## Call:
## lm(formula = dtime ~ hormon, data = matched_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2354.3 -1002.4 -44.5 870.6 4239.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2418.29 75.78 31.911 < 2e-16 ***
## hormon -387.76 100.40 -3.862 0.000125 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1213 on 593 degrees of freedom
## Multiple R-squared: 0.02454, Adjusted R-squared: 0.02289
## F-statistic: 14.92 on 1 and 593 DF, p-value: 0.0001247
model_matched_wts <- lm(dtime ~ hormon,
data = matched_data,
weights = weights)
summary(model_matched_wts)
##
## Call:
## lm(formula = dtime ~ hormon, data = matched_data, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3368.1 -939.4 -17.5 873.9 4239.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2355.92 75.86 31.055 < 2e-16 ***
## hormon -325.39 100.50 -3.238 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1214 on 593 degrees of freedom
## Multiple R-squared: 0.01737, Adjusted R-squared: 0.01571
## F-statistic: 10.48 on 1 and 593 DF, p-value: 0.001273
model_hormon <- glm(hormon ~ age + size + grade + nodes,
data = cancer_data,
family = binomial(link = "logit"))
summary(model_hormon)
##
## Call:
## glm(formula = hormon ~ age + size + grade + nodes, family = binomial(link = "logit"),
## data = cancer_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.777266 0.545828 -12.416 < 2e-16 ***
## age 0.050332 0.004993 10.080 < 2e-16 ***
## size20-50 0.236451 0.140385 1.684 0.09212 .
## size>50 0.214774 0.198598 1.081 0.27950
## grade 0.437331 0.158153 2.765 0.00569 **
## nodes 0.114359 0.011724 9.754 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2112.1 on 2981 degrees of freedom
## Residual deviance: 1847.2 on 2976 degrees of freedom
## AIC: 1859.2
##
## Number of Fisher Scoring iterations: 5
data_probs <- augment_columns(model_hormon,
data = cancer_data,
type.predict = "response") %>%
rename(propensity = .fitted)
hormon_ipw <- data_probs %>%
mutate(ipw = (hormon / propensity) + ((1 - hormon) / (1 - propensity)))
model_ipw <- lm(dtime ~ hormon,
data = hormon_ipw,
weights = ipw)
summary(model_ipw)
##
## Call:
## lm(formula = dtime ~ hormon, data = hormon_ipw, weights = ipw)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -6846.1 -1139.3 47.6 1073.8 25384.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2621.76 31.26 83.879 < 2e-16 ***
## hormon -284.76 45.84 -6.212 5.94e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1716 on 2980 degrees of freedom
## Multiple R-squared: 0.01279, Adjusted R-squared: 0.01245
## F-statistic: 38.59 on 1 and 2980 DF, p-value: 5.944e-10
#Cox model on matched data
cox_matched <- coxph(Surv(dtime, death) ~ hormon, data = matched_data,
weights = weights)
summary(cox_matched)
## Call:
## coxph(formula = Surv(dtime, death) ~ hormon, data = matched_data,
## weights = weights)
##
## n= 595, number of events= 322
##
## coef exp(coef) se(coef) robust se z Pr(>|z|)
## hormon -0.1618 0.8506 0.1127 0.1195 -1.354 0.176
##
## exp(coef) exp(-coef) lower .95 upper .95
## hormon 0.8506 1.176 0.673 1.075
##
## Concordance= 0.524 (se = 0.016 )
## Likelihood ratio test= 2.06 on 1 df, p=0.2
## Wald test = 1.83 on 1 df, p=0.2
## Score (logrank) test = 2.06 on 1 df, p=0.2, Robust = 1.8 p=0.2
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
survfit2(Surv(dtime, death) ~ hormon, data = matched_data,
weights = weights) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability",
title = "Survival by Hormon therapy (Matched Data)") +
add_confidence_interval()
mv_fit <- coxph(Surv(dtime, death) ~ hormon,
data = matched_data,
weights = weights)
cz <- cox.zph(mv_fit)
cz
## chisq df p
## hormon 0.912 1 0.34
## GLOBAL 0.912 1 0.34
plot(cz)
#Cox model with IPW weights
cox_ipw <- coxph(Surv(dtime, death) ~ hormon, data = hormon_ipw,
weights = ipw)
summary(cox_ipw)
## Call:
## coxph(formula = Surv(dtime, death) ~ hormon, data = hormon_ipw,
## weights = ipw)
##
## n= 2982, number of events= 1272
##
## coef exp(coef) se(coef) robust se z Pr(>|z|)
## hormon -0.11334 0.89284 0.04282 0.12219 -0.928 0.354
##
## exp(coef) exp(-coef) lower .95 upper .95
## hormon 0.8928 1.12 0.7027 1.134
##
## Concordance= 0.51 (se = 0.014 )
## Likelihood ratio test= 7.05 on 1 df, p=0.008
## Wald test = 0.86 on 1 df, p=0.4
## Score (logrank) test = 7.01 on 1 df, p=0.008, Robust = 0.89 p=0.3
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
survfit2(Surv(dtime, death) ~ hormon, data = hormon_ipw,
weights = ipw) %>%
ggsurvfit() +
labs(x = "Days", y = "Overall survival probability",
title = "Survival by Hormon therapy (IPW weighted)") +
add_confidence_interval()
mv_fit <- coxph(Surv(dtime, death) ~ hormon,
data = hormon_ipw,
weights = ipw)
cz <- cox.zph(mv_fit)
cz
## chisq df p
## hormon 1.38 1 0.24
## GLOBAL 1.38 1 0.24
plot(cz)