#Load Packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(survival)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tibble)
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.5.2
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(broom.helpers)
## Warning: package 'broom.helpers' was built under R version 4.5.2
## 
## Attaching package: 'broom.helpers'
## The following objects are masked from 'package:gtsummary':
## 
##     all_categorical, all_continuous, all_contrasts, all_dichotomous,
##     all_interaction, all_intercepts
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.5.2
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
data(cancer, package="survival")

cancer_data <- rotterdam

head(cancer_data)
##      pid year age meno  size grade nodes pgr  er hormon chemo rtime recur dtime
## 1393   1 1992  74    1  <=20     3     0  35 291      0     0  1799     0  1799
## 1416   2 1984  79    1 20-50     3     0  36 611      0     0  2828     0  2828
## 2962   3 1983  44    0  <=20     2     0 138   0      0     0  6012     0  6012
## 1455   4 1985  70    1 20-50     3     0   0  12      0     0  2624     0  2624
## 977    5 1983  75    1  <=20     3     0 260 409      0     0  4915     0  4915
## 617    6 1983  52    0  <=20     3     0 139 303      0     0  5888     0  5888
##      death
## 1393     0
## 1416     0
## 2962     0
## 1455     0
## 977      0
## 617      0

pid patient identifier year year of surgery age age at surgery meno menopausal status (0= premenopausal, 1= postmenopausal) size tumor size, a factor with levels <=20 20-50 >50 grade differentiation grade nodes number of positive lymph nodes pgr progesterone receptors (fmol/l) er estrogen receptors (fmol/l) hormon hormonal treatment (0=no, 1=yes) chemo chemotherapy rtime days to relapse or last follow-up recur 0= no relapse, 1= relapse dtime days to death or last follow-up death 0= alive, 1= dead

corrplot_data <- cancer_data %>% dplyr::select(2:4, 6:15)
corr_data <- cor(corrplot_data, use = "complete.obs")
corrplot::corrplot(corr_data, method = "circle")

cancer_hist_data <- cancer_data %>% dplyr::select(3, 6:9, 12, 14)

ggplot(gather(cancer_hist_data), aes(value)) +
    geom_histogram(bins = 30, fill = "green") +
    facet_wrap(~key, scales = 'free_x')+
    theme_dark() + labs(title = "Distribution of Variables")

Surv(cancer_data$dtime, cancer_data$death)[1:11]
##  [1] 1799+ 2828+ 6012+ 2624+ 4915+ 5888+ 2491+ 4150+ 3919+ 3647+ 2133+
scurve1 <- survfit(Surv(dtime, death) ~ 1, data = cancer_data)
str(scurve1)
## List of 17
##  $ n        : int 2982
##  $ time     : num [1:2215] 36 45 64 74 97 101 126 129 141 151 ...
##  $ n.risk   : num [1:2215] 2982 2981 2980 2979 2978 ...
##  $ n.event  : num [1:2215] 0 1 1 1 1 2 0 0 1 2 ...
##  $ n.censor : num [1:2215] 1 0 0 0 0 1 1 1 0 0 ...
##  $ surv     : num [1:2215] 1 1 0.999 0.999 0.999 ...
##  $ std.err  : num [1:2215] 0 0.000336 0.000475 0.000581 0.000671 ...
##  $ cumhaz   : num [1:2215] 0 0.000335 0.000671 0.001007 0.001343 ...
##  $ std.chaz : num [1:2215] 0 0.000335 0.000474 0.000581 0.000671 ...
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:2215] 1 0.999 0.998 0.998 0.997 ...
##  $ upper    : num [1:2215] 1 1 1 1 1 ...
##  $ t0       : num 0
##  $ call     : language survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
##  - attr(*, "class")= chr "survfit"
#a Overall Kaplan Meier Plot
survfit2(Surv(dtime, death) ~ 1, data = cancer_data) %>%  
  ggsurvfit(color = "blue") +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

survfit2(Surv(dtime, death) ~ hormon, data = cancer_data) %>%  
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

#Chemotherapy
survfit2(Surv(dtime, death) ~ chemo, data = cancer_data) %>%  
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

#Menopause
survfit2(Surv(dtime, death) ~ meno, data = cancer_data) %>%  
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

#Tumor Size
survfit2(Surv(dtime, death) ~ size, data = cancer_data) %>%
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

survfit2(Surv(dtime, death) ~ age, data = cancer_data) %>%  
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability") +
  add_confidence_interval()

#quantify the effect of age as a continuous predictor
cox_age <- coxph(Surv(dtime, death) ~ age, data = cancer_data)
summary(cox_age)
## Call:
## coxph(formula = Surv(dtime, death) ~ age, data = cancer_data)
## 
##   n= 2982, number of events= 1272 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.018803  1.018981 0.002248 8.366   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.019     0.9814     1.015     1.023
## 
## Concordance= 0.553  (se = 0.009 )
## Likelihood ratio test= 69.99  on 1 df,   p=<2e-16
## Wald test            = 69.98  on 1 df,   p=<2e-16
## Score (logrank) test = 70.5  on 1 df,   p=<2e-16
summary(survfit(Surv(dtime, death) ~ 1, data = cancer_data), times = 365.25)
## Call: survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   365   2914      59     0.98 0.00256        0.975        0.985

Survival of 98%!

#nice output table
survfit(Surv(dtime, death) ~ 1, data = cancer_data) |> 
  tbl_survfit(
    times = 365.25,
    label_header = "**1-year survival (95% CI)**"
  )
Characteristic 1-year survival (95% CI)
Overall 98% (98%, 99%)
summary(survfit(Surv(dtime, death) ~ 1, data = cancer_data), times = (365.25*2))
## Call: survfit(formula = Surv(dtime, death) ~ 1, data = cancer_data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   730   2743     218    0.927 0.00479        0.917        0.936

Decline to about 92% which isnt bad

cancer_data %>%  
  filter(death == 1) %>% 
  summarize(median_surv = median(dtime))
##   median_surv
## 1      1537.5
#median survival for those using hormonal therapy
cancer_data %>%  
  filter(death == 1) %>% 
  group_by(hormon) %>% 
  summarize(median_surv = median(dtime))
## # A tibble: 2 × 2
##   hormon median_surv
##    <int>       <dbl>
## 1      0        1575
## 2      1        1265
#median survival for those using chemo
cancer_data %>%  
  filter(death == 1) %>% 
  group_by(chemo) %>% 
  summarize(median_surv = median(dtime))
## # A tibble: 2 × 2
##   chemo median_surv
##   <int>       <dbl>
## 1     0       1542.
## 2     1       1494
#comparing survival times between groups - is the difference between the hormone therapy groups statistically significant?
survdiff(Surv(dtime, death) ~ hormon, data = cancer_data)
## Call:
## survdiff(formula = Surv(dtime, death) ~ hormon, data = cancer_data)
## 
##             N Observed Expected (O-E)^2/E (O-E)^2/V
## hormon=0 2643     1113     1162      2.04      23.7
## hormon=1  339      159      110     21.43      23.7
## 
##  Chisq= 23.7  on 1 degrees of freedom, p= 1e-06
#survival times for chemo
survdiff(Surv(dtime, death) ~ chemo, data = cancer_data)
## Call:
## survdiff(formula = Surv(dtime, death) ~ chemo, data = cancer_data)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## chemo=0 2402     1014     1024    0.0963     0.495
## chemo=1  580      258      248    0.3977     0.495
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5
#run a cox model
coxph(Surv(dtime, death) ~ hormon, data = cancer_data) %>% 
  #exponentiate coefficients
  tbl_regression(exp = TRUE)
Characteristic HR 95% CI p-value
hormon 1.51 1.28, 1.79 <0.001
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

A HR < 1 indicates a reduced hazard of death, while a HR > 1 indicates an increased hazard of death. So, 1.5 as many patients with hormonal therapy are dying than patients without out (e.g. if 100 patients with no hormonal therapy die, only 69 with the therapy would have died)

cox_fit <- coxph(Surv(dtime, death) ~ age + size + grade + nodes + hormon + chemo,
                 data = cancer_data)
tbl_regression(cox_fit, exponentiate = TRUE)
Characteristic HR 95% CI p-value
age 1.02 1.01, 1.02 <0.001
size


    <=20 — —
    20-50 1.57 1.38, 1.78 <0.001
    >50 2.28 1.91, 2.72 <0.001
grade 1.42 1.24, 1.63 <0.001
nodes 1.08 1.07, 1.09 <0.001
hormon 0.96 0.80, 1.14 0.6
chemo 1.03 0.88, 1.20 0.7
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio
#assess the fit of the cox model
mv_fit <- coxph(Surv(dtime, death) ~ hormon, data = cancer_data)
cz <- cox.zph(mv_fit)
cz
##        chisq df    p
## hormon 0.125  1 0.72
## GLOBAL 0.125  1 0.72

Signficant p value means assumption is violated

plot(cz)

#try it again with the menopause variable
mv_fit <- coxph(Surv(dtime, death) ~ meno, data = cancer_data)
cmz <- cox.zph(mv_fit)
cmz
##        chisq df     p
## meno    2.77  1 0.096
## GLOBAL  2.77  1 0.096
plot(cmz)

#try it again with the chemo variable
mvc_fit <- coxph(Surv(dtime, death) ~ chemo, data = cancer_data)
ccz <- cox.zph(mvc_fit)
ccz
##        chisq df    p
## chemo   1.16  1 0.28
## GLOBAL  1.16  1 0.28
plot(ccz)

#Propensity Score Matching
m1 <- lm(dtime ~ hormon + age + size + grade + nodes, data = cancer_data)
summary(m1)
## 
## Call:
## lm(formula = dtime ~ hormon + age + size + grade + nodes, data = cancer_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3118.3  -911.0   -22.9   805.1  4366.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4095.580    167.846  24.401  < 2e-16 ***
## hormon      -267.972     73.684  -3.637 0.000281 ***
## age           -7.180      1.763  -4.073 4.76e-05 ***
## size20-50   -170.410     48.575  -3.508 0.000458 ***
## size>50     -486.251     83.032  -5.856 5.25e-09 ***
## grade       -276.789     50.973  -5.430 6.09e-08 ***
## nodes        -68.000      5.598 -12.147  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1213 on 2975 degrees of freedom
## Multiple R-squared:  0.1282, Adjusted R-squared:  0.1264 
## F-statistic: 72.88 on 6 and 2975 DF,  p-value: < 2.2e-16
plot(m1)

matched_data <- matchit(hormon ~ age + size + grade + nodes,
                        data = cancer_data,
                        method = "nearest",
                        distance = "mahalanobis",
                        replace = FALSE)

#compare matches
summary(matched_data)
## 
## Call:
## matchit(formula = hormon ~ age + size + grade + nodes, data = cancer_data, 
##     method = "nearest", distance = "mahalanobis", replace = FALSE)
## 
## Summary of Balance for All Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age             62.5487       54.0976          0.8519     0.5838    0.1262
## size<=20         0.3068        0.4854         -0.3874          .    0.1786
## size20-50        0.5074        0.4234          0.1680          .    0.0840
## size>50          0.1858        0.0912          0.2433          .    0.0947
## grade            2.8260        2.7219          0.2740     0.7179    0.0520
## nodes            5.7198        2.3265          0.7416     1.1831    0.1228
##           eCDF Max
## age         0.3665
## size<=20    0.1786
## size20-50   0.0840
## size>50     0.0947
## grade       0.1041
## nodes       0.5433
## 
## Summary of Balance for Matched Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age             62.5487       62.3569          0.0193     1.0009    0.0055
## size<=20         0.3068        0.3068          0.0000          .    0.0000
## size20-50        0.5074        0.5074          0.0000          .    0.0000
## size>50          0.1858        0.1858          0.0000          .    0.0000
## grade            2.8260        2.8260          0.0000     1.0000    0.0000
## nodes            5.7198        5.5870          0.0290     1.0574    0.0053
##           eCDF Max Std. Pair Dist.
## age         0.0295          0.1264
## size<=20    0.0000          0.0000
## size20-50   0.0000          0.0000
## size>50     0.0000          0.0000
## grade       0.0000          0.0000
## nodes       0.0265          0.0638
## 
## Sample Sizes:
##           Control Treated
## All          2643     339
## Matched       339     339
## Unmatched    2304       0
## Discarded       0       0
matched_data <- matchit(hormon ~ age + size + grade + nodes,
                        data = cancer_data,
                        method = "nearest",
                        distance = "mahalanobis",
                        replace = TRUE)

summary(matched_data)
## 
## Call:
## matchit(formula = hormon ~ age + size + grade + nodes, data = cancer_data, 
##     method = "nearest", distance = "mahalanobis", replace = TRUE)
## 
## Summary of Balance for All Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age             62.5487       54.0976          0.8519     0.5838    0.1262
## size<=20         0.3068        0.4854         -0.3874          .    0.1786
## size20-50        0.5074        0.4234          0.1680          .    0.0840
## size>50          0.1858        0.0912          0.2433          .    0.0947
## grade            2.8260        2.7219          0.2740     0.7179    0.0520
## nodes            5.7198        2.3265          0.7416     1.1831    0.1228
##           eCDF Max
## age         0.3665
## size<=20    0.1786
## size20-50   0.0840
## size>50     0.0947
## grade       0.1041
## nodes       0.5433
## 
## Summary of Balance for Matched Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## age             62.5487       62.5074          0.0042     1.0009    0.0041
## size<=20         0.3068        0.3068          0.0000          .    0.0000
## size20-50        0.5074        0.5074          0.0000          .    0.0000
## size>50          0.1858        0.1858          0.0000          .    0.0000
## grade            2.8260        2.8260          0.0000     0.9982    0.0000
## nodes            5.7198        5.6549          0.0142     1.0409    0.0031
##           eCDF Max Std. Pair Dist.
## age         0.0177          0.1059
## size<=20    0.0000          0.0000
## size20-50   0.0000          0.0000
## size>50     0.0000          0.0000
## grade       0.0000          0.0000
## nodes       0.0118          0.0348
## 
## Sample Sizes:
##               Control Treated
## All           2643.       339
## Matched (ESS)  212.42     339
## Matched        256.       339
## Unmatched     2387.         0
## Discarded        0.         0
matched_data <- match.data(matched_data)
model_matched <- lm(dtime ~ hormon,
                    data = matched_data)
summary(model_matched)
## 
## Call:
## lm(formula = dtime ~ hormon, data = matched_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2354.3 -1002.4   -44.5   870.6  4239.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2418.29      75.78  31.911  < 2e-16 ***
## hormon       -387.76     100.40  -3.862 0.000125 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1213 on 593 degrees of freedom
## Multiple R-squared:  0.02454,    Adjusted R-squared:  0.02289 
## F-statistic: 14.92 on 1 and 593 DF,  p-value: 0.0001247
model_matched_wts <- lm(dtime ~ hormon,
                    data = matched_data,
                    weights = weights)
summary(model_matched_wts)
## 
## Call:
## lm(formula = dtime ~ hormon, data = matched_data, weights = weights)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -3368.1  -939.4   -17.5   873.9  4239.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2355.92      75.86  31.055  < 2e-16 ***
## hormon       -325.39     100.50  -3.238  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1214 on 593 degrees of freedom
## Multiple R-squared:  0.01737,    Adjusted R-squared:  0.01571 
## F-statistic: 10.48 on 1 and 593 DF,  p-value: 0.001273
model_hormon <- glm(hormon ~ age + size + grade + nodes,
                 data = cancer_data,
                 family = binomial(link = "logit"))
summary(model_hormon)
## 
## Call:
## glm(formula = hormon ~ age + size + grade + nodes, family = binomial(link = "logit"), 
##     data = cancer_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.777266   0.545828 -12.416  < 2e-16 ***
## age          0.050332   0.004993  10.080  < 2e-16 ***
## size20-50    0.236451   0.140385   1.684  0.09212 .  
## size>50      0.214774   0.198598   1.081  0.27950    
## grade        0.437331   0.158153   2.765  0.00569 ** 
## nodes        0.114359   0.011724   9.754  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2112.1  on 2981  degrees of freedom
## Residual deviance: 1847.2  on 2976  degrees of freedom
## AIC: 1859.2
## 
## Number of Fisher Scoring iterations: 5
data_probs <- augment_columns(model_hormon,
                                     data = cancer_data,
                                     type.predict = "response") %>%

  rename(propensity = .fitted)

hormon_ipw <- data_probs %>%
  mutate(ipw = (hormon / propensity) + ((1 - hormon) / (1 - propensity)))
model_ipw <- lm(dtime ~ hormon,
                data = hormon_ipw,
                weights = ipw)

summary(model_ipw)
## 
## Call:
## lm(formula = dtime ~ hormon, data = hormon_ipw, weights = ipw)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -6846.1 -1139.3    47.6  1073.8 25384.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2621.76      31.26  83.879  < 2e-16 ***
## hormon       -284.76      45.84  -6.212 5.94e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1716 on 2980 degrees of freedom
## Multiple R-squared:  0.01279,    Adjusted R-squared:  0.01245 
## F-statistic: 38.59 on 1 and 2980 DF,  p-value: 5.944e-10
#Cox model on matched data
cox_matched <- coxph(Surv(dtime, death) ~ hormon, data = matched_data,
                          weights = weights)
summary(cox_matched)
## Call:
## coxph(formula = Surv(dtime, death) ~ hormon, data = matched_data, 
##     weights = weights)
## 
##   n= 595, number of events= 322 
## 
##           coef exp(coef) se(coef) robust se      z Pr(>|z|)
## hormon -0.1618    0.8506   0.1127    0.1195 -1.354    0.176
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## hormon    0.8506      1.176     0.673     1.075
## 
## Concordance= 0.524  (se = 0.016 )
## Likelihood ratio test= 2.06  on 1 df,   p=0.2
## Wald test            = 1.83  on 1 df,   p=0.2
## Score (logrank) test = 2.06  on 1 df,   p=0.2,   Robust = 1.8  p=0.2
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
survfit2(Surv(dtime, death) ~ hormon, data = matched_data,
              weights = weights) %>%
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability",
     title = "Survival by Hormon therapy (Matched Data)") +
  add_confidence_interval()

mv_fit <- coxph(Surv(dtime, death) ~ hormon,
                data = matched_data,
                weights = weights)

cz <- cox.zph(mv_fit)
cz
##        chisq df    p
## hormon 0.912  1 0.34
## GLOBAL 0.912  1 0.34
plot(cz)

#Cox model with IPW weights
cox_ipw <- coxph(Surv(dtime, death) ~ hormon, data = hormon_ipw,
                          weights = ipw)
summary(cox_ipw)
## Call:
## coxph(formula = Surv(dtime, death) ~ hormon, data = hormon_ipw, 
##     weights = ipw)
## 
##   n= 2982, number of events= 1272 
## 
##            coef exp(coef) se(coef) robust se      z Pr(>|z|)
## hormon -0.11334   0.89284  0.04282   0.12219 -0.928    0.354
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## hormon    0.8928       1.12    0.7027     1.134
## 
## Concordance= 0.51  (se = 0.014 )
## Likelihood ratio test= 7.05  on 1 df,   p=0.008
## Wald test            = 0.86  on 1 df,   p=0.4
## Score (logrank) test = 7.01  on 1 df,   p=0.008,   Robust = 0.89  p=0.3
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
survfit2(Surv(dtime, death) ~ hormon, data = hormon_ipw,
              weights = ipw) %>%
  ggsurvfit() +
  labs(x = "Days", y = "Overall survival probability",
     title = "Survival by Hormon therapy (IPW weighted)") +
  add_confidence_interval()

mv_fit <- coxph(Surv(dtime, death) ~ hormon,
                data = hormon_ipw,
                weights = ipw)

cz <- cox.zph(mv_fit)
cz
##        chisq df    p
## hormon  1.38  1 0.24
## GLOBAL  1.38  1 0.24
plot(cz)