Question 4

Question 4.1

We test the PH assumption on variable hg, sz and Age for model 7. The output is as follows:

cox.zph(model7, terms = FALSE)

##                    chisq df     p
## Trt0.2 mg estrogen 0.913  1 0.339
## Trt1.0 mg estrogen 2.652  1 0.103
## Trt5.0 mg estrogen 0.054  1 0.816
## I(Stage == 4)TRUE  0.974  1 0.324
## I(hx == 1)TRUE     0.279  1 0.597
## fun_hg(hg)x        3.862  1 0.049
## fun_hg(hg)         1.402  1 0.236
## sz                 0.485  1 0.486
## fun_age(Age)       3.224  1 0.073
## GLOBAL             9.574  9 0.386

Under the significance level 0.05, we find that the PH assumption on the linear term of serum hemoglobin (hg) and piecewise linear term of age (Age) are violated the PH assumption.

Then, we plot the scaled Schoenfeld residuals for the two terms as follows:

df <- data.frame(time = prostateCancerSub$Time[prostateCancerSub$death == 1], 
                 residual1 = residuals(model7, type = "scaledsch")[, 6], 
                 residual2 = residuals(model7, type = "scaledsch")[, 9])

ggplot(df, aes(x = time, y = residual1)) + 
  geom_point() + geom_smooth(span = 2) + 
  labs(title = "Scaled Schoenfeld Residuals for Linear Serum Hemoglobin", x = "Time (years)", y = "scaled schoenfeld residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

ggplot(df, aes(x = time, y = residual2)) + 
  geom_point() + geom_smooth(span = 2) + 
  labs(title = "Scaled Schoenfeld Residuals for Piecewise Age", x = "Time (years)", y = "scaled schoenfeld residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

According to the plot of scaled Schoenfeld residuals, we find that they are only slightly departure from a horizontal line. Therefore, the violation of PH assumption are not serious for the linear term of serum hemoglobin (hg) and piecewise linear term of age (Age).

Question 4.2

To identify the influential observation on the parameter estimates, we plot the scaled dalta-beta residuals of \(\beta_6\), \(\beta_7\), \(\beta_8\) and \(\beta_9\) as follows:

r <- residuals(model7, type = 'dfbetas')
df <- data.frame(index = 1:(dim(r)[1]), status = factor(death), db_hg1 = r[, 6], db_hg2 = r[, 7], db_sz = r[, 8], db_age = r[, 9])
ggplot(df, aes(index, db_hg1, color = status)) + 
  geom_linerange(ymin = 0, ymax = df$db_hg1) +
  labs(title = "Standardized Delta-beta Residuals for Serum Hemoglobin 1", x = "index", y = "standardized delta-beta residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

ggplot(df, aes(index, db_hg2, color = status)) + 
  geom_linerange(ymin = 0, ymax = df$db_hg2) +
  labs(title = "Standardized Delta-beta Residuals for Serum Hemoglobin 2", x = "index", y = "standardized delta-beta residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

ggplot(df, aes(index, db_sz, color = status)) + 
  geom_linerange(ymin = 0, ymax = df$db_sz) +
  labs(title = "Standardized Delta-beta Residuals for Tumor Size", x = "index", y = "standardized delta-beta residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

ggplot(df, aes(index, db_age, color = status)) + 
  geom_linerange(ymin = 0, ymax = df$db_age) +
  labs(title = "Standardized Delta-beta Residuals for Age", x = "index", y = "standardized delta-beta residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

According to the plot above, we find that the scaled delta-beta residual plot of \(\beta_7\) show an influential observation, while the plots of \(\beta_6\), \(\beta_8\) and \(\beta_9\) don’t show any obvious influential observation. The estimated parameters with and without the influential observation for \(\beta_7\) are as follows:

model7

## Call:
## coxph(formula = Surv(Time, death != 0) ~ Trt + I(Stage == 4) + 
##     I(hx == 1) + fun_hg(hg) + sz + fun_age(Age), data = prostateCancerSub)
## 
##                         coef exp(coef)  se(coef)      z        p
## Trt0.2 mg estrogen  0.016839  1.016982  0.146116  0.115  0.90825
## Trt1.0 mg estrogen -0.388490  0.678080  0.157910 -2.460  0.01389
## Trt5.0 mg estrogen -0.004901  0.995111  0.147224 -0.033  0.97344
## I(Stage == 4)TRUE   0.221958  1.248519  0.112303  1.976  0.04811
## I(hx == 1)TRUE      0.428227  1.534534  0.109874  3.897 9.72e-05
## fun_hg(hg)x        -0.138289  0.870847  0.034795 -3.974 7.06e-05
## fun_hg(hg)          0.338444  1.402763  0.131669  2.570  0.01016
## sz                  0.019005  1.019187  0.004271  4.450 8.59e-06
## fun_age(Age)        0.051999  1.053374  0.015071  3.450  0.00056
## 
## Likelihood ratio test=84.55  on 9 df, p=2.009e-14
## n= 502, number of events= 354

model8 <- coxph(Surv(Time, death != 0) ~ Trt + I(Stage == 4) + I(hx == 1) + fun_hg(hg) + sz + fun_age(Age), data = prostateCancerSub[-502, ])
model8

## Call:
## coxph(formula = Surv(Time, death != 0) ~ Trt + I(Stage == 4) + 
##     I(hx == 1) + fun_hg(hg) + sz + fun_age(Age), data = prostateCancerSub[-502, 
##     ])
## 
##                          coef  exp(coef)   se(coef)      z        p
## Trt0.2 mg estrogen  0.0199094  1.0201089  0.1460920  0.136 0.891600
## Trt1.0 mg estrogen -0.3956485  0.6732433  0.1584300 -2.497 0.012514
## Trt5.0 mg estrogen  0.0009374  1.0009378  0.1472420  0.006 0.994921
## I(Stage == 4)TRUE   0.2111874  1.2351438  0.1124378  1.878 0.060346
## I(hx == 1)TRUE      0.4349731  1.5449215  0.1100975  3.951 7.79e-05
## fun_hg(hg)x        -0.1322795  0.8760961  0.0350766 -3.771 0.000162
## fun_hg(hg)          0.2639619  1.3020786  0.1422998  1.855 0.063600
## sz                  0.0188693  1.0190485  0.0043010  4.387 1.15e-05
## fun_age(Age)        0.0521700  1.0535549  0.0150780  3.460 0.000540
## 
## Likelihood ratio test=83.33  on 9 df, p=3.513e-14
## n= 501, number of events= 353

The estimate of \(\beta_7\) decreases from 0.338444 to 0.2639619 after removing the influential observation and the \(p\)-value of \(\beta_7\) increases from 0.01016 to 0.063600, which becomes less significant.

Question 4.3

The cumulative hazard plot of the Cox–Snell residuals is as follows:

cox_snell <- function(fit) {
  cumhaz0 <- basehaz(fit) # get the baseline hazard
  Lambda0 <- approxfun(cumhaz0$time, cumhaz0$hazard, method = 'constant')
  r_C <- Lambda0(fit$y[,1]) * exp(fit$linear.predictors)
  # obtain the KM estimate of the residuals
  km_rC <- survfit(Surv(r_C, fit$y[,2]) ~ 1)
  
  df <- data.frame(x = km_rC$time, y = -log(km_rC$surv), yl = -log(km_rC$upper), 
                    yu = -log(km_rC$lower))
  df <- df[km_rC$n.risk >= 10,] # keep the estimate when at least 10 individuals are at risk
  
  figure <- ggplot(df, aes(x)) +
    geom_step(aes(y = y), colour = 'blue') + 
    geom_step(aes(y = yu), lty = 2) + geom_step(aes(y = yl), lty = 2) + 
    geom_line(aes(y = x), colour = 'red') + 
    labs(title = "Cox-Snell Residuals", x = "Residual", y = "cumulative hazard") + 
    theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
          axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))
  
  return(figure)
}
cox_snell(model7)

The estimated cumulative hazard function of the Cox–Snell residuals is close to the line with unit slope and zero intercept. However, cumulative hazard plots of the Cox-Snell residuals have not been proven to be very useful in practice. We can only conclude that our model is not seriously wrong.

Question 4.4

To check the outliers, we draw the deviance residual plot as follows:

r_D <- residuals(model7, type = 'deviance')
df <- data.frame(risk_score = model7$linear.predictors, status = as.factor(model7$y[, 2]), r_D = r_D)
ggplot(df, aes(x = risk_score, y = r_D, color = status)) +
  geom_point() + 
  labs(title = "Deviance Residuals", x = "risk score", y = "deviance residual") + 
  theme(plot.title = element_text(hjust = 0.5, size = 13), axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 12))

We find that all the deviance residuals are between -3 and 3, so there is no obvious outlier in our model.

Assignment 3 of Survival Analysis

Zhao Yuanfan

2023-11-14

Question 4

Question 4.1

Question 4.2

Question 4.3

Question 4.4