Load packages and data prep
library(survival)
library(tidyr)
vet <- read.csv("veteran.csv") # Loading veteran dataset
dim(vet) # Get the size of vet dataset
## [1] 137 8
head(vet) # Check the first six of records
## treatment celltype tdays dead karnofsky tprior age therapy
## 1 1 1 72 1 60 7 69 0
## 2 1 1 411 1 70 5 64 10
## 3 1 1 228 1 60 3 38 0
## 4 1 1 126 1 60 9 63 10
## 5 1 1 118 1 70 11 65 10
## 6 1 1 10 1 20 5 49 0
Anyway, since we are using survival package, it also has
a built-in dataset called veteran. You can investigate
whether it is the same vets data we are currently loading.
Discussion
# This is just one out of sooo many ways to create dummy variables.
vet$celltype1 <- 1L * (vet$celltype == 1)
vet$celltype2 <- 1L * (vet$celltype == 2)
vet$celltype3 <- 1L * (vet$celltype == 3)
vet$celltype4 <- 1L * (vet$celltype == 4)
colnames(vet) # Note how there are 4 additional columns to veteran data frame
## [1] "treatment" "celltype" "tdays" "dead" "karnofsky" "tprior"
## [7] "age" "therapy" "celltype1" "celltype2" "celltype3" "celltype4"
vet.model1 <- coxph(Surv(tdays, dead) ~ treatment + karnofsky + tprior + age +
therapy+ celltype1 + celltype2 + celltype3, data = vet)
# celltype = 4 as reference
summary(vet.model1)
## Call:
## coxph(formula = Surv(tdays, dead) ~ treatment + karnofsky + tprior +
## age + therapy + celltype1 + celltype2 + celltype3, data = vet)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treatment 2.946e-01 1.343e+00 2.075e-01 1.419 0.15577
## karnofsky -3.282e-02 9.677e-01 5.508e-03 -5.958 2.55e-09 ***
## tprior 8.132e-05 1.000e+00 9.136e-03 0.009 0.99290
## age -8.706e-03 9.913e-01 9.300e-03 -0.936 0.34920
## therapy 7.159e-03 1.007e+00 2.323e-02 0.308 0.75794
## celltype1 -4.013e-01 6.695e-01 2.827e-01 -1.420 0.15574
## celltype2 4.603e-01 1.584e+00 2.662e-01 1.729 0.08383 .
## celltype3 7.948e-01 2.214e+00 3.029e-01 2.624 0.00869 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatment 1.3426 0.7448 0.8939 2.0166
## karnofsky 0.9677 1.0334 0.9573 0.9782
## tprior 1.0001 0.9999 0.9823 1.0182
## age 0.9913 1.0087 0.9734 1.0096
## therapy 1.0072 0.9929 0.9624 1.0541
## celltype1 0.6695 1.4938 0.3847 1.1651
## celltype2 1.5845 0.6311 0.9403 2.6699
## celltype3 2.2139 0.4517 1.2228 4.0084
##
## Concordance= 0.736 (se = 0.021 )
## Likelihood ratio test= 62.1 on 8 df, p=2e-10
## Wald test = 62.37 on 8 df, p=2e-10
## Score (logrank) test = 66.74 on 8 df, p=2e-11
cox.zph(vet.model1)
## chisq df p
## treatment 0.2644 1 0.60712
## karnofsky 12.9352 1 0.00032
## tprior 0.0129 1 0.90961
## age 1.8288 1 0.17627
## therapy 2.1656 1 0.14113
## celltype1 3.5940 1 0.05799
## celltype2 3.9879 1 0.04583
## celltype3 6.7929 1 0.00915
## GLOBAL 34.5525 8 3.2e-05
The three predictors not satisfying PH assumptions are karnofsky, celltype2 and celltype3. (The celltype1 variable is close to reject the H0 hypothesis, but let’s stick to \(\alpha = 0.05\).)
You may want to support this conclusion with log-log plots. I will leave them to you for exercise.
hist(vet$karnofsky)
summary(vet$karnofsky)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 40.00 60.00 58.57 75.00 99.00
## It's reasonable to divide the variable into
# 3 classes: (0 - 40), (40, 75), and (75 - 100)
vet$karno.cat <- cut(vet$karno, breaks = c(0, 40, 75, 100), labels = c(1,2,3))
The newly stratified variable \(Z^*\) is constructed by combining \(3\times 2 \times 2\) categories from \(Z_1, Z_2\) and \(Z_3\), respectively. Hence, \(k^*=12\).
vet$zstar <- c()
for (i in 1:length(vet$treatment)){
if (vet$karno.cat[i] == 1 & vet$celltype2[i] == 0 & vet$celltype3[i] == 0){
vet$zstar[i] <- 1
}
if (vet$karno.cat[i] == 1 & vet$celltype2[i] == 0 & vet$celltype3[i] == 1){
vet$zstar[i] <- 2
}
if (vet$karno.cat[i] == 1 & vet$celltype2[i] == 1 & vet$celltype3[i] == 0){
vet$zstar[i] <- 3
}
if (vet$karno.cat[i] == 1 & vet$celltype2[i] == 1 & vet$celltype3[i] == 1){
vet$zstar[i] <- 4
}
if (vet$karno.cat[i] == 2 & vet$celltype2[i] == 0 & vet$celltype3[i] == 0){
vet$zstar[i] <- 5
}
if (vet$karno.cat[i] == 2 & vet$celltype2[i] == 0 & vet$celltype3[i] == 1){
vet$zstar[i] <- 6
}
if (vet$karno.cat[i] == 2 & vet$celltype2[i] == 1 & vet$celltype3[i] == 0){
vet$zstar[i] <- 7
}
if (vet$karno.cat[i] == 2 & vet$celltype2[i] == 1 & vet$celltype3[i] == 1){
vet$zstar[i] <- 8
}
if (vet$karno.cat[i] == 3 & vet$celltype2[i] == 0 & vet$celltype3[i] == 0){
vet$zstar[i] <- 9
}
if (vet$karno.cat[i] == 3 & vet$celltype2[i] == 0 & vet$celltype3[i] == 1){
vet$zstar[i] <- 10
}
if (vet$karno.cat[i] == 3 & vet$celltype2[i] == 1 & vet$celltype3[i] == 0){
vet$zstar[i] <- 11
}
if (vet$karno.cat[i] == 3 & vet$celltype2[i] == 1 & vet$celltype3[i] == 1){
vet$zstar[i] <- 12
}
}
vet.model2 <- coxph(Surv(tdays, dead) ~ treatment + tprior + age +
therapy+ celltype1 + strata(zstar), data = vet)
summary(vet.model2)
## Call:
## coxph(formula = Surv(tdays, dead) ~ treatment + tprior + age +
## therapy + celltype1 + strata(zstar), data = vet)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treatment -0.027461 0.972913 0.210943 -0.130 0.896
## tprior 0.002268 1.002270 0.008726 0.260 0.795
## age -0.004332 0.995677 0.010166 -0.426 0.670
## therapy 0.002375 1.002378 0.023495 0.101 0.919
## celltype1 -0.424819 0.653888 0.293486 -1.447 0.148
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatment 0.9729 1.0278 0.6435 1.471
## tprior 1.0023 0.9977 0.9853 1.020
## age 0.9957 1.0043 0.9760 1.016
## therapy 1.0024 0.9976 0.9573 1.050
## celltype1 0.6539 1.5293 0.3679 1.162
##
## Concordance= 0.52 (se = 0.039 )
## Likelihood ratio test= 2.65 on 5 df, p=0.8
## Wald test = 2.65 on 5 df, p=0.8
## Score (logrank) test = 2.69 on 5 df, p=0.7
vet.model3 <- coxph(Surv(tdays, dead) ~ strata(zstar)*treatment +
strata(zstar)*tprior + strata(zstar)*age +
strata(zstar)*therapy+ strata(zstar)*celltype1, data = vet)
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 13,37 ; coefficient may be infinite.
chisq_stat <- -2*(vet.model2$loglik[2] - vet.model3$loglik[2])
# The pvalue
1 - pchisq(chisq_stat, df = length(vet.model3$coefficients) - length(vet.model2$coefficients) )
## [1] 0.03336059
This means, the SC model might not be the true model. We cannot go on
and blindly evaluate vet.model2. There might be variables
that are not suitable for stratification. What if we stratify over
karnofsky and celltype3 only?