Packages

library(tidyverse)
library(skimr)
library(survival)
library(timereg)
library(cdnbcr)

Reading the dataset and cleaning

# Reading the dataset and cleaning ---------------------------------------------
e1690 <- read_table("C:/Users/rodri/Desktop/Simulation_CDNBCR/e1690_missing.dat",
                    skip = 12, na = ".")

# Filter lines with Study == 1690
e1690 <- e1690 %>% filter(STUDY == "1690")

# Excluding missing observations
e1690 <- na.exclude(e1690)

# Selecting variables and transforming
e1690 <- e1690 %>% select(TRT, SURVTIME, SURVCENS, AGE, NODES1, SEX, BRESLOW) %>%
  mutate(TRT = factor(TRT),
         NODES1 = factor(NODES1),
         NO2 = ifelse(NODES1 == 2, 1, 0),
         NO3 = ifelse(NODES1 == 3, 1, 0),
         NO4 = ifelse(NODES1 == 4, 1, 0), 
         TRT1 = ifelse(TRT == 2, 1, 0), 
         SEX1 = ifelse(SEX == 2, 1, 0),
         SEX = factor(SEX))


# SURVCENS
e1690 <- e1690 %>% mutate(SURVCENS = as.numeric(SURVCENS == 2))

glimpse(e1690)
## Rows: 417
## Columns: 12
## $ TRT      <fct> 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2~
## $ SURVTIME <dbl> 0.72279, 6.29979, 6.62286, 3.47159, 7.01164, 6.64750, 6.97604~
## $ SURVCENS <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0~
## $ AGE      <dbl> 34.2834, 38.1711, 46.7023, 56.2382, 40.6571, 37.5880, 32.1780~
## $ NODES1   <fct> 3, 1, 3, 4, 1, 1, 2, 4, 1, 1, 4, 2, 1, 3, 3, 4, 2, 1, 2, 1, 2~
## $ SEX      <fct> 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1~
## $ BRESLOW  <dbl> 1.20, 4.50, 1.29, 1.14, 6.00, 7.00, 1.60, 4.20, 4.05, 6.00, 6~
## $ NO2      <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1~
## $ NO3      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0~
## $ NO4      <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0~
## $ TRT1     <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1~
## $ SEX1     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0~

Descriptive analysis

# Summary
skim(e1690)
Data summary
Name e1690
Number of rows 417
Number of columns 12
_______________________
Column type frequency:
factor 3
numeric 9
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
TRT 0 1 FALSE 2 2: 213, 1: 204
NODES1 0 1 FALSE 4 2: 137, 1: 111, 3: 87, 4: 82
SEX 0 1 FALSE 2 1: 263, 2: 154

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SURVTIME 0 1 3.18 1.69 0.15 1.67 3.22 4.49 7.01 ▆▆▇▆▂
SURVCENS 0 1 0.44 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
AGE 0 1 48.00 13.12 19.13 38.04 47.11 57.56 78.05 ▂▇▇▆▃
BRESLOW 0 1 3.94 3.20 0.08 1.50 3.10 5.10 20.00 ▇▅▁▁▁
NO2 0 1 0.33 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
NO3 0 1 0.21 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
NO4 0 1 0.20 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
TRT1 0 1 0.51 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
SEX1 0 1 0.37 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
# Plots
hist(e1690$SURVTIME, main = "", xlab = "Time (years)", ylab = "Density", prob = TRUE)

boxplot(SURVTIME ~ TRT, e1690, xlab = "Treatment", ylab = "Time (years)")

boxplot(SURVTIME ~ NODES1, e1690, xlab = "Nodes", ylab = "Time (years)")

boxplot(SURVTIME ~ SEX, e1690, xlab = "Sex", ylab = "Time (years)")

plot(SURVTIME ~ AGE, e1690, pch = 16)

plot(SURVTIME ~ BRESLOW, e1690, pch = 16)

# Kaplan-Meier by treatment
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ TRT, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = c(1, 2))
legend("bottomleft", c("Trt: 1", "Trt: 2"), col = 1:2, lty = 1)

# Kaplan-Meier by nodes
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)

# Kaplan-Meier by sex
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ SEX, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:2)
legend("bottomleft", c("Sex: 1", "Sex: 2"), col = 1:2, lty = 1)

CDNBCR fit

Consider the following regression structure \[ \begin{align*} \log(\theta_i) & = \beta_{11}I(\texttt{Nodule}_i == 2) + \beta_{12}I(\texttt{Nodule}_i == 3) + \beta_{13}I(\texttt{Nodule}_i == 4)\\ \log\left(\dfrac{p_i}{1 - p_i}\right) & = \beta_{20} + \beta_{21}I(\texttt{Sex}_i == 2) + \beta_{22}I(\texttt{Treatment}_i == 2) + \beta_{23} \texttt{breslow}_i + \beta_{24} \texttt{Age}_i \end{align*} \]

Uncorrelated destructive fit (alpha = 0)

fit0 <- cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 1 |
                 SEX1 + TRT1 + BRESLOW + AGE | 1, alpha = FALSE, data = e1690)


summary(fit0)
## Call:
## cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 
##     1 | SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690, alpha = FALSE)
## 
## Summary for residuals:
##      Mean Std. dev. Skewness Kurtosis
##   0.00032   0.99574  0.00095  2.87583
## 
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
##     Estimate Std. Error t value  Pr(>|t|)    
## NO2  1.00394    0.44215  2.2706  0.023171 *  
## NO3  1.53028    0.54644  2.8004  0.005103 ** 
## NO4  2.36206    0.54865  4.3052 1.668e-05 ***
## 
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.408018   1.482600 -0.9497   0.3423
## SEX1        -0.661460   0.881681 -0.7502   0.4531
## TRT1        -0.623070   1.161186 -0.5366   0.5916
## BRESLOW      0.199055   0.227001  0.8769   0.3805
## AGE          0.045400   0.038319  1.1848   0.2361
## 
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.14678    0.12431  9.2254 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## phi: 2.63826 Std. Error: 1.044329
## sigma: 2.187949 Std. Error: 0.2384147
## 
## In addition, Log-lik value: -505.0681 
## AIC: 1032.136 and BIC: 1076.5 
## EM iterations: 353

Some diagnostic plots:

## Fitted median cure rate by ulc
theta_h <- fit0$fitted$competing
phi_h <- fit0$nuisance$phi
p_h <- fit0$fitted$activation
alpha_h <- 0L

p0 <- cure_rate(theta_h, phi_h, p_h, alpha_h)

p0_1 <- median(p0[which(e1690$NODES1 == 1)])
p0_2 <- median(p0[which(e1690$NODES1 == 2)])
p0_3 <- median(p0[which(e1690$NODES1 == 3)])
p0_4 <- median(p0[which(e1690$NODES1 == 4)])

# Graphically
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
abline(h = c(p0_1, p0_2, p0_3, p0_4), col = 1:4, lty = c(2, 2))
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)

# Quantile residuals
nrep <- 1000
mqresid <- NULL

# Populational survival function
Spop <- pcdnbcr(e1690$SURVTIME,
                fit0$fitted$competing,
                fit0$nuisance$phi,
                fit0$fitted$activation,
                0L,
                fit0$fitted$mean,
                fit0$nuisance$sigma, lower.tail = FALSE)

u <- e1690$SURVCENS * (1 - Spop) + (1 - e1690$SURVCENS) * runif(length(e1690$SURVTIME), 1 - Spop)
for (i in 1:1000) {
  qresid <- sort(qnorm(runif(u)))
  mqresid <- cbind(mqresid, qresid)
}

qresid <- apply(mqresid, 1, median)
qqnorm(qresid, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)

# Latent variables
plot(1:nrow(e1690), fit0$latent$M, xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(e1690), fit0$latent$D, xlab = "Index", ylab = "Total damage", pch = 16)

Correlated destructive fit

fit <- cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 1 |
               SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690)


summary(fit)
## Call:
## cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 
##     1 | SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690)
## 
## Summary for residuals:
##       Mean Std. dev. Skewness Kurtosis
##   -0.00082   0.99758 -0.00735  2.88052
## 
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
##     Estimate Std. Error t value  Pr(>|t|)    
## NO2  1.10410    0.44695  2.4703  0.013499 *  
## NO3  1.59814    0.54135  2.9521  0.003156 ** 
## NO4  2.42379    0.51487  4.7076 2.507e-06 ***
## 
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.828430   1.681785 -0.4926   0.6223
## SEX1        -1.933279   1.298598 -1.4887   0.1366
## TRT1         0.559996   0.790050  0.7088   0.4784
## BRESLOW      0.429648   0.363267  1.1827   0.2369
## AGE          0.053436   0.033984  1.5724   0.1159
## 
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.22261    0.10318  11.849 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## phi: 2.606687 Std. Error: 0.7460012
## alpha: 0.9781557 Std. Error: 0.05944433
## sigma: 2.289632 Std. Error: 0.2207266
## 
## In addition, Log-lik value: -502.7272 
## AIC: 1029.454 and BIC: 1077.851 
## EM iterations: 933

Some diagnostic plots:

## Fitted median cure rate by ulc
theta_h <- fit$fitted$competing
phi_h <- fit$nuisance$phi
p_h <- fit$fitted$activation
alpha_h <- fit$nuisance$alpha

p0 <- cure_rate(theta_h, phi_h, p_h, alpha_h)

p0_1 <- median(p0[which(e1690$NODES1 == 1)])
p0_2 <- median(p0[which(e1690$NODES1 == 2)])
p0_3 <- median(p0[which(e1690$NODES1 == 3)])
p0_4 <- median(p0[which(e1690$NODES1 == 4)])

# Graphically
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
abline(h = c(p0_1, p0_2, p0_3, p0_4), col = 1:4, lty = c(2, 2))
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)


# Quantile residuals
nrep <- 1000
mqresid <- NULL

# Populational survival function
Spop <- pcdnbcr(e1690$SURVTIME,
                fit$fitted$competing,
                fit$nuisance$phi,
                fit$fitted$activation,
                fit$nuisance$alpha,
                fit$fitted$mean,
                fit$nuisance$sigma, lower.tail = FALSE)

u <- e1690$SURVCENS * (1 - Spop) + (1 - e1690$SURVCENS) * runif(length(e1690$SURVTIME), 1 - Spop)
for (i in 1:1000) {
  qresid <- sort(qnorm(runif(u)))
  mqresid <- cbind(mqresid, qresid)
}

qresid <- apply(mqresid, 1, median)
qqnorm(qresid, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)

# Latent variables
plot(1:nrow(e1690), fit$latent$M, xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(e1690), fit$latent$D, xlab = "Index", ylab = "Total damage", pch = 16)

Likelihood ratio test

Consider the test of the following hypotheses

\[ \begin{align*} \textrm{H}_0: \alpha = 0 \quad \textrm{versus} \quad \textrm{H}_a: \alpha \neq 0. \end{align*} \]

We consider the following statistics \[ \Lambda = 2 \{\ell(\mathbf{\widehat{\psi}}) - \ell(\mathbf{\widetilde{\psi}})\}, \] where \(\mathbf{\widehat{\psi}}\) and \(\mathbf{\widetilde{\psi}}\) are the unrestricted and restricted (under the null hypothesis) maximum likelihood estimates of \(\psi = (\mathbf{\beta}_1^\top, \phi, \beta_2^\top, \alpha, \beta_3^\top, \sigma)^\top\).

Under mild regularity conditions, we have that, under \(\textrm{H}_0\), \[ \Lambda \overset{a}{\sim} 0.5 \; \chi^2_1 + 0.5 \; \chi^2_0. \]

## Test statistic
Lambda <- 2 * (fit$logLik - fit0$logLik)

## Summary
data.frame(Lambda = Lambda,
           Critical_value = qchisq(1 - 2 * 0.05, 1))
##    Lambda Critical_value
## 1 4.68185       2.705543