Reading the dataset and cleaning
# Reading the dataset and cleaning ---------------------------------------------
e1690 <- read_table("C:/Users/rodri/Desktop/Simulation_CDNBCR/e1690_missing.dat",
skip = 12, na = ".")
# Filter lines with Study == 1690
e1690 <- e1690 %>% filter(STUDY == "1690")
# Excluding missing observations
e1690 <- na.exclude(e1690)
# Selecting variables and transforming
e1690 <- e1690 %>% select(TRT, SURVTIME, SURVCENS, AGE, NODES1, SEX, BRESLOW) %>%
mutate(TRT = factor(TRT),
NODES1 = factor(NODES1),
NO2 = ifelse(NODES1 == 2, 1, 0),
NO3 = ifelse(NODES1 == 3, 1, 0),
NO4 = ifelse(NODES1 == 4, 1, 0),
TRT1 = ifelse(TRT == 2, 1, 0),
SEX1 = ifelse(SEX == 2, 1, 0),
SEX = factor(SEX))
# SURVCENS
e1690 <- e1690 %>% mutate(SURVCENS = as.numeric(SURVCENS == 2))
glimpse(e1690)
## Rows: 417
## Columns: 12
## $ TRT <fct> 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2~
## $ SURVTIME <dbl> 0.72279, 6.29979, 6.62286, 3.47159, 7.01164, 6.64750, 6.97604~
## $ SURVCENS <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0~
## $ AGE <dbl> 34.2834, 38.1711, 46.7023, 56.2382, 40.6571, 37.5880, 32.1780~
## $ NODES1 <fct> 3, 1, 3, 4, 1, 1, 2, 4, 1, 1, 4, 2, 1, 3, 3, 4, 2, 1, 2, 1, 2~
## $ SEX <fct> 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1~
## $ BRESLOW <dbl> 1.20, 4.50, 1.29, 1.14, 6.00, 7.00, 1.60, 4.20, 4.05, 6.00, 6~
## $ NO2 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1~
## $ NO3 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0~
## $ NO4 <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0~
## $ TRT1 <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1~
## $ SEX1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0~
Descriptive analysis
# Summary
skim(e1690)
Data summary
| Name |
e1690 |
| Number of rows |
417 |
| Number of columns |
12 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
3 |
| numeric |
9 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| TRT |
0 |
1 |
FALSE |
2 |
2: 213, 1: 204 |
| NODES1 |
0 |
1 |
FALSE |
4 |
2: 137, 1: 111, 3: 87, 4: 82 |
| SEX |
0 |
1 |
FALSE |
2 |
1: 263, 2: 154 |
Variable type: numeric
| SURVTIME |
0 |
1 |
3.18 |
1.69 |
0.15 |
1.67 |
3.22 |
4.49 |
7.01 |
▆▆▇▆▂ |
| SURVCENS |
0 |
1 |
0.44 |
0.50 |
0.00 |
0.00 |
0.00 |
1.00 |
1.00 |
▇▁▁▁▆ |
| AGE |
0 |
1 |
48.00 |
13.12 |
19.13 |
38.04 |
47.11 |
57.56 |
78.05 |
▂▇▇▆▃ |
| BRESLOW |
0 |
1 |
3.94 |
3.20 |
0.08 |
1.50 |
3.10 |
5.10 |
20.00 |
▇▅▁▁▁ |
| NO2 |
0 |
1 |
0.33 |
0.47 |
0.00 |
0.00 |
0.00 |
1.00 |
1.00 |
▇▁▁▁▃ |
| NO3 |
0 |
1 |
0.21 |
0.41 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| NO4 |
0 |
1 |
0.20 |
0.40 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| TRT1 |
0 |
1 |
0.51 |
0.50 |
0.00 |
0.00 |
1.00 |
1.00 |
1.00 |
▇▁▁▁▇ |
| SEX1 |
0 |
1 |
0.37 |
0.48 |
0.00 |
0.00 |
0.00 |
1.00 |
1.00 |
▇▁▁▁▅ |
# Plots
hist(e1690$SURVTIME, main = "", xlab = "Time (years)", ylab = "Density", prob = TRUE)
boxplot(SURVTIME ~ TRT, e1690, xlab = "Treatment", ylab = "Time (years)")
boxplot(SURVTIME ~ NODES1, e1690, xlab = "Nodes", ylab = "Time (years)")
boxplot(SURVTIME ~ SEX, e1690, xlab = "Sex", ylab = "Time (years)")
plot(SURVTIME ~ AGE, e1690, pch = 16)
plot(SURVTIME ~ BRESLOW, e1690, pch = 16)
# Kaplan-Meier by treatment
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ TRT, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = c(1, 2))
legend("bottomleft", c("Trt: 1", "Trt: 2"), col = 1:2, lty = 1)
# Kaplan-Meier by nodes
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)
# Kaplan-Meier by sex
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ SEX, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:2)
legend("bottomleft", c("Sex: 1", "Sex: 2"), col = 1:2, lty = 1)









CDNBCR fit
Consider the following regression structure \[
\begin{align*}
\log(\theta_i) & = \beta_{11}I(\texttt{Nodule}_i == 2) +
\beta_{12}I(\texttt{Nodule}_i == 3) + \beta_{13}I(\texttt{Nodule}_i ==
4)\\
\log\left(\dfrac{p_i}{1 - p_i}\right) & = \beta_{20} +
\beta_{21}I(\texttt{Sex}_i == 2) + \beta_{22}I(\texttt{Treatment}_i ==
2) + \beta_{23} \texttt{breslow}_i + \beta_{24} \texttt{Age}_i
\end{align*}
\]
Uncorrelated destructive fit (alpha = 0)
fit0 <- cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 1 |
SEX1 + TRT1 + BRESLOW + AGE | 1, alpha = FALSE, data = e1690)
summary(fit0)
## Call:
## cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 -
## 1 | SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690, alpha = FALSE)
##
## Summary for residuals:
## Mean Std. dev. Skewness Kurtosis
## 0.00032 0.99574 0.00095 2.87583
##
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## NO2 1.00394 0.44215 2.2706 0.023171 *
## NO3 1.53028 0.54644 2.8004 0.005103 **
## NO4 2.36206 0.54865 4.3052 1.668e-05 ***
##
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.408018 1.482600 -0.9497 0.3423
## SEX1 -0.661460 0.881681 -0.7502 0.4531
## TRT1 -0.623070 1.161186 -0.5366 0.5916
## BRESLOW 0.199055 0.227001 0.8769 0.3805
## AGE 0.045400 0.038319 1.1848 0.2361
##
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.14678 0.12431 9.2254 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## phi: 2.63826 Std. Error: 1.044329
## sigma: 2.187949 Std. Error: 0.2384147
##
## In addition, Log-lik value: -505.0681
## AIC: 1032.136 and BIC: 1076.5
## EM iterations: 353
Some diagnostic plots:
## Fitted median cure rate by ulc
theta_h <- fit0$fitted$competing
phi_h <- fit0$nuisance$phi
p_h <- fit0$fitted$activation
alpha_h <- 0L
p0 <- cure_rate(theta_h, phi_h, p_h, alpha_h)
p0_1 <- median(p0[which(e1690$NODES1 == 1)])
p0_2 <- median(p0[which(e1690$NODES1 == 2)])
p0_3 <- median(p0[which(e1690$NODES1 == 3)])
p0_4 <- median(p0[which(e1690$NODES1 == 4)])
# Graphically
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
abline(h = c(p0_1, p0_2, p0_3, p0_4), col = 1:4, lty = c(2, 2))
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)
# Quantile residuals
nrep <- 1000
mqresid <- NULL
# Populational survival function
Spop <- pcdnbcr(e1690$SURVTIME,
fit0$fitted$competing,
fit0$nuisance$phi,
fit0$fitted$activation,
0L,
fit0$fitted$mean,
fit0$nuisance$sigma, lower.tail = FALSE)
u <- e1690$SURVCENS * (1 - Spop) + (1 - e1690$SURVCENS) * runif(length(e1690$SURVTIME), 1 - Spop)
for (i in 1:1000) {
qresid <- sort(qnorm(runif(u)))
mqresid <- cbind(mqresid, qresid)
}
qresid <- apply(mqresid, 1, median)
qqnorm(qresid, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)
# Latent variables
plot(1:nrow(e1690), fit0$latent$M, xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(e1690), fit0$latent$D, xlab = "Index", ylab = "Total damage", pch = 16)




Correlated destructive fit
fit <- cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 - 1 |
SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690)
summary(fit)
## Call:
## cdnbcr(formula = Surv(SURVTIME, SURVCENS) ~ NO2 + NO3 + NO4 -
## 1 | SEX1 + TRT1 + BRESLOW + AGE | 1, data = e1690)
##
## Summary for residuals:
## Mean Std. dev. Skewness Kurtosis
## -0.00082 0.99758 -0.00735 2.88052
##
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## NO2 1.10410 0.44695 2.4703 0.013499 *
## NO3 1.59814 0.54135 2.9521 0.003156 **
## NO4 2.42379 0.51487 4.7076 2.507e-06 ***
##
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.828430 1.681785 -0.4926 0.6223
## SEX1 -1.933279 1.298598 -1.4887 0.1366
## TRT1 0.559996 0.790050 0.7088 0.4784
## BRESLOW 0.429648 0.363267 1.1827 0.2369
## AGE 0.053436 0.033984 1.5724 0.1159
##
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.22261 0.10318 11.849 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## phi: 2.606687 Std. Error: 0.7460012
## alpha: 0.9781557 Std. Error: 0.05944433
## sigma: 2.289632 Std. Error: 0.2207266
##
## In addition, Log-lik value: -502.7272
## AIC: 1029.454 and BIC: 1077.851
## EM iterations: 933
Some diagnostic plots:
## Fitted median cure rate by ulc
theta_h <- fit$fitted$competing
phi_h <- fit$nuisance$phi
p_h <- fit$fitted$activation
alpha_h <- fit$nuisance$alpha
p0 <- cure_rate(theta_h, phi_h, p_h, alpha_h)
p0_1 <- median(p0[which(e1690$NODES1 == 1)])
p0_2 <- median(p0[which(e1690$NODES1 == 2)])
p0_3 <- median(p0[which(e1690$NODES1 == 3)])
p0_4 <- median(p0[which(e1690$NODES1 == 4)])
# Graphically
mKM <- survfit(Surv(SURVTIME, SURVCENS) ~ NODES1, e1690, se.fit = FALSE)
plot(mKM, xlab = "Time (years)", ylab = "Surviving function", col = 1:4)
abline(h = c(p0_1, p0_2, p0_3, p0_4), col = 1:4, lty = c(2, 2))
legend("bottomleft", c("Nodule: 1", "Nodule: 2", "Nodule: 3", "Nodule: 4"), col = 1:4, lty = 1)
# Quantile residuals
nrep <- 1000
mqresid <- NULL
# Populational survival function
Spop <- pcdnbcr(e1690$SURVTIME,
fit$fitted$competing,
fit$nuisance$phi,
fit$fitted$activation,
fit$nuisance$alpha,
fit$fitted$mean,
fit$nuisance$sigma, lower.tail = FALSE)
u <- e1690$SURVCENS * (1 - Spop) + (1 - e1690$SURVCENS) * runif(length(e1690$SURVTIME), 1 - Spop)
for (i in 1:1000) {
qresid <- sort(qnorm(runif(u)))
mqresid <- cbind(mqresid, qresid)
}
qresid <- apply(mqresid, 1, median)
qqnorm(qresid, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)
# Latent variables
plot(1:nrow(e1690), fit$latent$M, xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(e1690), fit$latent$D, xlab = "Index", ylab = "Total damage", pch = 16)




Likelihood ratio test
Consider the test of the following hypotheses
\[
\begin{align*}
\textrm{H}_0: \alpha = 0 \quad \textrm{versus} \quad \textrm{H}_a:
\alpha \neq 0.
\end{align*}
\]
We consider the following statistics \[
\Lambda = 2 \{\ell(\mathbf{\widehat{\psi}}) -
\ell(\mathbf{\widetilde{\psi}})\},
\] where \(\mathbf{\widehat{\psi}}\) and \(\mathbf{\widetilde{\psi}}\) are the
unrestricted and restricted (under the null hypothesis) maximum
likelihood estimates of \(\psi =
(\mathbf{\beta}_1^\top, \phi, \beta_2^\top, \alpha, \beta_3^\top,
\sigma)^\top\).
Under mild regularity conditions, we have that, under \(\textrm{H}_0\), \[
\Lambda \overset{a}{\sim} 0.5 \; \chi^2_1 + 0.5 \; \chi^2_0.
\]
## Test statistic
Lambda <- 2 * (fit$logLik - fit0$logLik)
## Summary
data.frame(Lambda = Lambda,
Critical_value = qchisq(1 - 2 * 0.05, 1))
## Lambda Critical_value
## 1 4.68185 2.705543