Packages

library(tidyverse)
library(skimr)
library(survival)
library(timereg)
library(cdnbcr)
library(readxl)

Reading the dataset and cleaning

# Reading
melanoma <- read_excel("C:/Users/rodri/Desktop/Simulation_CDNBCR/Applications/dados_melanoma_new.xls",
                       col_types = c("numeric", "numeric", "numeric",  "text", "text", "text", "numeric",
                                     "text", "numeric", "numeric", "text", "text", "numeric"))

# Transforming
melanoma <- melanoma %>% mutate(sexo = factor(sexo, labels = c("Male", "Female")),
                                idade_cat = factor(cut(idade, c(0, 40, 200)),
                                                   labels = c("<= 40", "> 40")),
                                escolari = factor(escolari,
                                                  labels = c("Illiterate", "Elementary school", "Elementary school",
                                                             "High school", "Higher education", "Not informed")),
                                EC_cat = factor(EC_cat, labels = c("I", "II", "III", "IV")),
                                quimio = factor(quimio, labels = c("No", "Yes")),
                                radio = factor(radio, labels = c("No", "Yes")),
                                cirurgia = factor(cirurgia, labels = c("No", "Yes")))

# Selecting, renaming, and excluding missing observations
melanoma <- melanoma %>% dplyr::select(tempo_anos, status, sexo, idade_cat, escolari, EC_cat,
                                quimio, radio, cirurgia) %>% 
  rename(time = tempo_anos,
         delta = status,
         sex = sexo,
         age = idade_cat,
         education_level = escolari,
         clinical_stage = EC_cat,
         chemotherapy = quimio,
         radiotherapy = radio,
         surgery = cirurgia) %>% 
  na.omit()


melanoma <- melanoma %>% mutate(E1=ifelse(education_level=="Illiterate",1,0),
                                E2=ifelse(education_level=="Elementary school",1,0),
                                E3=ifelse(education_level=="High school",1,0),
                                E4=ifelse(education_level=="Higher education",1,0),
                                C2=ifelse(clinical_stage=="II",1,0),
                                C3=ifelse(clinical_stage=="III",1,0),
                                C4=ifelse(clinical_stage=="IV",1,0),
                                Age1 = ifelse(age=="> 40",1,0),
                                Sex1=ifelse(sex=="Female",1,0),
                                Chemo1=ifelse(chemotherapy=="Yes",1,0),
                                Radio1=ifelse(radiotherapy=="Yes",1,0),
                                Surgery1=ifelse(surgery =="Yes",1,0),
)


glimpse(melanoma)
## Rows: 6,751
## Columns: 21
## $ time            <dbl> 9.5402299, 0.3092501, 3.3004926, 2.2632731, 5.7060755,~
## $ delta           <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, ~
## $ sex             <fct> Male, Male, Male, Female, Male, Female, Female, Male, ~
## $ age             <fct> > 40, > 40, <= 40, > 40, > 40, <= 40, <= 40, <= 40, > ~
## $ education_level <fct> Not informed, Not informed, Elementary school, Element~
## $ clinical_stage  <fct> II, II, I, III, I, III, II, I, III, II, IV, III, I, IV~
## $ chemotherapy    <fct> No, No, No, Yes, No, Yes, No, No, No, Yes, No, Yes, No~
## $ radiotherapy    <fct> No, No, Yes, Yes, No, Yes, No, No, Yes, No, No, No, No~
## $ surgery         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, No~
## $ E1              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ E2              <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ~
## $ E3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, ~
## $ E4              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, ~
## $ C2              <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ C3              <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, ~
## $ C4              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ~
## $ Age1            <dbl> 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ Sex1            <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, ~
## $ Chemo1          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, ~
## $ Radio1          <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Surgery1        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ~

Descriptive analysis

# Summary
skim(melanoma)
Data summary
Name melanoma
Number of rows 6751
Number of columns 21
_______________________
Column type frequency:
factor 7
numeric 14
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 Fem: 3417, Mal: 3334
age 0 1 FALSE 2 > 4: 5693, <= : 1058
education_level 0 1 FALSE 5 Ele: 2649, Not: 2246, Hig: 841, Hig: 667
clinical_stage 0 1 FALSE 4 I: 3012, II: 1542, III: 1229, IV: 968
chemotherapy 0 1 FALSE 2 No: 5646, Yes: 1105
radiotherapy 0 1 FALSE 2 No: 6164, Yes: 587
surgery 0 1 FALSE 2 Yes: 5980, No: 771

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
time 0 1 4.69 3.85 0 1.67 3.69 6.86 18.54 ▇▅▂▁▁
delta 0 1 0.28 0.45 0 0.00 0.00 1.00 1.00 ▇▁▁▁▃
E1 0 1 0.05 0.22 0 0.00 0.00 0.00 1.00 ▇▁▁▁▁
E2 0 1 0.39 0.49 0 0.00 0.00 1.00 1.00 ▇▁▁▁▅
E3 0 1 0.12 0.33 0 0.00 0.00 0.00 1.00 ▇▁▁▁▁
E4 0 1 0.10 0.30 0 0.00 0.00 0.00 1.00 ▇▁▁▁▁
C2 0 1 0.23 0.42 0 0.00 0.00 0.00 1.00 ▇▁▁▁▂
C3 0 1 0.18 0.39 0 0.00 0.00 0.00 1.00 ▇▁▁▁▂
C4 0 1 0.14 0.35 0 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Age1 0 1 0.84 0.36 0 1.00 1.00 1.00 1.00 ▂▁▁▁▇
Sex1 0 1 0.51 0.50 0 0.00 1.00 1.00 1.00 ▇▁▁▁▇
Chemo1 0 1 0.16 0.37 0 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Radio1 0 1 0.09 0.28 0 0.00 0.00 0.00 1.00 ▇▁▁▁▁
Surgery1 0 1 0.89 0.32 0 1.00 1.00 1.00 1.00 ▁▁▁▁▇
# Plots
theme_set(theme_classic())
melanoma %>% 
  ggplot(aes(x = time)) +
  geom_histogram() +
  labs(x = "Time", y = "Frequency")

mKM <- survfit(Surv(time, delta) ~ 1, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function")

# Time by sex
melanoma %>% 
  ggplot(aes(x = sex, y = time)) +
  geom_boxplot() +
  labs(x = "Sex", y = "Time")

mKM <- survfit(Surv(time, delta) ~ sex, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = c(1, 2))
legend("bottomleft", c("Male", "Female"), col = c(1, 2), lty = 1)

# Time by age
melanoma %>% 
  ggplot(aes(x = age, y = time)) +
  geom_boxplot() +
  labs(x = "Age at diagnosis", y = "Time")

mKM <- survfit(Surv(time, delta) ~ age, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = c(1, 2))
legend("bottomleft", c("Less than or equal to 40", "Greater than 40"),
       col = c(1, 2), lty = 1)

# Time by education level
melanoma %>% 
  ggplot(aes(x = education_level, y = time)) +
  geom_boxplot() +
  labs(x = "Education level", y = "Time")

mKM <- survfit(Surv(time, delta) ~ education_level, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = 1:5)
legend("bottomleft", levels(melanoma$education_level), col = 1:5, lty = 1)

# Time by clinical cancer stage
melanoma %>% 
  ggplot(aes(x = clinical_stage, y = time)) +
  geom_boxplot() +
  labs(x = "Clinical cancer stage", y = "Time")

mKM <- survfit(Surv(time, delta) ~ clinical_stage, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = 1:4)
legend("bottomleft", levels(melanoma$clinical_stage), col = 1:4, lty = 1)

# Time by chemotherapy
melanoma %>% 
  ggplot(aes(x = chemotherapy, y = time)) +
  geom_boxplot() +
  labs(x = "Was chemotherapy done?", y = "Time")

mKM <- survfit(Surv(time, delta) ~ chemotherapy, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = 1:2)
legend("bottomleft", levels(melanoma$chemotherapy), col = 1:2, lty = 1)

# Time by radiotherapy
melanoma %>% 
  ggplot(aes(x = chemotherapy, y = time)) +
  geom_boxplot() +
  labs(x = "Was radiotherapy done?", y = "Time")

mKM <- survfit(Surv(time, delta) ~ radiotherapy, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = 1:2)
legend("bottomleft", levels(melanoma$radiotherapy), col = 1:2, lty = 1)

# Time by surgery
melanoma %>% 
  ggplot(aes(x = surgery, y = time)) +
  geom_boxplot() +
  labs(x = "Was surgery done?", y = "Time")

mKM <- survfit(Surv(time, delta) ~ surgery, melanoma, se.fit = FALSE)
plot(mKM, xlab = "Time", ylab = "Surviving function", col = 1:2)
legend("bottomleft", levels(melanoma$surgery), col = 1:2, lty = 1)

CDNBCR fit

Consider the following regression structure $$ \[\begin{align*} \log(\theta_i) & = \beta_{11}I(\texttt{sex}_i == ``\textrm{Female}") + \beta_{12}I(\texttt{age}_i == ``\textrm{> 40}") + \\ & \hspace{0.5cm} \beta_{13}I(\texttt{education}_i == ``\textrm{Illiterate}") + \beta_{14}I(\texttt{education}_i == ``\textrm{Elementary school}") + \\ & \hspace{0.5cm} \beta_{15}I(\texttt{education}_i == ``\textrm{High school}") + \beta_{16}I(\texttt{education}_i == ``\textrm{Higher education}"),\\ \log\left(\dfrac{p_i}{1 - p_i}\right) & = \beta_{20} + \beta_{21}I(\texttt{clinical_stage}_i == ``\textrm{II}") + \beta_{22}I(\texttt{clinical_stage}_i == ``\textrm{III}") +\\ & \hspace{0.5cm} \beta_{23}I(\texttt{clinical_stage}_i == ``\textrm{IV}") + \beta_{24}I(\texttt{chemotheray}_i == ``\textrm{Yes}") +\\ & \hspace{0.5cm} \beta_{25}I(\texttt{radiotheray}_i == ``\textrm{Yes}") + \beta_{26}I(\texttt{surgery}_i == ``\textrm{Yes}"),\\ \\ \log(\mu_i) & = \beta_{30} + \beta_{31}I(\texttt{chemotheray}_i == ``\textrm{Yes}") + \beta_{32}I(\texttt{radiotheray}_i == ``\textrm{Yes}") + \\ & \hspace{0.5cm} \beta_{33}I(\texttt{surgery}_i == ``\textrm{Yes}"). \end{align*}\] $$

Uncorrelated destructive fit (alpha = 0)

fit0 <- cdnbcr(Surv(time, delta) ~ Sex1 + Age1 + E1 + E2 + E3 + E4 -1 | 
                 C2 + C3 + C4 + Chemo1 + Radio1 + Surgery1 |
                 Chemo1 + Radio1 + Surgery1, data = melanoma, alpha = FALSE)

summary(fit0)
## Call:
## cdnbcr(formula = Surv(time, delta) ~ Sex1 + Age1 + E1 + E2 + 
##     E3 + E4 - 1 | C2 + C3 + C4 + Chemo1 + Radio1 + Surgery1 | 
##     Chemo1 + Radio1 + Surgery1, data = melanoma, alpha = FALSE)
## 
## Summary for residuals:
##     Mean Std. dev. Skewness Kurtosis
##   -7e-05    0.9991        0  2.98323
## 
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
##       Estimate Std. Error t value  Pr(>|t|)    
## Sex1 -0.226309   0.051466 -4.3973 1.096e-05 ***
## Age1  1.045698   0.093527 11.1806 < 2.2e-16 ***
## E1    0.725214   0.110392  6.5695 5.050e-11 ***
## E2    0.414606   0.066002  6.2818 3.348e-10 ***
## E3    0.387661   0.090568  4.2803 1.866e-05 ***
## E4    0.086756   0.114665  0.7566    0.4493    
## 
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) -3.056418   0.184574 -16.5593 < 2.2e-16 ***
## C2           1.270106   0.098601  12.8813 < 2.2e-16 ***
## C3           2.231727   0.109738  20.3369 < 2.2e-16 ***
## C4          10.444141   4.590054   2.2754   0.02288 *  
## Chemo1       1.029628   0.155348   6.6279 3.405e-11 ***
## Radio1       0.954833   0.202578   4.7134 2.436e-06 ***
## Surgery1    -0.120117   0.159321  -0.7539   0.45089    
## 
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.801907   0.064265 12.4782  < 2e-16 ***
## Chemo1       0.126535   0.064144  1.9727  0.04853 *  
## Radio1      -0.068771   0.072858 -0.9439  0.34522    
## Surgery1     0.799218   0.064111 12.4661  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## phi: 0.261701 Std. Error: 0.09424336
## sigma: 1.223818 Std. Error: 0.0282786
## 
## In addition, Log-lik value: -5591.027 
## AIC: 11220.05 and BIC: 11349.59 
## EM iterations: 1887

Some diagnostic plots:

# Quantile residuals
qqnorm(fit0$residuals, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)

# Latent variables
plot(1:nrow(melanoma), fit0$latent$M,
     xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(melanoma), fit0$latent$D,
     xlab = "Index", ylab = "Total damage", pch = 16)

Correlated destructive fit

fit <- cdnbcr(Surv(time, delta) ~ Sex1 + Age1 + E1 + E2 + E3 + E4 -1 | 
                 C2 + C3 + C4 + Chemo1 + Radio1 + Surgery1 |
                 Chemo1 + Radio1 + Surgery1, data = melanoma)

summary(fit)
## Call:
## cdnbcr(formula = Surv(time, delta) ~ Sex1 + Age1 + E1 + E2 + 
##     E3 + E4 - 1 | C2 + C3 + C4 + Chemo1 + Radio1 + Surgery1 | 
##     Chemo1 + Radio1 + Surgery1, data = melanoma)
## 
## Summary for residuals:
##      Mean Std. dev. Skewness Kurtosis
##   0.00029   0.99928  0.00051  2.98091
## 
## ----------------------------------------------------------------
## Mean competing causes:
## ----------------------------------------------------------------
## Coefficients:
##       Estimate Std. Error t value  Pr(>|t|)    
## Sex1 -0.226381   0.051428 -4.4019 1.073e-05 ***
## Age1  1.043618   0.093477 11.1644 < 2.2e-16 ***
## E1    0.724241   0.110333  6.5641 5.234e-11 ***
## E2    0.413878   0.065969  6.2739 3.522e-10 ***
## E3    0.386656   0.090510  4.2720 1.937e-05 ***
## E4    0.086098   0.114611  0.7512    0.4525    
## 
## ----------------------------------------------------------------
## Activation probability:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) -3.053188   0.184536 -16.5452 < 2.2e-16 ***
## C2           1.269849   0.098583  12.8810 < 2.2e-16 ***
## C3           2.230978   0.109696  20.3379 < 2.2e-16 ***
## C4          10.711049   5.254076   2.0386   0.04149 *  
## Chemo1       1.028469   0.155231   6.6254 3.463e-11 ***
## Radio1       0.954108   0.202452   4.7128 2.444e-06 ***
## Surgery1    -0.120066   0.159265  -0.7539   0.45092    
## 
## ----------------------------------------------------------------
## Mean lifetime:
## ----------------------------------------------------------------
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.801468   0.064288 12.4668  < 2e-16 ***
## Chemo1       0.126029   0.064142  1.9649  0.04943 *  
## Radio1      -0.069054   0.072847 -0.9479  0.34317    
## Surgery1     0.799669   0.064118 12.4718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## phi: 0.2587654 Std. Error: 0.0940683
## alpha: 0.01912129 Std. Error: 0.08037823
## sigma: 1.223319 Std. Error: 0.02826525
## 
## In addition, Log-lik value: -5591.041 
## AIC: 11222.08 and BIC: 11358.43 
## EM iterations: 2044

Some diagnostic plots:

# Quantile residuals
qqnorm(fit$residuals, pch = "+")
abline(0, 1, col = "grey", lwd = 2, lty = 1)

# Latent variables
plot(1:nrow(melanoma), fit$latent$M, xlab = "Index", ylab = "Number of competing causes", pch = 16)
plot(1:nrow(melanoma), fit$latent$D, xlab = "Index", ylab = "Total damage", pch = 16)

Likelihood ratio test

Consider the test of the following hypotheses

\[ \begin{align*} \textrm{H}_0: \alpha = 0 \quad \textrm{versus} \quad \textrm{H}_a: \alpha \neq 0. \end{align*} \]

We consider the following statistics \[ \Lambda = 2 \{\ell(\mathbf{\widehat{\psi}}) - \ell(\mathbf{\widetilde{\psi}})\}, \] where \(\mathbf{\widehat{\psi}}\) and \(\mathbf{\widetilde{\psi}}\) are the unrestricted and restricted (under the null hypothesis) maximum likelihood estimates of \(\psi = (\mathbf{\beta}_1^\top, \phi, \beta_2^\top, \alpha, \beta_3^\top, \sigma)^\top\).

Under mild regularity conditions, we have that, under \(\textrm{H}_0\), \[ \Lambda \overset{a}{\sim} 0.5 \; \chi^2_1 + 0.5 \; \chi^2_0. \]

## Test statistic
Lambda <- - 2 * (fit0$logLik - fit$logLik)

## Summary
data.frame(Lambda = Lambda,
           Critical_value = qchisq(1 - 2 * 0.05, 1))
##       Lambda Critical_value
## 1 -0.0270853       2.705543