Stats Consulting - Dr Okechi

Author

Vusi

library(gtsummary)
library(datasets)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
library(survminer)
Loading required package: ggpubr

Attaching package: 'survminer'

The following object is masked from 'package:survival':

    myeloma
library(car) 
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
data(trial)
trial <- trial %>% 

    # manually re-name columns
           # NEW name             # OLD name
    rename(Treatment       = trt) %>%
  
    
    # de-duplicate
    distinct()

trial
# A tibble: 200 × 8
   Treatment   age marker stage grade response death ttdeath
   <chr>     <dbl>  <dbl> <fct> <fct>    <int> <int>   <dbl>
 1 Drug A       23  0.16  T1    II           0     0    24  
 2 Drug B        9  1.11  T2    I            1     0    24  
 3 Drug A       31  0.277 T1    II           0     0    24  
 4 Drug A       NA  2.07  T3    III          1     1    17.6
 5 Drug A       51  2.77  T4    III          1     1    16.4
 6 Drug B       39  0.613 T4    I            0     1    15.6
 7 Drug A       37  0.354 T1    II           0     0    24  
 8 Drug A       32  1.74  T1    I            0     1    18.4
 9 Drug A       31  0.144 T1    II           0     0    24  
10 Drug B       34  0.205 T3    I            0     1    10.5
# ℹ 190 more rows
trial1 <- trial %>%
  dplyr::mutate(
    response = dplyr::case_when(
      response == 1 ~ "success",
      response == 0 ~ "fail",
      TRUE ~ as.character(response)
    ),
    death = dplyr::case_when(
      death == 1 ~ "Alive",
      death == 0 ~ "Dead",
      TRUE ~ as.character(death)
    )
  )


trial1$age <- as.numeric(trial1$age)
trial1 %>%
  dplyr::select(age, marker, ttdeath, grade, stage, response, death,Treatment) %>%
  tbl_summary(
    by = Treatment,
    missing = "always",
    missing_text = "Missing" ,
    statistic = list(
      all_of(c("age", "ttdeath")) ~ "{mean} ({sd})",
      all_of(c("marker")) ~ "{median} ({IQR})",
      all_of(c("grade", "response", "stage", "death")) ~ "{n} / {N} ({p}%)"
    )
  ) %>%
  add_stat_label() %>% add_overall() %>% 
  bold_labels() %>%
  modify_header(
    label ~ "**Characteristics**",
    all_stat_cols() ~ "**{level}**"
  ) %>%
  modify_spanning_header(
    all_stat_cols()~ "**Treatment group**"
  ) %>%
  as_gt() %>%
  gt::tab_header(
    title = gt::md("**Table 1. Demographic Characteristics**"),
    subtitle = gt::md("")
  ) %>%
  gt::tab_source_note(source_note = "Data updated May, 2025")
Table 1. Demographic Characteristics
Characteristics
Treatment group
Overall1 Drug A Drug B
age, Mean (SD) 47 (14) 47 (15) 47 (14)
    Missing 11 7 4
Marker Level (ng/mL), Median (IQR) 0.64 (1.18) 0.84 (1.34) 0.52 (1.02)
    Missing 10 6 4
Months to Death/Censor, Mean (SD) 19.6 (5.3) 20.2 (5.0) 19.0 (5.5)
    Missing 0 0 0
Grade, n / N (%)


    I 68 / 200 (34%) 35 / 98 (36%) 33 / 102 (32%)
    II 68 / 200 (34%) 32 / 98 (33%) 36 / 102 (35%)
    III 64 / 200 (32%) 31 / 98 (32%) 33 / 102 (32%)
    Missing 0 0 0
T Stage, n / N (%)


    T1 53 / 200 (27%) 28 / 98 (29%) 25 / 102 (25%)
    T2 54 / 200 (27%) 25 / 98 (26%) 29 / 102 (28%)
    T3 43 / 200 (22%) 22 / 98 (22%) 21 / 102 (21%)
    T4 50 / 200 (25%) 23 / 98 (23%) 27 / 102 (26%)
    Missing 0 0 0
response, n / N (%)


    fail 132 / 193 (68%) 67 / 95 (71%) 65 / 98 (66%)
    success 61 / 193 (32%) 28 / 95 (29%) 33 / 98 (34%)
    Missing 7 3 4
death, n / N (%)


    Alive 112 / 200 (56%) 52 / 98 (53%) 60 / 102 (59%)
    Dead 88 / 200 (44%) 46 / 98 (47%) 42 / 102 (41%)
    Missing 0 0 0
Data updated May, 2025
1 Mean (SD); Median (IQR); n / N (%)

Assumptions Testing

Checking for normality of continuosus variables

# Q-Q plot for age
ggplot(trial, aes(sample = age)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  facet_wrap(~ Treatment, scales = "free_y") +
  labs(
    title = "Q-Q Plot of Age by Treatment Group",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_qq_line()`).

# Q-Q plot for marker
ggplot(trial1, aes(sample = marker)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  facet_wrap(~ Treatment, scales = "free_y") +
  labs(
    title = "Q-Q Plot of Marker Level by Treatment Group",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_qq_line()`).

Checking for linearity assumption using restricted cubic splines for the logit model and the analysis of deviance

Full multivariable logistic regression model

model_response_multi_full <- glm(response ~ Treatment + age + marker + stage + grade, data = trial, family = "binomial")  
summary(model_response_multi_full)

Call:
glm(formula = response ~ Treatment + age + marker + stage + grade, 
    family = "binomial", data = trial)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)  
(Intercept)     -1.85393    0.73976  -2.506   0.0122 *
TreatmentDrug B  0.29470    0.34084   0.865   0.3872  
age              0.01901    0.01193   1.593   0.1112  
marker           0.34596    0.19688   1.757   0.0789 .
stageT2         -0.80936    0.47127  -1.717   0.0859 .
stageT3         -0.14589    0.48377  -0.302   0.7630  
stageT4         -0.43985    0.47316  -0.930   0.3526  
gradeII          0.03926    0.42852   0.092   0.9270  
gradeIII         0.03984    0.40810   0.098   0.9222  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 214.80  on 172  degrees of freedom
Residual deviance: 205.89  on 164  degrees of freedom
  (27 observations deleted due to missingness)
AIC: 223.89

Number of Fisher Scoring iterations: 4
OR <- exp(coef(model_response_multi_full))
CI <- exp(confint(model_response_multi_full))
Waiting for profiling to be done...
terms <- names(OR)

logit_df <- data.frame(
  Term = terms,
  OR = OR,
  Lower = CI[, 1],
  Upper = CI[, 2]
)

# Remove intercept
logit_df <- logit_df[logit_df$Term != "(Intercept)", ]

# Plot
library(ggplot2)

ggplot(logit_df, aes(x = reorder(Term, OR), y = OR)) +
  geom_point() +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Forest Plot: Odds Ratios from Logistic Regression",
       x = "Covariates", y = "Odds Ratio (95% CI)") +
  theme_minimal()

library(rms) 
Loading required package: Hmisc

Attaching package: 'Hmisc'
The following objects are masked from 'package:dplyr':

    src, summarize
The following objects are masked from 'package:base':

    format.pval, units

Attaching package: 'rms'
The following objects are masked from 'package:car':

    Predict, vif
dd <- datadist(trial) 
options(datadist = "dd")
logistic1 <- glm(response ~ rcs(age, 3) + Treatment + marker + stage + grade, data = trial, family = "binomial")
logistic2 <- glm(response ~ age + Treatment + rcs(marker, 3) + stage + grade, data = trial, family = "binomial") 

Comparing the cubic spline model for the marker with the full model

anova(model_response_multi_full, logistic2, test = "LRT")
Analysis of Deviance Table

Model 1: response ~ Treatment + age + marker + stage + grade
Model 2: response ~ age + Treatment + rcs(marker, 3) + stage + grade
  Resid. Df Resid. Dev Df  Deviance Pr(>Chi)
1       164     205.89                      
2       163     205.89  1 0.0054495   0.9412

The difference in deviance between the models is very small (0.0054). The non-linear spline term for marker does not significantly improve the model (p = 0.9412), so the linearity assumption for marker holds.

Comparing the cubic spline model for age with the full model

anova(model_response_multi_full,logistic1, test = "LRT")
Analysis of Deviance Table

Model 1: response ~ Treatment + age + marker + stage + grade
Model 2: response ~ rcs(age, 3) + Treatment + marker + stage + grade
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1       164     205.89                     
2       163     205.88  1 0.010616   0.9179

The difference in deviance between the models is very small (0.010616). The non-linear spline term for age does not significantly improve the model (p = 0.9179), so the linearity assumption for age holds.

Box-Tidwell transformation for age

boxTidwell(response ~ age, data = trial)
 MLE of lambda Score Statistic (t) Pr(>|t|)
        1.2722              0.1624   0.8712

iterations =  8 

Box-Tidwell transformation for marker

boxTidwell(response ~ marker, data = trial)
 MLE of lambda Score Statistic (t) Pr(>|t|)
       0.47004             -0.2639   0.7921

iterations =  7 

Cox regression linearity assumptions

Full multivariable Cox regression model

surv_object <- Surv(time = trial$ttdeath, event = trial$death) 

model_survival_multi_full <- coxph(surv_object ~ Treatment + age + marker + stage + grade, data = trial)  


summary(model_survival_multi_full)
Call:
coxph(formula = surv_object ~ Treatment + age + marker + stage + 
    grade, data = trial)

  n= 179, number of events= 95 
   (21 observations deleted due to missingness)

                     coef exp(coef)  se(coef)      z Pr(>|z|)    
TreatmentDrug B  0.332808  1.394880  0.208613  1.595 0.110638    
age              0.010252  1.010304  0.007487  1.369 0.170914    
marker          -0.121121  0.885927  0.128692 -0.941 0.346618    
stageT2          0.420776  1.523144  0.312899  1.345 0.178701    
stageT3          0.353370  1.423858  0.339888  1.040 0.298494    
stageT4          1.173846  3.234410  0.303136  3.872 0.000108 ***
gradeII          0.140104  1.150394  0.268726  0.521 0.602112    
gradeIII         0.527365  1.694462  0.245990  2.144 0.032045 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                exp(coef) exp(-coef) lower .95 upper .95
TreatmentDrug B    1.3949     0.7169    0.9268     2.099
age                1.0103     0.9898    0.9956     1.025
marker             0.8859     1.1288    0.6884     1.140
stageT2            1.5231     0.6565    0.8249     2.812
stageT3            1.4239     0.7023    0.7314     2.772
stageT4            3.2344     0.3092    1.7855     5.859
gradeII            1.1504     0.8693    0.6794     1.948
gradeIII           1.6945     0.5902    1.0463     2.744

Concordance= 0.637  (se = 0.03 )
Likelihood ratio test= 25.18  on 8 df,   p=0.001
Wald test            = 25.43  on 8 df,   p=0.001
Score (logrank) test = 26.7  on 8 df,   p=8e-04
HR <- exp(coef(model_survival_multi_full))
CI <- exp(confint(model_survival_multi_full))
terms <- names(HR)

cox_df <- data.frame(
  Term = terms,
  HR = HR,
  Lower = CI[, 1],
  Upper = CI[, 2]
)

# Plot
ggplot(cox_df, aes(x = reorder(Term, HR), y = HR)) +
  geom_point() +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Forest Plot: Hazard Ratios from Cox Model",
       x = "Covariates", y = "Hazard Ratio (95% CI)") +
  theme_minimal()

Cox1 <- (coxph(surv_object ~ Treatment + age + rcs(marker, 3) + stage + grade, data = trial))
Cox2 <- (coxph(surv_object ~ Treatment + rcs(age, 3) + marker + stage + grade, data = trial))
anova(Cox1, model_survival_multi_full, test = "LRT")
Analysis of Deviance Table
 Cox model: response is  surv_object
 Model 1: ~ Treatment + age + rcs(marker, 3) + stage + grade
 Model 2: ~ Treatment + age + marker + stage + grade
   loglik  Chisq Df Pr(>|Chi|)
1 -448.97                     
2 -449.14 0.3418  1     0.5588

The effect of the marker to the log-hazard is linear

anova(Cox2, model_survival_multi_full, test = "LRT")
Analysis of Deviance Table
 Cox model: response is  surv_object
 Model 1: ~ Treatment + rcs(age, 3) + marker + stage + grade
 Model 2: ~ Treatment + age + marker + stage + grade
   loglik  Chisq Df Pr(>|Chi|)
1 -449.14                     
2 -449.14 0.0029  1      0.957

The effect of age to the log-hazard is linear

Multi-collinearity for the logistic regression model

vif(model_response_multi_full)
TreatmentDrug B             age          marker         stageT2         stageT3 
       1.017786        1.012350        1.086460        1.468179        1.432594 
        stageT4         gradeII        gradeIII 
       1.417034        1.362658        1.307661 

Analysis of deviance to assess predictor variables that improve the logistic model

(anova(glm(response ~ Treatment + age + marker + stage + grade, data = trial, family = "binomial")))
Analysis of Deviance Table

Model: binomial, link: logit

Response: response

Terms added sequentially (first to last)

          Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL                        172     214.80         
Treatment  1   0.3931       171     214.40   0.5307
age        1   2.6158       170     211.79   0.1058
marker     1   2.3914       169     209.40   0.1220
stage      3   3.4927       166     205.90   0.3217
grade      2   0.0122       164     205.89   0.9939

Backward elimination (threshold p > 0.10) to validate the analysis of deviance

model_response_multi_step <- step(model_response_multi_full, direction = "backward", trace = FALSE)

summary(model_response_multi_step)

Call:
glm(formula = response ~ age + marker, family = "binomial", data = trial)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept) -1.95041    0.62357  -3.128  0.00176 **
age          0.01880    0.01172   1.604  0.10882   
marker       0.27647    0.18640   1.483  0.13801   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 214.80  on 172  degrees of freedom
Residual deviance: 209.97  on 170  degrees of freedom
  (27 observations deleted due to missingness)
AIC: 215.97

Number of Fisher Scoring iterations: 4

Multi-collinearity for the Cox model

vif(model_survival_multi_full)
TreatmentDrug B             age          marker         stageT2         stageT3 
       1.010961        1.057127        1.069657        1.845739        1.682669 
        stageT4         gradeII        gradeIII 
       1.949996        1.392637        1.371761 

Analysis of deviance to assess predictor variables that improve the Cox regression model

(anova(coxph(surv_object ~ Treatment + age + marker + stage + grade, data = trial))) 
Analysis of Deviance Table
 Cox model: response is surv_object
Terms added sequentially (first to last)

           loglik   Chisq Df Pr(>|Chi|)    
NULL      -461.73                          
Treatment -460.54  2.3854  1  0.1224763    
age       -460.20  0.6704  1  0.4129019    
marker    -460.16  0.0866  1  0.7685735    
stage     -451.60 17.1202  3  0.0006676 ***
grade     -449.14  4.9145  2  0.0856700 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Constructing the final model that retains all clinically important variables

tbl1 <-
  glm(response ~ Treatment + age + grade + stage + marker, trial, family = binomial) %>%
  tbl_regression(exponentiate = TRUE) %>% bold_p() %>% add_global_p()
tbl2 <-
  coxph(Surv(ttdeath, death) ~ Treatment + age + grade + stage + marker, trial) %>%
  tbl_regression(exponentiate = TRUE) %>% bold_p()  %>% add_global_p()
tbl_merge_1 <-
  tbl_merge(
    tbls = list(tbl1, tbl2),
    tab_spanner = c("**Tumor Response**", "**Time to Death**")) %>% bold_labels()%>%
  modify_spanning_header(all_stat_cols() ~ "**Treatment group**") %>%
  as_gt() %>%
  gt::tab_header(
    title = gt::md("**Table 2. Associations of Clinical and Treatment Factors with Tumor Response and Time to Death**"),
    subtitle = gt::md("_Highly Confidential_")
  ) %>%
  gt::tab_source_note("Concordance= 0.637  (se = 0.03 )
Likelihood ratio test= 25.18  on 8 df,   p=0.001
Wald test            = 25.43  on 8 df,   p=0.001
Score (logrank) test = 26.7  on 8 df,   p=8e-04")

tbl_merge_1
Table 2. Associations of Clinical and Treatment Factors with Tumor Response and Time to Death
Highly Confidential
Characteristic
Tumor Response
Time to Death
OR 95% CI p-value HR 95% CI p-value
Chemotherapy Treatment

0.4

0.11
    Drug A

    Drug B 1.34 0.69, 2.64
1.39 0.93, 2.10
Age 1.02 1.00, 1.04 0.11 1.01 1.00, 1.03 0.2
Grade

>0.9

0.086
    I

    II 1.04 0.45, 2.42
1.15 0.68, 1.95
    III 1.04 0.47, 2.32
1.69 1.05, 2.74
T Stage

0.3

<0.001
    T1

    T2 0.45 0.17, 1.11
1.52 0.82, 2.81
    T3 0.86 0.33, 2.22
1.42 0.73, 2.77
    T4 0.64 0.25, 1.62
3.23 1.79, 5.86
Marker Level (ng/mL) 1.41 0.96, 2.09 0.080 0.89 0.69, 1.14 0.3
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio, OR = Odds Ratio
Concordance= 0.637 (se = 0.03 ) Likelihood ratio test= 25.18 on 8 df, p=0.001 Wald test = 25.43 on 8 df, p=0.001 Score (logrank) test = 26.7 on 8 df, p=8e-04

Schoenfeld residuals for Cox regression assumptions

test_ph <- cox.zph(model_survival_multi_full) 
plot(test_ph)