Multivariable logistic regression modelling

Author

Sergio Uribe

CREATED

October 6, 2024

UPDATED

October 6, 2024

Packages

pacman::p_load(
  tidyverse, 
  carData, # for the Titanic dataset
  gtsummary, # for regresion tables
  janitor, 
  mgcv, # for non-linearity check
  performance, # to check assumptions
  lme4, # for the mixed effects log reg
  effects, # for ggeffects
  ggeffects, # to extract and visualize predictions
  emmeans, # for the table
  flextable, # to export the table to doc
  sjPlot, 
  here, 
  randomForest, # for classification
  vip, # to visualize the model importance variables
  effectsize, 
  pROC, 
  caret, 
  equatiomatic # to extract the formula
)

The dataset

df <- carData::TitanicSurvival |> 
  filter(!is.na(age))
glimpse(df) 
Rows: 1,046
Columns: 4
$ survived       <fct> yes, yes, no, no, no, yes, yes, no, yes, no, no, yes, y…
$ sex            <fct> female, male, female, male, female, male, female, male,…
$ age            <dbl> 29.0000, 0.9167, 2.0000, 30.0000, 25.0000, 48.0000, 63.…
$ passengerClass <fct> 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, …
tabyl(df$survived) |> 
  knitr::kable()
df$survived n percent
no 619 0.5917782
yes 427 0.4082218

The model

m <- glm(formula = survived ~ sex + age + passengerClass, 
         data = df, 
         family = binomial)

Check non-linearity

mgcv::gam(survived ~sex + s(age) + passengerClass, 
          df, 
          family = binomial) |> 
  plot()

so far so good

check all model assumptions

performance::check_model(m)

mixed-effects logistic regression

m2 <- lme4::glmer(survived ~ age + sex + (1|passengerClass), # for repeated measures
                  data = df, 
                  family = binomial)
performance::check_model(m2)

extract and visualize predictions

ggeffects::ggeffect(m) 
$sex
# Predicted probabilities of survived

sex    | Predicted |     95% CI
-------------------------------
female |      0.75 | 0.70, 0.79
male   |      0.19 | 0.16, 0.23


$age
# Predicted probabilities of survived

age | Predicted |     95% CI
----------------------------
  0 |      0.63 | 0.54, 0.72
 10 |      0.55 | 0.48, 0.62
 20 |      0.46 | 0.41, 0.51
 30 |      0.38 | 0.34, 0.42
 40 |      0.30 | 0.26, 0.35
 50 |      0.23 | 0.18, 0.29
 60 |      0.18 | 0.13, 0.25
 80 |      0.10 | 0.05, 0.17

Not all rows are shown in the output. Use `print(..., n = Inf)` to show
  all rows.

$passengerClass
# Predicted probabilities of survived

passengerClass | Predicted |     95% CI
---------------------------------------
1st            |      0.72 | 0.65, 0.78
2nd            |      0.41 | 0.34, 0.48
3rd            |      0.20 | 0.16, 0.25


attr(,"class")
[1] "ggalleffects" "list"        
attr(,"model.name")
[1] "m"

predictions 2

ggeffects::predict_response(m, terms = "age [all]")
# Predicted probabilities of survived

  age | Predicted |     95% CI
------------------------------
 0.17 |      0.97 | 0.95, 0.98
 6.00 |      0.96 | 0.94, 0.98
16.00 |      0.95 | 0.92, 0.97
24.00 |      0.94 | 0.91, 0.96
32.50 |      0.92 | 0.88, 0.94
40.50 |      0.89 | 0.85, 0.92
51.00 |      0.85 | 0.80, 0.90
80.00 |      0.68 | 0.54, 0.80

Adjusted for:
*            sex = female
* passengerClass =    1st

Not all rows are shown in the output. Use `print(..., n = Inf)` to show
  all rows.

visualize predictions

ggeffects::ggeffect(m) |> 
  plot() |> 
  sjPlot::plot_grid()
Warning in sjPlot::plot_grid(plot(ggeffects::ggeffect(m))): Not enough tags
labels in list. Using letters instead.

table gtsummary

m |> 
  gtsummary::tbl_regression(exponentiate = T, 
                            add_pairwise_contrasts = T, 
                            contrast_adjust = "bonferroni", 
                            pairwise_reverse = F, 
                            pvalue_fun = ~style_pvalue(.x, digits = 3)) |> 
  bold_p()
! `broom::tidy()` failed to tidy the model.
✔ `tidy_parameters()` used instead.
ℹ Add `tidy_fun = broom.helpers::tidy_parameters` to quiet these messages.
Characteristic OR1 95% CI1 p-value
sex


    female / male 12.2 8.78, 16.8 <0.001
age 0.97 0.95, 0.98 <0.001
passengerClass


    1st / 2nd 3.60 2.12, 6.11 <0.001
    1st / 3rd 9.87 5.82, 16.8 <0.001
    2nd / 3rd 2.74 1.72, 4.37 <0.001
1 OR = Odds Ratio, CI = Confidence Interval

save the table

# m |> 
#   gtsummary::tbl_regression(exponentiate = T, 
#                             add_pairwise_contrasts = T, 
#                             contrast_adjust = "bonferroni", 
#                             pairwise_reverse = F, 
#                             pvalue_fun = ~style_pvalue(.x, digits = 3)) |> 
#   bold_p() |> 
#   # the table exported
#   as_flex_table() |> 
#   save_as_docx(path = here("tables", "table_x.docx"))

export the model formula

equatiomatic::extract_eq(m)

\[ \log\left[ \frac { P( \operatorname{survived} = \operatorname{yes} ) }{ 1 - P( \operatorname{survived} = \operatorname{yes} ) } \right] = \alpha + \beta_{1}(\operatorname{sex}_{\operatorname{male}}) + \beta_{2}(\operatorname{age}) + \beta_{3}(\operatorname{passengerClass}_{\operatorname{2nd}}) + \beta_{4}(\operatorname{passengerClass}_{\operatorname{3rd}}) \]

variable importance

car::Anova(m) 
Analysis of Deviance Table (Type II tests)

Response: survived
               LR Chisq Df Pr(>Chisq)    
sex             273.239  1  < 2.2e-16 ***
age              31.344  1  2.161e-08 ***
passengerClass  118.886  2  < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

the LR can be interpreted as effect size

random forest

randomForest::randomForest(survived ~sex + age + passengerClass, 
                           data = df)

Call:
 randomForest(formula = survived ~ sex + age + passengerClass,      data = df) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 1

        OOB estimate of  error rate: 20.65%
Confusion matrix:
     no yes class.error
no  579  40  0.06462036
yes 176 251  0.41217799
vip::vip(randomForest::randomForest(survived ~sex + age + passengerClass, 
                           data = df))

model performance and effect size

performance::performance(m) |> 
  knitr::kable()
AIC AICc BIC R2_Tjur RMSE Sigma Log_loss Score_log Score_spherical PCP
992.4531 992.5108 1017.217 0.3762437 0.3872188 1 0.4696239 -Inf 0.0012374 0.6986299
?interpret_r2
Tjur's R2
effectsize::interpret_r2(0.376)
[1] "substantial"
(Rules: cohen1988)

The three predictors explain almost 38% of the survival variance

effectsize::interpret_oddsratio(12.2)
[1] "large"
(Rules: chen2010)

ROC curve

pROC::roc(survived ~fitted.values(m), 
          data = df, 
          plot = T, 
          legacy_axes = T, 
          print.auc = T, 
          ci = T)
Setting levels: control = no, case = yes
Setting direction: controls < cases


Call:
roc.formula(formula = survived ~ fitted.values(m), data = df,     plot = T, legacy_axes = T, print.auc = T, ci = T)

Data: fitted.values(m) in 619 controls (survived no) < 427 cases (survived yes).
Area under the curve: 0.8401
95% CI: 0.8141-0.8661 (DeLong)

confusion matrix

df |>
  mutate(pred_probs = as.vector(fitted.values(m))) |>
  mutate(predicted_outcome = case_when(pred_probs  > 0.5 ~ "yes", 
                                       TRUE ~               "no")) |>
  with(table(survived, predicted_outcome)) |>
  caret::confusionMatrix()
Confusion Matrix and Statistics

        predicted_outcome
survived  no yes
     no  520  99
     yes 126 301
                                          
               Accuracy : 0.7849          
                 95% CI : (0.7587, 0.8094)
    No Information Rate : 0.6176          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.5504          
                                          
 Mcnemar's Test P-Value : 0.08304         
                                          
            Sensitivity : 0.8050          
            Specificity : 0.7525          
         Pos Pred Value : 0.8401          
         Neg Pred Value : 0.7049          
             Prevalence : 0.6176          
         Detection Rate : 0.4971          
   Detection Prevalence : 0.5918          
      Balanced Accuracy : 0.7787          
                                          
       'Positive' Class : no              
                                          

Comparison with univariable model

gtsummary::tbl_regression(
  glm(survived ~age, 
      data = df, 
      family = binomial), 
  exponentiate = T)
Characteristic OR1 95% CI1 p-value
age 0.99 0.98, 1.00 0.073
1 OR = Odds Ratio, CI = Confidence Interval