pacman::p_load(
tidyverse,
carData, # for the Titanic dataset
gtsummary, # for regresion tables
janitor,
mgcv, # for non-linearity check
performance, # to check assumptions
lme4, # for the mixed effects log reg
effects, # for ggeffects
ggeffects, # to extract and visualize predictions
emmeans, # for the table
flextable, # to export the table to doc
sjPlot,
here,
randomForest, # for classification
vip, # to visualize the model importance variables
effectsize,
pROC,
caret,
equatiomatic # to extract the formula
)Multivariable logistic regression modelling
Packages
The dataset
df <- carData::TitanicSurvival |>
filter(!is.na(age))glimpse(df) Rows: 1,046
Columns: 4
$ survived <fct> yes, yes, no, no, no, yes, yes, no, yes, no, no, yes, y…
$ sex <fct> female, male, female, male, female, male, female, male,…
$ age <dbl> 29.0000, 0.9167, 2.0000, 30.0000, 25.0000, 48.0000, 63.…
$ passengerClass <fct> 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, …
tabyl(df$survived) |>
knitr::kable()| df$survived | n | percent |
|---|---|---|
| no | 619 | 0.5917782 |
| yes | 427 | 0.4082218 |
The model
m <- glm(formula = survived ~ sex + age + passengerClass,
data = df,
family = binomial)Check non-linearity
mgcv::gam(survived ~sex + s(age) + passengerClass,
df,
family = binomial) |>
plot()so far so good
check all model assumptions
performance::check_model(m)mixed-effects logistic regression
m2 <- lme4::glmer(survived ~ age + sex + (1|passengerClass), # for repeated measures
data = df,
family = binomial)performance::check_model(m2)extract and visualize predictions
ggeffects::ggeffect(m) $sex
# Predicted probabilities of survived
sex | Predicted | 95% CI
-------------------------------
female | 0.75 | 0.70, 0.79
male | 0.19 | 0.16, 0.23
$age
# Predicted probabilities of survived
age | Predicted | 95% CI
----------------------------
0 | 0.63 | 0.54, 0.72
10 | 0.55 | 0.48, 0.62
20 | 0.46 | 0.41, 0.51
30 | 0.38 | 0.34, 0.42
40 | 0.30 | 0.26, 0.35
50 | 0.23 | 0.18, 0.29
60 | 0.18 | 0.13, 0.25
80 | 0.10 | 0.05, 0.17
Not all rows are shown in the output. Use `print(..., n = Inf)` to show
all rows.
$passengerClass
# Predicted probabilities of survived
passengerClass | Predicted | 95% CI
---------------------------------------
1st | 0.72 | 0.65, 0.78
2nd | 0.41 | 0.34, 0.48
3rd | 0.20 | 0.16, 0.25
attr(,"class")
[1] "ggalleffects" "list"
attr(,"model.name")
[1] "m"
predictions 2
ggeffects::predict_response(m, terms = "age [all]")# Predicted probabilities of survived
age | Predicted | 95% CI
------------------------------
0.17 | 0.97 | 0.95, 0.98
6.00 | 0.96 | 0.94, 0.98
16.00 | 0.95 | 0.92, 0.97
24.00 | 0.94 | 0.91, 0.96
32.50 | 0.92 | 0.88, 0.94
40.50 | 0.89 | 0.85, 0.92
51.00 | 0.85 | 0.80, 0.90
80.00 | 0.68 | 0.54, 0.80
Adjusted for:
* sex = female
* passengerClass = 1st
Not all rows are shown in the output. Use `print(..., n = Inf)` to show
all rows.
visualize predictions
ggeffects::ggeffect(m) |>
plot() |>
sjPlot::plot_grid()Warning in sjPlot::plot_grid(plot(ggeffects::ggeffect(m))): Not enough tags
labels in list. Using letters instead.
table gtsummary
m |>
gtsummary::tbl_regression(exponentiate = T,
add_pairwise_contrasts = T,
contrast_adjust = "bonferroni",
pairwise_reverse = F,
pvalue_fun = ~style_pvalue(.x, digits = 3)) |>
bold_p()! `broom::tidy()` failed to tidy the model.
✔ `tidy_parameters()` used instead.
ℹ Add `tidy_fun = broom.helpers::tidy_parameters` to quiet these messages.
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| sex | |||
| female / male | 12.2 | 8.78, 16.8 | <0.001 |
| age | 0.97 | 0.95, 0.98 | <0.001 |
| passengerClass | |||
| 1st / 2nd | 3.60 | 2.12, 6.11 | <0.001 |
| 1st / 3rd | 9.87 | 5.82, 16.8 | <0.001 |
| 2nd / 3rd | 2.74 | 1.72, 4.37 | <0.001 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
save the table
# m |>
# gtsummary::tbl_regression(exponentiate = T,
# add_pairwise_contrasts = T,
# contrast_adjust = "bonferroni",
# pairwise_reverse = F,
# pvalue_fun = ~style_pvalue(.x, digits = 3)) |>
# bold_p() |>
# # the table exported
# as_flex_table() |>
# save_as_docx(path = here("tables", "table_x.docx"))export the model formula
equatiomatic::extract_eq(m)\[ \log\left[ \frac { P( \operatorname{survived} = \operatorname{yes} ) }{ 1 - P( \operatorname{survived} = \operatorname{yes} ) } \right] = \alpha + \beta_{1}(\operatorname{sex}_{\operatorname{male}}) + \beta_{2}(\operatorname{age}) + \beta_{3}(\operatorname{passengerClass}_{\operatorname{2nd}}) + \beta_{4}(\operatorname{passengerClass}_{\operatorname{3rd}}) \]
variable importance
car::Anova(m) Analysis of Deviance Table (Type II tests)
Response: survived
LR Chisq Df Pr(>Chisq)
sex 273.239 1 < 2.2e-16 ***
age 31.344 1 2.161e-08 ***
passengerClass 118.886 2 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
the LR can be interpreted as effect size
random forest
randomForest::randomForest(survived ~sex + age + passengerClass,
data = df)
Call:
randomForest(formula = survived ~ sex + age + passengerClass, data = df)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 1
OOB estimate of error rate: 20.65%
Confusion matrix:
no yes class.error
no 579 40 0.06462036
yes 176 251 0.41217799
vip::vip(randomForest::randomForest(survived ~sex + age + passengerClass,
data = df))model performance and effect size
performance::performance(m) |>
knitr::kable()| AIC | AICc | BIC | R2_Tjur | RMSE | Sigma | Log_loss | Score_log | Score_spherical | PCP |
|---|---|---|---|---|---|---|---|---|---|
| 992.4531 | 992.5108 | 1017.217 | 0.3762437 | 0.3872188 | 1 | 0.4696239 | -Inf | 0.0012374 | 0.6986299 |
?interpret_r2Tjur's R2
effectsize::interpret_r2(0.376)[1] "substantial"
(Rules: cohen1988)
The three predictors explain almost 38% of the survival variance
effectsize::interpret_oddsratio(12.2)[1] "large"
(Rules: chen2010)
ROC curve
pROC::roc(survived ~fitted.values(m),
data = df,
plot = T,
legacy_axes = T,
print.auc = T,
ci = T)Setting levels: control = no, case = yes
Setting direction: controls < cases
Call:
roc.formula(formula = survived ~ fitted.values(m), data = df, plot = T, legacy_axes = T, print.auc = T, ci = T)
Data: fitted.values(m) in 619 controls (survived no) < 427 cases (survived yes).
Area under the curve: 0.8401
95% CI: 0.8141-0.8661 (DeLong)
confusion matrix
df |>
mutate(pred_probs = as.vector(fitted.values(m))) |>
mutate(predicted_outcome = case_when(pred_probs > 0.5 ~ "yes",
TRUE ~ "no")) |>
with(table(survived, predicted_outcome)) |>
caret::confusionMatrix()Confusion Matrix and Statistics
predicted_outcome
survived no yes
no 520 99
yes 126 301
Accuracy : 0.7849
95% CI : (0.7587, 0.8094)
No Information Rate : 0.6176
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.5504
Mcnemar's Test P-Value : 0.08304
Sensitivity : 0.8050
Specificity : 0.7525
Pos Pred Value : 0.8401
Neg Pred Value : 0.7049
Prevalence : 0.6176
Detection Rate : 0.4971
Detection Prevalence : 0.5918
Balanced Accuracy : 0.7787
'Positive' Class : no
Comparison with univariable model
gtsummary::tbl_regression(
glm(survived ~age,
data = df,
family = binomial),
exponentiate = T)| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| age | 0.99 | 0.98, 1.00 | 0.073 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||