library(tidyverse)
library(readxl)
library(rstatix)
library(ggpubr)
library(broom)
library(pROC)
library(caret)
library(rmda)
library(kableExtra)
library(MASS)
library(dcurves)
covid_data <- read_excel("covidpredict-1.xlsx")
covid_data <- covid_data %>% mutate(mortality = factor(mortality, levels = c(0,1), labels = c("Alive", "Dead")))
str(covid_data)
## tibble [5,782 × 11] (S3: tbl_df/tbl/data.frame)
## $ rr : num [1:5782] 33 25 26 14 17 42 19 22 33 22 ...
## $ oxygen_sat : num [1:5782] 90 92 89 80 90 92 91 92 99 94 ...
## $ urea : num [1:5782] 26.2 21.3 5.6 24.4 6.7 17.5 4.1 21.2 3.7 10.3 ...
## $ crp : num [1:5782] 130 171 164 137 296 ...
## $ gcs : num [1:5782] 13 15 15 15 15 13 15 15 15 15 ...
## $ age : num [1:5782] 54 86 86 88 86 55 76 85 70 91 ...
## $ female : num [1:5782] 0 0 1 0 0 1 0 1 0 0 ...
## $ comorbidity: num [1:5782] 2 2 1 2 0 1 1 1 2 2 ...
## $ date : POSIXct[1:5782], format: "2020-02-06 00:00:00" "2020-02-06 00:42:14" ...
## $ set : chr [1:5782] "dev" "dev" "dev" "dev" ...
## $ mortality : Factor w/ 2 levels "Alive","Dead": 1 2 2 2 1 2 2 1 2 1 ...
numerical_vars <- c("age", "rr", "oxygen_sat", "gcs", "urea", "crp")
categorical_vars <- c("female", "comorbidity", "set")
boxplots <- lapply(numerical_vars, function(var) {
  ggplot(covid_data, aes(x = mortality, y = .data[[var]], fill = mortality)) +
    geom_boxplot(outlier.color = "red") +
    labs(title = paste("Boxplot of", var, "by Mortality")) +
    theme_minimal()
})
ggarrange(plotlist = boxplots, ncol = 2)
[Figure: boxplots of age, rr, oxygen_sat, gcs, urea, and crp by mortality, arranged two per page]
density_plots <- lapply(numerical_vars, function(var) {
  ggplot(covid_data, aes(x = .data[[var]], fill = mortality, color = mortality)) +
    geom_density(alpha = 0.5) +
    labs(title = paste("Density Plot of", var)) +
    theme_minimal()
})
ggarrange(plotlist = density_plots, ncol = 2)
[Figure: density plots of age, rr, oxygen_sat, gcs, urea, and crp by mortality, arranged two per page]
normality_results <- map_dfr(numerical_vars, function(var) {
  train_vals <- na.omit(covid_data %>% filter(set == "dev") %>% pull(var))
  val_vals <- na.omit(covid_data %>% filter(set == "val") %>% pull(var))
  sw_train <- shapiro.test(train_vals)  # run each test once and reuse the result
  sw_val <- shapiro.test(val_vals)
  tibble(
    Variable = var,
    Shapiro_W_p_train = sw_train$p.value,
    Shapiro_W_stat_train = sw_train$statistic,
    Shapiro_W_p_val = sw_val$p.value,
    Shapiro_W_stat_val = sw_val$statistic
  )
})
kable(normality_results, digits = 4) %>% kable_styling()
| Variable | Shapiro_W_p_train | Shapiro_W_stat_train | Shapiro_W_p_val | Shapiro_W_stat_val |
|---|---|---|---|---|
| age | 0 | 0.9663 | 0 | 0.9652 |
| rr | 0 | 0.9734 | 0 | 0.9749 |
| oxygen_sat | 0 | 0.9708 | 0 | 0.9739 |
| gcs | 0 | 0.5469 | 0 | 0.5482 |
| urea | 0 | 0.8704 | 0 | 0.8846 |
| crp | 0 | 0.9135 | 0 | 0.9203 |
mannwhitney_results <- map_df(numerical_vars, function(var) {
  res <- wilcox_test(covid_data, as.formula(paste(var, "~ mortality")))
  res %>% mutate(Variable = var)
})
kable(mannwhitney_results, digits = 3) %>% kable_styling()
| .y. | group1 | group2 | n1 | n2 | statistic | p | Variable |
|---|---|---|---|---|---|---|---|
| age | Alive | Dead | 3527 | 2255 | 2729391 | 0 | age |
| rr | Alive | Dead | 3527 | 2255 | 2874189 | 0 | rr |
| oxygen_sat | Alive | Dead | 3527 | 2255 | 4986468 | 0 | oxygen_sat |
| gcs | Alive | Dead | 3527 | 2255 | 4689462 | 0 | gcs |
| urea | Alive | Dead | 3527 | 2255 | 2672199 | 0 | urea |
| crp | Alive | Dead | 3527 | 2255 | 2868646 | 0 | crp |
cat_plots <- lapply(categorical_vars, function(var) {
  ggplot(covid_data, aes(x = factor(.data[[var]]), fill = mortality)) +
    geom_bar(position = "fill") +
    labs(title = paste("Stacked Bar Plot of", var, "by Mortality"), x = var, y = "Proportion") +
    theme_minimal()
})
ggarrange(plotlist = cat_plots, ncol = 2)
[Figure: stacked bar plots of female, comorbidity, and set by mortality]
comorbidity_test <- table(covid_data$comorbidity, covid_data$mortality)
chisq.test(comorbidity_test)
##
## Pearson's Chi-squared test
##
## data: comorbidity_test
## X-squared = 52.511, df = 2, p-value = 3.958e-12
female_test <- table(covid_data$female, covid_data$mortality)
chisq.test(female_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: female_test
## X-squared = 1.3282, df = 1, p-value = 0.2491
The dataset comprises six numerical variables and two categorical variables (the number of comorbidities was treated as categorical for simplicity).
Shapiro-Wilk tests showed that none of the numerical variables is normally distributed.
Mann-Whitney U tests and chi-squared tests showed that all variables are significantly associated with mortality except sex.
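As a complement to these tests, the group differences can be summarised descriptively. The sketch below is an addition, not part of the original output: it reports the median and interquartile range of each numerical variable by mortality status, reusing the `numerical_vars` vector defined above.
# Median (IQR) of each numerical variable by outcome group
covid_data %>%
  group_by(mortality) %>%
  summarise(across(all_of(numerical_vars),
                   ~ sprintf("%.1f (%.1f-%.1f)",
                             median(.x, na.rm = TRUE),
                             quantile(.x, 0.25, na.rm = TRUE),
                             quantile(.x, 0.75, na.rm = TRUE)))) %>%
  kable(caption = "Median (IQR) by mortality status") %>%
  kable_styling()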
train_data <- covid_data %>% filter(set == "dev")
test_data <- covid_data %>% filter(set == "val")
model <- glm(mortality ~ age + rr + oxygen_sat + gcs + urea + crp + female + comorbidity,
             data = train_data, family = binomial)
model_table <- tidy(model) %>%
  mutate(across(where(is.numeric), ~ signif(.x, 4)))
kable(model_table, digits = 4) %>% kable_styling()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2427 | 0.7869 | 0.3085 | 0.7577 |
| age | 0.0463 | 0.0027 | 17.0300 | 0.0000 |
| rr | 0.0287 | 0.0047 | 6.1590 | 0.0000 |
| oxygen_sat | -0.0496 | 0.0073 | -6.8000 | 0.0000 |
| gcs | -0.1126 | 0.0255 | -4.4190 | 0.0000 |
| urea | 0.0355 | 0.0040 | 8.7720 | 0.0000 |
| crp | 0.0027 | 0.0004 | 6.4120 | 0.0000 |
| female | -0.1424 | 0.0791 | -1.8000 | 0.0718 |
| comorbidity | 0.3176 | 0.0488 | 6.5020 | 0.0000 |
The logistic regression results confirm what was expected: sex is the only variable without a statistically significant effect on mortality, although its p-value (p = 0.07) is close to the significance threshold.
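Because log-odds coefficients are hard to read clinically, they can also be reported as odds ratios with 95% confidence intervals. A minimal sketch using broom, added here for illustration:
# Exponentiate coefficients to obtain odds ratios with profile-likelihood CIs
or_table <- tidy(model, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ signif(.x, 3)))
kable(or_table) %>% kable_styling()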
# Predict probabilities and classes
prob_test <- predict(model, newdata = test_data, type = "response")
pred_class <- ifelse(prob_test > 0.5, "Dead", "Alive") %>% factor(levels = c("Alive", "Dead"))
# ROC Curve and AUC
roc_curve <- roc(test_data$mortality, prob_test)
auc_value <- auc(roc_curve)
plot(roc_curve, main = "ROC Curve - Validation Set")
cat("AUC =", round(auc_value, 3), "\n")
## AUC = 0.775
# Confusion Matrix
conf_matrix <- confusionMatrix(pred_class, test_data$mortality, positive = "Dead")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Alive Dead
## Alive 1134 421
## Dead 209 472
##
## Accuracy : 0.7182
## 95% CI : (0.6991, 0.7368)
## No Information Rate : 0.6006
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3884
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5286
## Specificity : 0.8444
## Pos Pred Value : 0.6931
## Neg Pred Value : 0.7293
## Prevalence : 0.3994
## Detection Rate : 0.2111
## Detection Prevalence : 0.3046
## Balanced Accuracy : 0.6865
##
## 'Positive' Class : Dead
##
cm_df <- as.data.frame(as.table(conf_matrix))
ggplot(cm_df, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(x = "Predicted", y = "Actual", fill = "Frequency") +
  theme_minimal() +
  ggtitle("Confusion Matrix Heatmap")
# Precision and recall for the positive class ("Dead"), taken from the confusion matrix
precision_value <- conf_matrix$byClass["Precision"]
recall_value <- conf_matrix$byClass["Recall"]
cat("Precision: ", round(precision_value, 3), "\n")
## Precision: 0.693
cat("Recall: ", round(recall_value, 3), "\n")
## Recall: 0.529
# Predict probabilities and classes
prob_test <- predict(model, newdata = test_data, type = "response")
pred_class <- ifelse(prob_test > 0.1, "Dead", "Alive") %>% factor(levels = c("Alive", "Dead"))
# Confusion Matrix
conf_matrix <- confusionMatrix(pred_class, test_data$mortality, positive = "Dead")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Alive Dead
## Alive 157 10
## Dead 1186 883
##
## Accuracy : 0.4651
## 95% CI : (0.4443, 0.486)
## No Information Rate : 0.6006
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0866
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9888
## Specificity : 0.1169
## Pos Pred Value : 0.4268
## Neg Pred Value : 0.9401
## Prevalence : 0.3994
## Detection Rate : 0.3949
## Detection Prevalence : 0.9253
## Balanced Accuracy : 0.5529
##
## 'Positive' Class : Dead
##
cm_df <- as.data.frame(as.table(conf_matrix))
ggplot(cm_df, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(x = "Predicted", y = "Actual", fill = "Frequency") +
  theme_minimal() +
  ggtitle("Confusion Matrix Heatmap")
# Precision and recall for the positive class ("Dead"), taken from the confusion matrix
precision_value <- conf_matrix$byClass["Precision"]
recall_value <- conf_matrix$byClass["Recall"]
cat("Precision: ", round(precision_value, 3), "\n")
## Precision: 0.427
cat("Recall: ", round(recall_value, 3), "\n")
## Recall: 0.989
At the clinically relevant threshold of 0.10 (the ICU admission risk cutoff), the model achieves:

- High sensitivity (recall = 0.99), which is valuable for not missing severe cases.
- Low precision (0.43) and low overall accuracy (0.47), indicating a high false-positive rate and a risk of unnecessary ICU admissions.

In contrast, at the 0.50 threshold the model is more specific (0.84) and more precise (0.69) but misses almost half of the deaths (sensitivity = 0.53), as the threshold sweep sketched below illustrates.
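To make the trade-off between these two cutoffs explicit, the sketch below (an addition to the original analysis) sweeps a small grid of thresholds on the validation set and tabulates sensitivity, specificity, and precision at each one.
# Classification metrics on the validation set across a grid of thresholds
threshold_sweep <- map_dfr(seq(0.1, 0.5, by = 0.1), function(th) {
  pred <- factor(ifelse(prob_test > th, "Dead", "Alive"), levels = c("Alive", "Dead"))
  cm <- confusionMatrix(pred, test_data$mortality, positive = "Dead")
  tibble(
    threshold = th,
    sensitivity = cm$byClass["Sensitivity"],
    specificity = cm$byClass["Specificity"],
    precision = cm$byClass["Precision"]
  )
})
kable(threshold_sweep, digits = 3) %>% kable_styling()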
covid_data <- covid_data %>%
  mutate(pred_prob = predict(model, newdata = covid_data, type = "response"))
dca_result <- dca(mortality ~ pred_prob,
                  data = covid_data,
                  thresholds = seq(0, 0.50, by = 0.01),
                  label = list(pred_prob = "Prediction Model"))
plot(dca_result, smooth = TRUE, main = "Multivariable Decision Curve Analysis")
The decision curve shows that the prediction model yields a higher net benefit than both the “treat all” and “treat none” strategies across the evaluated range of threshold probabilities (0 to 0.50). This indicates that the model is clinically useful for guiding treatment decisions within this range, offering better patient stratification and fewer unnecessary interventions.
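The comparison in the curve can also be checked directly at the 0.10 threshold using the standard net-benefit formula, NB = TP/N - (FP/N) * pt/(1 - pt). The sketch below, added for illustration, computes the net benefit of the model, of treating everyone, and of treating no one on the full dataset.
# Net benefit of the model, "treat all", and "treat none" at pt = 0.10
pt <- 0.10
event <- as.integer(covid_data$mortality == "Dead")
treated <- as.integer(covid_data$pred_prob >= pt)
N <- nrow(covid_data)
nb_model <- sum(treated == 1 & event == 1) / N -
  sum(treated == 1 & event == 0) / N * pt / (1 - pt)
nb_all <- mean(event) - (1 - mean(event)) * pt / (1 - pt)
c(model = nb_model, treat_all = nb_all, treat_none = 0)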
bootstrap_evpi_nb <- function(data, outcome_var = "mortality", pred_probs, pt = 0.1, n_boot = 1000, seed = 42) {
  set.seed(seed)
  N <- nrow(data)
  odds_pt <- pt / (1 - pt)
  # Net benefit at threshold pt: NB = TP/N - (FP/N) * pt/(1 - pt)
  net_benefit <- function(pred, true) {
    TP <- sum(pred == 1 & true == 1)
    FP <- sum(pred == 1 & true == 0)
    (TP / N) - (FP / N) * odds_pt
  }
  # Recode the factor outcome as 0/1 so the comparisons above work ("Dead" = event)
  outcome <- as.integer(data[[outcome_var]] == "Dead")
  NB_all <- net_benefit(rep(1, N), outcome)  # treat everyone
  NB_none <- 0                               # treat no one
  boot_max_nb <- numeric(n_boot)
  nb_model_boot <- numeric(n_boot)
  for (i in 1:n_boot) {
    idx <- sample(1:N, N, replace = TRUE)
    y_boot <- outcome[idx]
    p_boot <- pred_probs[idx]
    pred_boot <- ifelse(p_boot >= pt, 1, 0)
    nb_model <- net_benefit(pred_boot, y_boot)
    nb_all <- net_benefit(rep(1, N), y_boot)
    nb_model_boot[i] <- nb_model
    boot_max_nb[i] <- max(nb_model, nb_all, NB_none)
  }
  # EVPI: expected net benefit under perfect information minus that of the best fixed strategy
  EVPI <- mean(boot_max_nb) - max(mean(nb_model_boot), NB_all, NB_none)
  return(list(
    EVPI = EVPI,
    mean_model_nb = mean(nb_model_boot),
    NB_all = NB_all,
    NB_none = NB_none,
    expected_max_nb = mean(boot_max_nb),
    nb_model_boot = nb_model_boot,
    boot_max_nb = boot_max_nb
  ))
}
pred_probs <- predict(model, newdata = covid_data, type = "response")
result <- bootstrap_evpi_nb(data = covid_data, outcome_var = "mortality", pred_probs = pred_probs, pt = 0.1)
print(result$EVPI)
## [1] 0
# Bootstrap distribution of the value of perfect information relative to using the model
evpi_dist <- result$boot_max_nb - result$nb_model_boot
quantile(evpi_dist, c(0.025, 0.975))
## 2.5% 97.5%
## 0 0
While the model shows good discrimination and performs well overall, it was developed on UK data, which calls for external validation in Dutch hospitals. Differences in admission practices, resource availability, and patient profiles may affect real-world performance.
The model can support early ICU decision-making, but external validation is essential before deployment in the Netherlands to ensure safe and effective use.
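As an illustration of what that external validation step could look like, the sketch below assesses discrimination and calibration of the fitted model on a hypothetical Dutch cohort. `dutch_data` is a placeholder for a future dataset with the same columns as `covid_data`; it does not exist in this analysis.
# Hypothetical external validation on a Dutch cohort (`dutch_data` is a placeholder)
external_validation <- function(fitted_model, dutch_data) {
  lp <- predict(fitted_model, newdata = dutch_data, type = "link")      # linear predictor
  p <- predict(fitted_model, newdata = dutch_data, type = "response")   # predicted risk
  y <- as.integer(dutch_data$mortality == "Dead")
  list(
    auc = as.numeric(auc(roc(y, p, quiet = TRUE))),                                     # discrimination
    calibration_slope = unname(coef(glm(y ~ lp, family = binomial))[2]),                # ideal value: 1
    calibration_in_the_large = unname(coef(glm(y ~ offset(lp), family = binomial))[1])  # ideal value: 0
  )
}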