# Cài package nếu chưa có
#install.packages("aplore3")
#install.packages("tidyverse")
#install.packages("gtsummary") # Tạo bảng thống kê đẹp
#install.packages("pROC") # Vẽ ROC, tính AUC
#install.packages("ResourceSelection") # Hosmer-Lemeshow test
# Load library
library(aplore3)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gtsummary)
# Gọi dataset
data("icu")
# Xem cấu trúc
glimpse(icu)
## Rows: 200
## Columns: 21
## $ id <int> 4, 8, 12, 14, 27, 28, 32, 38, 40, 41, 42, 47, 50, 51, 52, 53, 5…
## $ sta <fct> Died, Lived, Lived, Lived, Died, Lived, Lived, Lived, Lived, Li…
## $ age <int> 87, 27, 59, 77, 76, 54, 87, 69, 63, 30, 35, 78, 70, 55, 63, 48,…
## $ gender <fct> Female, Female, Male, Male, Female, Male, Female, Male, Male, F…
## $ race <fct> White, White, White, White, White, White, White, White, White, …
## $ ser <fct> Surgical, Medical, Medical, Surgical, Surgical, Medical, Surgic…
## $ can <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes, No, No, Ye…
## $ crn <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Yes, No…
## $ inf <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes,…
## $ cpr <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No,…
## $ sys <int> 80, 142, 112, 100, 128, 142, 110, 110, 104, 144, 108, 130, 138,…
## $ hra <int> 96, 88, 80, 70, 90, 103, 154, 132, 66, 110, 60, 132, 103, 86, 1…
## $ pre <fct> No, No, Yes, No, Yes, No, Yes, No, No, No, No, No, No, Yes, Yes…
## $ type <fct> Emergency, Emergency, Emergency, Elective, Emergency, Emergency…
## $ fra <fct> Yes, No, No, No, No, Yes, No, No, No, No, No, No, No, No, No, N…
## $ po2 <fct> <= 60, > 60, > 60, > 60, > 60, > 60, > 60, <= 60, > 60, > 60, >…
## $ ph <fct> < 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >…
## $ pco <fct> > 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <…
## $ bic <fct> >= 18, >= 18, >= 18, >= 18, >= 18, >= 18, >= 18, < 18, >= 18, >…
## $ cre <fct> <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0,…
## $ loc <fct> Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, …
# Xem 6 dòng đầu
head(icu)
## id sta age gender race ser can crn inf cpr sys hra pre type fra
## 1 4 Died 87 Female White Surgical No No Yes No 80 96 No Emergency Yes
## 2 8 Lived 27 Female White Medical No No Yes No 142 88 No Emergency No
## 3 12 Lived 59 Male White Medical No No No No 112 80 Yes Emergency No
## 4 14 Lived 77 Male White Surgical No No No No 100 70 No Elective No
## 5 27 Died 76 Female White Surgical No No Yes No 128 90 Yes Emergency No
## 6 28 Lived 54 Male White Medical No No Yes No 142 103 No Emergency Yes
## po2 ph pco bic cre loc
## 1 <= 60 < 7.25 > 45 >= 18 <= 2.0 Nothing
## 2 > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 3 > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 4 > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 5 > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 6 > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
# Tóm tắt thống kê
summary(icu)
## id sta age gender race
## Min. : 4.0 Lived:160 Min. :16.00 Male :124 White:175
## 1st Qu.:210.2 Died : 40 1st Qu.:46.75 Female: 76 Black: 15
## Median :412.5 Median :63.00 Other: 10
## Mean :444.8 Mean :57.55
## 3rd Qu.:671.8 3rd Qu.:72.00
## Max. :929.0 Max. :92.00
## ser can crn inf cpr sys
## Medical : 93 No :180 No :181 No :116 No :187 Min. : 36.0
## Surgical:107 Yes: 20 Yes: 19 Yes: 84 Yes: 13 1st Qu.:110.0
## Median :130.0
## Mean :132.3
## 3rd Qu.:150.0
## Max. :256.0
## hra pre type fra po2 ph
## Min. : 39.00 No :170 Elective : 53 No :185 > 60 :184 >= 7.25:187
## 1st Qu.: 80.00 Yes: 30 Emergency:147 Yes: 15 <= 60: 16 < 7.25 : 13
## Median : 96.00
## Mean : 98.92
## 3rd Qu.:118.25
## Max. :192.00
## pco bic cre loc
## <= 45:180 >= 18:185 <= 2.0:190 Nothing:185
## > 45 : 20 < 18 : 15 > 2.0 : 10 Stupor : 5
## Coma : 10
##
##
##
library(aplore3)
library(tidyverse)
library(gtsummary)
data("icu")
# Xem cấu trúc outcome
table(icu$sta)
##
## Lived Died
## 160 40
# Mặc định: Died = 1, Lived = 0 (cần kiểm tra)
# Chuyển outcome về dạng numeric 0/1
icu <- icu %>%
mutate(
outcome = ifelse(sta == "Died", 1, 0),
# Chuẩn hóa một số biến factor
loc = factor(loc), # Mức độ ý thức (quan trọng!)
ser = factor(ser),
type = factor(type)
)
# Kiểm tra
table(icu$outcome) # 0 = Sống, 1 = Tử vong
##
## 0 1
## 160 40
#Chạy đơn biến tổng hợp
# Bảng phân tích đơn biến tất cả biến
# Cài các package phụ thuộc còn thiếu
#install.packages("broom.helpers")
#install.packages("broom")
# Sau đó load lại
library(gtsummary)
tbl_univariate <- icu %>%
select(outcome, age, gender, ser, can, crn, inf,
cpr, sys, hra, pre, type, fra, loc) %>%
tbl_uvregression(
method = glm,
y = outcome,
method.args = list(family = binomial),
exponentiate = TRUE, # Hiển thị OR thay vì log(OR)
pvalue_fun = ~style_pvalue(.x, digits = 3)
) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
modify_header(label = "**Biến số**") %>%
modify_caption("**Phân tích đơn biến — Tiên lượng tử vong ICU**")
## There was a warning running `tbl_regression()` for variable "loc". See message
## below.
## ! glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
## probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
## numerically 0 or 1 occurred, glm.fit: fitted probabilities numerically 0 or 1
## occurred, glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit:
## fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
## probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
## numerically 0 or 1 occurred, glm.fit: fitted probabilities numerically 0 or 1
## occurred, glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit:
## fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
## probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
## numerically 0 or 1 occurred, and glm.fit: fitted probabilities numerically 0
## or 1 occurred
# Xem bảng
tbl_univariate
Phân tích đơn biến — Tiên lượng tử vong ICU
| Biến số |
N |
OR |
95% CI |
p-value |
| age |
200 |
1.03 |
1.01, 1.05 |
0.009 |
| gender |
200 |
|
|
|
| Male |
|
— |
— |
|
| Female |
|
1.11 |
0.54, 2.24 |
0.771 |
| ser |
200 |
|
|
|
| Medical |
|
— |
— |
|
| Surgical |
|
0.39 |
0.18, 0.79 |
0.010 |
| can |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
1.00 |
0.27, 2.93 |
>0.999 |
| crn |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
3.39 |
1.22, 9.06 |
0.015 |
| inf |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
2.50 |
1.24, 5.16 |
0.011 |
| cpr |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
5.44 |
1.70, 17.9 |
0.004 |
| sys |
200 |
0.98 |
0.97, 0.99 |
0.005 |
| hra |
200 |
1.00 |
0.99, 1.02 |
0.653 |
| pre |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
1.26 |
0.47, 3.07 |
0.621 |
| type |
200 |
|
|
|
| Elective |
|
— |
— |
|
| Emergency |
|
8.89 |
2.58, 56.0 |
0.003 |
| fra |
200 |
|
|
|
| No |
|
— |
— |
|
| Yes |
|
1.00 |
0.22, 3.34 |
>0.999 |
| loc |
200 |
|
|
|
| Nothing |
|
— |
— |
|
| Stupor |
|
91,589,445 |
0.00,
|
0.986 |
| Coma |
|
23.4 |
5.52, 161 |
<0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio |
#install.packages("BMA")
#install.packages("BAS") # Bayesian Adaptive Sampling — mạnh hơn BMA package
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
library(BAS)
# Chạy BMA logistic regression
bma_model <- bas.glm(
outcome ~ age + can + crn + inf + cpr + sys + type + loc,
data = icu,
family = binomial(link = "logit"),
method = "MCMC", # MCMC sampling cho không gian mô hình lớn
MCMC.iterations = 100000,
betaprior = bic.prior(), # Prior phổ biến trong y văn
modelprior = uniform() # Prior đồng đều cho các mô hình
)
# Xem tóm tắt
summary(bma_model)
## P(B != 0 | Y) model 1 model 2 model 3 model 4
## Intercept 1.00000000 1.00000 1.0000000 1.0000000 1.0000000
## age 0.91283791 1.00000 1.0000000 1.0000000 1.0000000
## canYes 0.72592519 1.00000 1.0000000 0.0000000 0.0000000
## crnYes 0.11238903 0.00000 0.0000000 0.0000000 0.0000000
## infYes 0.09311721 0.00000 0.0000000 0.0000000 0.0000000
## cprYes 0.13756608 0.00000 0.0000000 0.0000000 0.0000000
## sys 0.62373067 1.00000 0.0000000 1.0000000 0.0000000
## typeEmergency 0.99194015 1.00000 1.0000000 1.0000000 1.0000000
## locStupor 0.99931172 1.00000 1.0000000 1.0000000 1.0000000
## locComa 0.88119701 1.00000 1.0000000 1.0000000 1.0000000
## BF NA 1.00000 0.6519453 0.2808813 0.2422451
## PostProbs NA 0.26580 0.1760000 0.0765000 0.0684000
## R2 NA 0.35830 0.3276000 0.3192000 0.2912000
## dim NA 7.00000 6.0000000 6.0000000 5.0000000
## logmarg NA -80.11515 -80.5429478 -81.3849764 -81.5329585
## model 5
## Intercept 1.0000000
## age 1.0000000
## canYes 1.0000000
## crnYes 0.0000000
## infYes 0.0000000
## cprYes 0.0000000
## sys 1.0000000
## typeEmergency 1.0000000
## locStupor 1.0000000
## locComa 0.0000000
## BF 0.1295497
## PostProbs 0.0348000
## R2 0.3114000
## dim 6.0000000
## logmarg -82.1588442
# Inclusion probability — chỉ số quan trọng nhất của BMA
plot(bma_model, which = 4, # Inclusion probabilities
ask = FALSE,
cex.axis = 0.7)

# Xem dạng bảng
bma_model$probne0 # Xác suất posterior mỗi biến ≠ 0
## [1] 1.00000000 0.91283791 0.72592519 0.11238903 0.09311721 0.13756608
## [7] 0.62373067 0.99194015 0.99931172 0.88119701
# Coefficient trung bình theo BMA
coef_bma <- coef(bma_model)
print(coef_bma)
##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 202 models
## post mean post SD post p(B != 0)
## Intercept -5.57521 2.23551 1.00000
## age 0.03388 0.01619 0.91284
## canYes 1.80274 1.37544 0.72593
## crnYes 0.08372 0.32976 0.11239
## infYes 0.03615 0.18739 0.09312
## cprYes 0.14204 0.45935 0.13757
## sys -0.01140 0.01068 0.62373
## typeEmergency 3.46968 1.34458 0.99194
## locStupor 20.89150 1429.40283 0.99931
## locComa 2.21051 1.16324 0.88120
# Visualize coefficient
plot(coef_bma,
ask = FALSE)










library(BMA)
# Cần chuẩn bị matrix biến độc lập
X <- icu %>%
select(age, can, crn, inf, cpr, sys, type, loc) %>%
mutate(across(where(is.factor), as.numeric)) %>%
as.data.frame()
bma_model2 <- bic.glm(
x = X,
y = icu$outcome,
glm.family = binomial(link = "logit"),
strict = FALSE
)
# Xem kết quả
summary(bma_model2)
##
## Call:
## bic.glm.data.frame(x = X, y = icu$outcome, glm.family = binomial(link = "logit"), strict = FALSE)
##
##
## 12 models were selected
## Best 5 models (cumulative posterior probability = 0.7917 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 -11.088458 3.080617 -1.326e+01 -9.544e+00 -1.177e+01
## age 92.9 0.031635 0.014670 3.584e-02 3.291e-02 3.508e-02
## can 50.6 0.966395 1.119466 1.925e+00 . 1.945e+00
## crn 5.6 0.024246 0.175746 . . .
## inf 7.0 0.030049 0.158869 . . .
## cpr 5.4 0.029808 0.225111 . . .
## sys 13.3 -0.001267 0.004018 . . -9.646e-03
## type 100.0 2.494913 0.909742 2.893e+00 2.188e+00 2.790e+00
## loc 100.0 1.876222 0.534174 1.927e+00 1.834e+00 1.915e+00
##
## nVar 4 3 5
## BIC -8.876e+02 -8.875e+02 -8.845e+02
## post prob 0.317 0.298 0.070
## model 4 model 5
## Intercept -8.029e+00 -7.144e+00
## age 3.215e-02 .
## can . .
## crn . .
## inf . .
## cpr . .
## sys -9.375e-03 .
## type 2.064e+00 1.926e+00
## loc 1.822e+00 1.894e+00
##
## nVar 4 2
## BIC -8.844e+02 -8.836e+02
## post prob 0.063 0.043
print(bma_model2)
##
## Call:
## bic.glm.data.frame(x = X, y = icu$outcome, glm.family = binomial(link = "logit"), strict = FALSE)
##
##
## Posterior probabilities(%):
## age can crn inf cpr sys type loc
## 92.9 50.6 5.6 7.0 5.4 13.3 100.0 100.0
##
## Coefficient posterior expected values:
## (Intercept) age can crn inf cpr
## -11.088458 0.031635 0.966395 0.024246 0.030049 0.029808
## sys type loc
## -0.001267 2.494913 1.876222
# Mô hình cuối từ kết quả BMA
model_final <- glm(
outcome ~ age + can + type + loc,
data = icu,
family = binomial(link = "logit")
)
# Trình bày chuẩn y văn
tbl_final <- model_final %>%
tbl_regression(
exponentiate = TRUE,
pvalue_fun = ~style_pvalue(.x, digits = 3)
) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
modify_header(label = "**Biến số**") %>%
modify_caption("**Mô hình cuối — BMA Selected Variables**")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
tbl_final
Mô hình cuối — BMA Selected Variables
| Biến số |
OR |
95% CI |
p-value |
| age |
1.04 |
1.01, 1.07 |
0.003 |
| can |
|
|
|
| No |
— |
— |
|
| Yes |
11.4 |
1.97, 92.0 |
0.009 |
| type |
|
|
|
| Elective |
— |
— |
|
| Emergency |
48.6 |
6.64, 1,169 |
0.002 |
| loc |
|
|
|
| Nothing |
— |
— |
|
| Stupor |
1,101,550,279 |
0.00,
|
0.989 |
| Coma |
15.9 |
3.57, 112 |
<0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio |
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ResourceSelection)
## ResourceSelection 0.3-6 2023-06-27
# Predicted probability
icu$pred_prob <- predict(model_final, type = "response")
# 1. AUC - ROC
roc_obj <- roc(icu$outcome, icu$pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8552
# Vẽ ROC
plot(roc_obj,
print.auc = TRUE,
col = "steelblue",
lwd = 2,
main = "ROC Curve — ICU Mortality Prediction")

# 2. Hosmer-Lemeshow test (Calibration)
hl_test <- hoslem.test(icu$outcome, icu$pred_prob, g = 10)
print(hl_test)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: icu$outcome, icu$pred_prob
## X-squared = 4.568, df = 8, p-value = 0.8026
# p > 0.05 = mô hình calibrate tốt
# Xem coefficients
coef(model_final)
## (Intercept) age canYes typeEmergency locStupor
## -7.68055261 0.03785479 2.43193239 3.88421704 20.81998437
## locComa
## 2.76516818
# Kiểm tra phân phối loc vs outcome
table(icu$loc, icu$outcome)
##
## 0 1
## Nothing 158 27
## Stupor 0 5
## Coma 2 8
# Kiểm tra số lượng
icu %>%
count(loc, outcome) %>%
pivot_wider(names_from = outcome,
values_from = n,
values_fill = 0) %>%
rename(Survived = `0`, Died = `1`)
## # A tibble: 3 × 3
## loc Survived Died
## <fct> <int> <int>
## 1 Nothing 158 27
## 2 Stupor 0 5
## 3 Coma 2 8
# Mô hình cuối bỏ loc
model_final3 <- glm(
outcome ~ age + can + type,
data = icu,
family = binomial(link = "logit")
)
# Xem coefficients
coef(model_final3)
## (Intercept) age canYes typeEmergency
## -6.28626263 0.03604018 1.61315724 3.03773520
# Trình bày bảng
model_final3 %>%
tbl_regression(
exponentiate = TRUE,
pvalue_fun = ~style_pvalue(.x, digits = 3)
) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
modify_caption("**Mô hình cuối — age + can + type**")
Mô hình cuối — age + can + type
| Characteristic |
OR |
95% CI |
p-value |
| age |
1.04 |
1.02, 1.06 |
0.001 |
| can |
|
|
|
| No |
— |
— |
|
| Yes |
5.02 |
1.02, 27.6 |
0.046 |
| type |
|
|
|
| Elective |
— |
— |
|
| Emergency |
20.9 |
4.84, 166 |
<0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio |
# Cập nhật predicted probability
icu$pred_prob <- predict(model_final3, type = "response")
# Kiểm tra lại AUC
roc_obj <- roc(icu$outcome, icu$pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.7609
coef(model_final3)
## (Intercept) age canYes typeEmergency
## -6.28626263 0.03604018 1.61315724 3.03773520
auc(roc_obj)
## Area under the curve: 0.7609
# Coefficients (bỏ intercept)
beta <- coef(model_final3)[-1]
# Lấy beta nhỏ nhất làm đơn vị
beta_min <- min(abs(beta))
# = 0.03604 (age)
# Tính raw score
raw_score <- beta / beta_min
print(round(raw_score, 1))
## age canYes typeEmergency
## 1.0 44.8 84.3
# age = 0.036 / 0.036 = 1.0 → 1 điểm/năm tuổi
# can = 1.613 / 0.036 = 44.8 → quá lớn!
# type = 3.038 / 0.036 = 84.3 → quá lớn!
# Nhóm tuổi phổ biến trong y văn ICU
icu <- icu %>%
mutate(
age_group = case_when(
age < 40 ~ 0,
age < 50 ~ 1,
age < 60 ~ 2,
age < 70 ~ 3,
age >= 70 ~ 4
)
)
# Kiểm tra phân phối
table(icu$age_group, icu$outcome)
##
## 0 1
## 0 35 2
## 1 19 3
## 2 20 6
## 3 40 11
## 4 46 18
model_score <- glm(
outcome ~ age_group + can + type,
data = icu,
family = binomial(link = "logit")
)
coef(model_score)
## (Intercept) age_group canYes typeEmergency
## -5.5810100 0.5183759 1.6546616 3.1562033
beta2 <- coef(model_score)[-1]
beta_min2 <- min(abs(beta2))
raw_score2 <- beta2 / beta_min2
integer_score2 <- round(raw_score2, 0)
print(integer_score2)
## age_group canYes typeEmergency
## 1 3 6
icu <- icu %>%
mutate(
pt_age = age_group * integer_score2["age_group"],
pt_can = ifelse(can == "Yes", integer_score2["canYes"], 0),
pt_type = ifelse(type == "Emergency", integer_score2["typeEmergency"], 0),
total_score = pt_age + pt_can + pt_type
)
summary(icu$total_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 7.000 7.125 9.000 13.000
hist(icu$total_score,
breaks = 10,
col = "steelblue",
main = "Phân phối tổng điểm",
xlab = "Tổng điểm")

roc_score <- roc(icu$outcome, icu$total_score)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score,
print.auc = TRUE,
col = "tomato",
lwd = 2,
main = "ROC: Scoring System vs Original Model")
lines(roc_obj, col = "steelblue", lwd = 2, lty = 2)
legend("bottomright",
legend = c(
paste("Scoring AUC =", round(auc(roc_score), 3)),
paste("Model AUC =", round(auc(roc_obj), 3))
),
col = c("tomato", "steelblue"),
lwd = 2)

library(pROC)
# ROC và ngưỡng tối ưu
roc_score <- roc(icu$outcome, icu$total_score)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
best_cut <- coords(
roc_score,
"best",
best.method = "youden",
ret = c("threshold", "sensitivity",
"specificity", "ppv", "npv")
)
print(best_cut)
## threshold sensitivity specificity ppv npv
## threshold 7.5 0.825 0.6 0.3402062 0.9320388
# Hosmer-Lemeshow cho bảng điểm
hl_score <- hoslem.test(icu$outcome,
fitted(model_score),
g = 10)
## Warning in hoslem.test(icu$outcome, fitted(model_score), g = 10): The data did
## not allow for the requested number of bins.
print(hl_score)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: icu$outcome, fitted(model_score)
## X-squared = 3.1805, df = 7, p-value = 0.8678
# Tỷ lệ tử vong theo từng mức điểm
score_perf <- icu %>%
group_by(total_score) %>%
summarise(
n = n(),
n_died = sum(outcome),
mortality = round(mean(outcome) * 100, 1)
) %>%
arrange(total_score)
print(score_perf)
## # A tibble: 13 × 4
## total_score n n_died mortality
## <dbl> <int> <dbl> <dbl>
## 1 0 2 0 0
## 2 2 6 0 0
## 3 3 15 0 0
## 4 4 20 1 5
## 5 5 1 0 0
## 6 6 36 2 5.6
## 7 7 23 4 17.4
## 8 8 18 5 27.8
## 9 9 34 10 29.4
## 10 10 42 15 35.7
## 11 11 1 1 100
## 12 12 1 1 100
## 13 13 1 1 100
# Sau khi có best_cut từ bước 5.7
# Ví dụ ngưỡng = 3 (cập nhật theo kết quả thực tế)
icu <- icu %>%
mutate(
risk_group = case_when(
total_score <= 2 ~ "Thấp",
total_score <= 5 ~ "Trung bình",
total_score >= 6 ~ "Cao"
),
risk_group = factor(risk_group,
levels = c("Thấp", "Trung bình", "Cao"))
)
# Bảng tỷ lệ tử vong theo nhóm nguy cơ
icu %>%
group_by(risk_group) %>%
summarise(
n = n(),
n_died = sum(outcome),
mortality = paste0(round(mean(outcome) * 100, 1), "%")
)
## # A tibble: 3 × 4
## risk_group n n_died mortality
## <fct> <int> <dbl> <chr>
## 1 Thấp 8 0 0%
## 2 Trung bình 36 1 2.8%
## 3 Cao 156 39 25%
# Cập nhật ngưỡng theo Youden (≥8 = nguy cơ cao)
icu <- icu %>%
mutate(
risk_group2 = case_when(
total_score < 8 ~ "Thấp — Trung bình",
total_score >= 8 ~ "Cao"
),
risk_group2 = factor(risk_group2,
levels = c("Thấp — Trung bình", "Cao"))
)
# Bảng tử vong theo nhóm mới
icu %>%
group_by(risk_group2) %>%
summarise(
n = n(),
n_died = sum(outcome),
mortality = paste0(round(mean(outcome) * 100, 1), "%")
)
## # A tibble: 2 × 4
## risk_group2 n n_died mortality
## <fct> <int> <dbl> <chr>
## 1 Thấp — Trung bình 103 7 6.8%
## 2 Cao 97 33 34%
#install.packages("rms")
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
dd <- datadist(icu)
options(datadist = "dd")
model_rms <- lrm(
outcome ~ age_group + can + type,
data = icu,
x = TRUE,
y = TRUE
)
# Bootstrap 1000 lần
set.seed(123)
val <- validate(model_rms, B = 1000)
print(val)
## index.orig training test optimism index.corrected Lower Upper
## Dxy 0.5256 0.5380 0.5184 0.0197 0.5060 0.3497 0.6551
## R2 0.2393 0.2596 0.2123 0.0473 0.1920 -0.0133 0.3354
## Intercept 0.0000 0.0000 -0.1882 0.1882 -0.1882 -1.3184 0.7353
## Slope 1.0000 1.0000 0.8177 0.1823 0.8177 -0.3258 1.4806
## Emax 0.0000 0.0000 0.1646 -0.1646 0.1646 -0.0856 0.9655
## D 0.1591 0.1750 0.1395 0.0355 0.1236 -0.0289 0.2333
## U -0.0100 -0.0100 0.0448 -0.0548 0.0448 -0.0685 0.5983
## Q 0.1691 0.1850 0.0948 0.0903 0.0788 -0.5453 0.2725
## B 0.1357 0.1330 0.1392 -0.0062 0.1420 0.1137 0.1751
## g 1.5092 2.4503 1.3960 1.0543 0.4549 -8.1767 2.4463
## gp 0.1748 0.1767 0.1579 0.0188 0.1560 0.0521 0.2253
## n
## Dxy 1000
## R2 1000
## Intercept 1000
## Slope 1000
## Emax 1000
## D 1000
## U 1000
## Q 1000
## B 1000
## g 1000
## gp 1000
# Tính AUC hiệu chỉnh
dxy_optimism_corrected <- val["Dxy", "index.corrected"]
auc_corrected <- dxy_optimism_corrected / 2 + 0.5
cat("AUC sau bootstrap correction:", round(auc_corrected, 3), "\n")
## AUC sau bootstrap correction: 0.753
# Calibration plot chuẩn y văn
cal <- calibrate(model_rms, B = 1000)
plot(cal,
xlab = "Predicted Probability",
ylab = "Observed Probability",
main = "Calibration Plot — Bootstrap 1000 lần")

##
## n=200 Mean absolute error=0.021 Mean squared error=0.00069
## 0.9 Quantile of absolute error=0.046
library(pROC)
library(ggplot2)
# 1. ROC Curve đẹp bằng ggplot2
roc_data <- data.frame(
fpr = 1 - roc_score$specificities,
tpr = roc_score$sensitivities
)
ggplot(roc_data, aes(x = fpr, y = tpr)) +
geom_line(color = "steelblue", linewidth = 1.2) +
geom_abline(linetype = "dashed", color = "gray50") +
geom_point(aes(x = 1 - 0.6, y = 0.825), # Điểm tối ưu
color = "tomato", size = 3) +
annotate("text",
x = 0.55, y = 0.75,
label = paste0("AUC = 0.761\nCorrected AUC = 0.753"),
size = 4, color = "steelblue") +
labs(
title = "ROC Curve — ICU Mortality Scoring System",
x = "1 - Specificity (False Positive Rate)",
y = "Sensitivity (True Positive Rate)"
) +
theme_minimal(base_size = 13)
## Warning in geom_point(aes(x = 1 - 0.6, y = 0.825), color = "tomato", size = 3): All aesthetics have length 1, but the data has 14 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.

# 2. Tỷ lệ tử vong theo tổng điểm
score_perf %>%
ggplot(aes(x = factor(total_score), y = mortality)) +
geom_col(fill = "steelblue", alpha = 0.8) +
geom_text(aes(label = paste0(mortality, "%")),
vjust = -0.5, size = 3.5) +
labs(
title = "Tỷ lệ tử vong theo tổng điểm",
x = "Tổng điểm",
y = "Tỷ lệ tử vong (%)"
) +
theme_minimal(base_size = 13)

# Bảng điểm hoàn chỉnh
final_score_table <- data.frame(
"Yếu tố" = c("Tuổi < 40", "Tuổi 40–49", "Tuổi 50–59",
"Tuổi 60–69", "Tuổi ≥ 70",
"Ung thư: Không", "Ung thư: Có",
"Nhập viện chương trình", "Nhập viện cấp cứu"),
"Điểm số" = c(0, 1, 2, 3, 4, 0, 3, 0, 6),
check.names = FALSE
)
print(final_score_table)
## Yếu tố Điểm số
## 1 Tuổi < 40 0
## 2 Tuổi 40–49 1
## 3 Tuổi 50–59 2
## 4 Tuổi 60–69 3
## 5 Tuổi ≥ 70 4
## 6 Ung thư: Không 0
## 7 Ung thư: Có 3
## 8 Nhập viện chương trình 0
## 9 Nhập viện cấp cứu 6
# Bảng phân tầng nguy cơ
risk_table <- data.frame(
"Tổng điểm" = c("0 – 7", "≥ 8"),
"Nhóm nguy cơ" = c("Thấp – Trung bình", "Cao"),
"Tỷ lệ tử vong" = c("< 5%", "~25%"),
check.names = FALSE
)
print(risk_table)
## Tổng điểm Nhóm nguy cơ Tỷ lệ tử vong
## 1 0 – 7 Thấp – Trung bình < 5%
## 2 ≥ 8 Cao ~25%