# Cài package nếu chưa có
#install.packages("aplore3")
#install.packages("tidyverse")
#install.packages("gtsummary")    # Tạo bảng thống kê đẹp
#install.packages("pROC")         # Vẽ ROC, tính AUC
#install.packages("ResourceSelection") # Hosmer-Lemeshow test

# Load library
library(aplore3)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gtsummary)

# Gọi dataset
data("icu")

# Xem cấu trúc
glimpse(icu)
## Rows: 200
## Columns: 21
## $ id     <int> 4, 8, 12, 14, 27, 28, 32, 38, 40, 41, 42, 47, 50, 51, 52, 53, 5…
## $ sta    <fct> Died, Lived, Lived, Lived, Died, Lived, Lived, Lived, Lived, Li…
## $ age    <int> 87, 27, 59, 77, 76, 54, 87, 69, 63, 30, 35, 78, 70, 55, 63, 48,…
## $ gender <fct> Female, Female, Male, Male, Female, Male, Female, Male, Male, F…
## $ race   <fct> White, White, White, White, White, White, White, White, White, …
## $ ser    <fct> Surgical, Medical, Medical, Surgical, Surgical, Medical, Surgic…
## $ can    <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes, No, No, Ye…
## $ crn    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Yes, No…
## $ inf    <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes,…
## $ cpr    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No,…
## $ sys    <int> 80, 142, 112, 100, 128, 142, 110, 110, 104, 144, 108, 130, 138,…
## $ hra    <int> 96, 88, 80, 70, 90, 103, 154, 132, 66, 110, 60, 132, 103, 86, 1…
## $ pre    <fct> No, No, Yes, No, Yes, No, Yes, No, No, No, No, No, No, Yes, Yes…
## $ type   <fct> Emergency, Emergency, Emergency, Elective, Emergency, Emergency…
## $ fra    <fct> Yes, No, No, No, No, Yes, No, No, No, No, No, No, No, No, No, N…
## $ po2    <fct> <= 60, > 60, > 60, > 60, > 60, > 60, > 60, <= 60, > 60, > 60, >…
## $ ph     <fct> < 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >= 7.25, >…
## $ pco    <fct> > 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <= 45, <…
## $ bic    <fct> >= 18, >= 18, >= 18, >= 18, >= 18, >= 18, >= 18, < 18, >= 18, >…
## $ cre    <fct> <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0, <= 2.0,…
## $ loc    <fct> Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, …
# Xem 6 dòng đầu
head(icu)
##   id   sta age gender  race      ser can crn inf cpr sys hra pre      type fra
## 1  4  Died  87 Female White Surgical  No  No Yes  No  80  96  No Emergency Yes
## 2  8 Lived  27 Female White  Medical  No  No Yes  No 142  88  No Emergency  No
## 3 12 Lived  59   Male White  Medical  No  No  No  No 112  80 Yes Emergency  No
## 4 14 Lived  77   Male White Surgical  No  No  No  No 100  70  No  Elective  No
## 5 27  Died  76 Female White Surgical  No  No Yes  No 128  90 Yes Emergency  No
## 6 28 Lived  54   Male White  Medical  No  No Yes  No 142 103  No Emergency Yes
##     po2      ph   pco   bic    cre     loc
## 1 <= 60  < 7.25  > 45 >= 18 <= 2.0 Nothing
## 2  > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 3  > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 4  > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 5  > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
## 6  > 60 >= 7.25 <= 45 >= 18 <= 2.0 Nothing
# Tóm tắt thống kê
summary(icu)
##        id           sta           age           gender       race    
##  Min.   :  4.0   Lived:160   Min.   :16.00   Male  :124   White:175  
##  1st Qu.:210.2   Died : 40   1st Qu.:46.75   Female: 76   Black: 15  
##  Median :412.5               Median :63.00                Other: 10  
##  Mean   :444.8               Mean   :57.55                           
##  3rd Qu.:671.8               3rd Qu.:72.00                           
##  Max.   :929.0               Max.   :92.00                           
##        ser       can       crn       inf       cpr           sys       
##  Medical : 93   No :180   No :181   No :116   No :187   Min.   : 36.0  
##  Surgical:107   Yes: 20   Yes: 19   Yes: 84   Yes: 13   1st Qu.:110.0  
##                                                         Median :130.0  
##                                                         Mean   :132.3  
##                                                         3rd Qu.:150.0  
##                                                         Max.   :256.0  
##       hra          pre             type      fra         po2            ph     
##  Min.   : 39.00   No :170   Elective : 53   No :185   > 60 :184   >= 7.25:187  
##  1st Qu.: 80.00   Yes: 30   Emergency:147   Yes: 15   <= 60: 16   < 7.25 : 13  
##  Median : 96.00                                                                
##  Mean   : 98.92                                                                
##  3rd Qu.:118.25                                                                
##  Max.   :192.00                                                                
##     pco         bic          cre           loc     
##  <= 45:180   >= 18:185   <= 2.0:190   Nothing:185  
##  > 45 : 20   < 18 : 15   > 2.0 : 10   Stupor :  5  
##                                       Coma   : 10  
##                                                    
##                                                    
## 
library(aplore3)
library(tidyverse)
library(gtsummary)

data("icu")

# Xem cấu trúc outcome
table(icu$sta)
## 
## Lived  Died 
##   160    40
# Mặc định: Died = 1, Lived = 0 (cần kiểm tra)

# Chuyển outcome về dạng numeric 0/1
icu <- icu %>%
  mutate(
    outcome = ifelse(sta == "Died", 1, 0),
    
    # Chuẩn hóa một số biến factor
    loc = factor(loc),   # Mức độ ý thức (quan trọng!)
    ser = factor(ser),
    type = factor(type)
  )

# Kiểm tra
table(icu$outcome)  # 0 = Sống, 1 = Tử vong
## 
##   0   1 
## 160  40
#Chạy đơn biến tổng hợp
# Bảng phân tích đơn biến tất cả biến
# Cài các package phụ thuộc còn thiếu
#install.packages("broom.helpers")
#install.packages("broom")

# Sau đó load lại
library(gtsummary)
tbl_univariate <- icu %>%
  select(outcome, age, gender, ser, can, crn, inf,
         cpr, sys, hra, pre, type, fra, loc) %>%
  tbl_uvregression(
    method = glm,
    y = outcome,
    method.args = list(family = binomial),
    exponentiate = TRUE,   # Hiển thị OR thay vì log(OR)
    pvalue_fun = ~style_pvalue(.x, digits = 3)
  ) %>%
  bold_p(t = 0.05) %>%
  bold_labels() %>%
  modify_header(label = "**Biến số**") %>%
  modify_caption("**Phân tích đơn biến — Tiên lượng tử vong ICU**")
## There was a warning running `tbl_regression()` for variable "loc". See message
## below.
## ! glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
##   probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
##   numerically 0 or 1 occurred, glm.fit: fitted probabilities numerically 0 or 1
##   occurred, glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit:
##   fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
##   probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
##   numerically 0 or 1 occurred, glm.fit: fitted probabilities numerically 0 or 1
##   occurred, glm.fit: fitted probabilities numerically 0 or 1 occurred, glm.fit:
##   fitted probabilities numerically 0 or 1 occurred, glm.fit: fitted
##   probabilities numerically 0 or 1 occurred, glm.fit: fitted probabilities
##   numerically 0 or 1 occurred, and glm.fit: fitted probabilities numerically 0
##   or 1 occurred
# Xem bảng
tbl_univariate
Phân tích đơn biến — Tiên lượng tử vong ICU
Biến số N OR 95% CI p-value
age 200 1.03 1.01, 1.05 0.009
gender 200


    Male

    Female
1.11 0.54, 2.24 0.771
ser 200


    Medical

    Surgical
0.39 0.18, 0.79 0.010
can 200


    No

    Yes
1.00 0.27, 2.93 >0.999
crn 200


    No

    Yes
3.39 1.22, 9.06 0.015
inf 200


    No

    Yes
2.50 1.24, 5.16 0.011
cpr 200


    No

    Yes
5.44 1.70, 17.9 0.004
sys 200 0.98 0.97, 0.99 0.005
hra 200 1.00 0.99, 1.02 0.653
pre 200


    No

    Yes
1.26 0.47, 3.07 0.621
type 200


    Elective

    Emergency
8.89 2.58, 56.0 0.003
fra 200


    No

    Yes
1.00 0.22, 3.34 >0.999
loc 200


    Nothing

    Stupor
91,589,445 0.00,
0.986
    Coma
23.4 5.52, 161 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio
#install.packages("BMA")
#install.packages("BAS")   # Bayesian Adaptive Sampling — mạnh hơn BMA package

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
library(BAS)
# Chạy BMA logistic regression
bma_model <- bas.glm(
  outcome ~ age + can + crn + inf + cpr + sys + type + loc,
  data = icu,
  family = binomial(link = "logit"),
  method = "MCMC",           # MCMC sampling cho không gian mô hình lớn
  MCMC.iterations = 100000,
  betaprior = bic.prior(),   # Prior phổ biến trong y văn
  modelprior = uniform()     # Prior đồng đều cho các mô hình
)

# Xem tóm tắt
summary(bma_model)
##               P(B != 0 | Y)   model 1     model 2     model 3     model 4
## Intercept        1.00000000   1.00000   1.0000000   1.0000000   1.0000000
## age              0.91283791   1.00000   1.0000000   1.0000000   1.0000000
## canYes           0.72592519   1.00000   1.0000000   0.0000000   0.0000000
## crnYes           0.11238903   0.00000   0.0000000   0.0000000   0.0000000
## infYes           0.09311721   0.00000   0.0000000   0.0000000   0.0000000
## cprYes           0.13756608   0.00000   0.0000000   0.0000000   0.0000000
## sys              0.62373067   1.00000   0.0000000   1.0000000   0.0000000
## typeEmergency    0.99194015   1.00000   1.0000000   1.0000000   1.0000000
## locStupor        0.99931172   1.00000   1.0000000   1.0000000   1.0000000
## locComa          0.88119701   1.00000   1.0000000   1.0000000   1.0000000
## BF                       NA   1.00000   0.6519453   0.2808813   0.2422451
## PostProbs                NA   0.26580   0.1760000   0.0765000   0.0684000
## R2                       NA   0.35830   0.3276000   0.3192000   0.2912000
## dim                      NA   7.00000   6.0000000   6.0000000   5.0000000
## logmarg                  NA -80.11515 -80.5429478 -81.3849764 -81.5329585
##                   model 5
## Intercept       1.0000000
## age             1.0000000
## canYes          1.0000000
## crnYes          0.0000000
## infYes          0.0000000
## cprYes          0.0000000
## sys             1.0000000
## typeEmergency   1.0000000
## locStupor       1.0000000
## locComa         0.0000000
## BF              0.1295497
## PostProbs       0.0348000
## R2              0.3114000
## dim             6.0000000
## logmarg       -82.1588442
# Inclusion probability — chỉ số quan trọng nhất của BMA
plot(bma_model, which = 4,  # Inclusion probabilities
     ask = FALSE,
     cex.axis = 0.7)

# Xem dạng bảng
bma_model$probne0  # Xác suất posterior mỗi biến ≠ 0
##  [1] 1.00000000 0.91283791 0.72592519 0.11238903 0.09311721 0.13756608
##  [7] 0.62373067 0.99194015 0.99931172 0.88119701
# Coefficient trung bình theo BMA
coef_bma <- coef(bma_model)
print(coef_bma)
## 
##  Marginal Posterior Summaries of Coefficients: 
## 
##  Using  BMA 
## 
##  Based on the top  202 models 
##                post mean   post SD     post p(B != 0)
## Intercept        -5.57521     2.23551     1.00000    
## age               0.03388     0.01619     0.91284    
## canYes            1.80274     1.37544     0.72593    
## crnYes            0.08372     0.32976     0.11239    
## infYes            0.03615     0.18739     0.09312    
## cprYes            0.14204     0.45935     0.13757    
## sys              -0.01140     0.01068     0.62373    
## typeEmergency     3.46968     1.34458     0.99194    
## locStupor        20.89150  1429.40283     0.99931    
## locComa           2.21051     1.16324     0.88120
# Visualize coefficient
plot(coef_bma,
     ask = FALSE)

library(BMA)

# Cần chuẩn bị matrix biến độc lập
X <- icu %>%
  select(age, can, crn, inf, cpr, sys, type, loc) %>%
  mutate(across(where(is.factor), as.numeric)) %>%
  as.data.frame()

bma_model2 <- bic.glm(
  x = X,
  y = icu$outcome,
  glm.family = binomial(link = "logit"),
  strict = FALSE
)

# Xem kết quả
summary(bma_model2)
## 
## Call:
## bic.glm.data.frame(x = X, y = icu$outcome, glm.family = binomial(link = "logit"),     strict = FALSE)
## 
## 
##   12  models were selected
##  Best  5  models (cumulative posterior probability =  0.7917 ): 
## 
##            p!=0    EV         SD        model 1     model 2     model 3   
## Intercept  100    -11.088458  3.080617  -1.326e+01  -9.544e+00  -1.177e+01
## age         92.9    0.031635  0.014670   3.584e-02   3.291e-02   3.508e-02
## can         50.6    0.966395  1.119466   1.925e+00       .       1.945e+00
## crn          5.6    0.024246  0.175746       .           .           .    
## inf          7.0    0.030049  0.158869       .           .           .    
## cpr          5.4    0.029808  0.225111       .           .           .    
## sys         13.3   -0.001267  0.004018       .           .      -9.646e-03
## type       100.0    2.494913  0.909742   2.893e+00   2.188e+00   2.790e+00
## loc        100.0    1.876222  0.534174   1.927e+00   1.834e+00   1.915e+00
##                                                                           
## nVar                                       4           3           5      
## BIC                                     -8.876e+02  -8.875e+02  -8.845e+02
## post prob                                0.317       0.298       0.070    
##            model 4     model 5   
## Intercept  -8.029e+00  -7.144e+00
## age         3.215e-02       .    
## can             .           .    
## crn             .           .    
## inf             .           .    
## cpr             .           .    
## sys        -9.375e-03       .    
## type        2.064e+00   1.926e+00
## loc         1.822e+00   1.894e+00
##                                  
## nVar          4           2      
## BIC        -8.844e+02  -8.836e+02
## post prob   0.063       0.043
print(bma_model2)
## 
## Call:
## bic.glm.data.frame(x = X, y = icu$outcome, glm.family = binomial(link = "logit"),     strict = FALSE)
## 
## 
##  Posterior probabilities(%): 
##   age   can   crn   inf   cpr   sys  type   loc 
##  92.9  50.6   5.6   7.0   5.4  13.3 100.0 100.0 
## 
##  Coefficient posterior expected values: 
## (Intercept)          age          can          crn          inf          cpr  
##  -11.088458     0.031635     0.966395     0.024246     0.030049     0.029808  
##         sys         type          loc  
##   -0.001267     2.494913     1.876222
# Mô hình cuối từ kết quả BMA
model_final <- glm(
  outcome ~ age + can + type + loc,
  data = icu,
  family = binomial(link = "logit")
)

# Trình bày chuẩn y văn
tbl_final <- model_final %>%
  tbl_regression(
    exponentiate = TRUE,
    pvalue_fun = ~style_pvalue(.x, digits = 3)
  ) %>%
  bold_p(t = 0.05) %>%
  bold_labels() %>%
  modify_header(label = "**Biến số**") %>%
  modify_caption("**Mô hình cuối — BMA Selected Variables**")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
tbl_final
Mô hình cuối — BMA Selected Variables
Biến số OR 95% CI p-value
age 1.04 1.01, 1.07 0.003
can


    No
    Yes 11.4 1.97, 92.0 0.009
type


    Elective
    Emergency 48.6 6.64, 1,169 0.002
loc


    Nothing
    Stupor 1,101,550,279 0.00,
0.989
    Coma 15.9 3.57, 112 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ResourceSelection)
## ResourceSelection 0.3-6   2023-06-27
# Predicted probability
icu$pred_prob <- predict(model_final, type = "response")

# 1. AUC - ROC
roc_obj <- roc(icu$outcome, icu$pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8552
# Vẽ ROC
plot(roc_obj,
     print.auc = TRUE,
     col = "steelblue",
     lwd = 2,
     main = "ROC Curve — ICU Mortality Prediction")

# 2. Hosmer-Lemeshow test (Calibration)
hl_test <- hoslem.test(icu$outcome, icu$pred_prob, g = 10)
print(hl_test)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  icu$outcome, icu$pred_prob
## X-squared = 4.568, df = 8, p-value = 0.8026
# p > 0.05 = mô hình calibrate tốt
# Xem coefficients
coef(model_final)
##   (Intercept)           age        canYes typeEmergency     locStupor 
##   -7.68055261    0.03785479    2.43193239    3.88421704   20.81998437 
##       locComa 
##    2.76516818
# Kiểm tra phân phối loc vs outcome
table(icu$loc, icu$outcome)
##          
##             0   1
##   Nothing 158  27
##   Stupor    0   5
##   Coma      2   8
# Kiểm tra số lượng
icu %>%
  count(loc, outcome) %>%
  pivot_wider(names_from = outcome, 
              values_from = n,
              values_fill = 0) %>%
  rename(Survived = `0`, Died = `1`)
## # A tibble: 3 × 3
##   loc     Survived  Died
##   <fct>      <int> <int>
## 1 Nothing      158    27
## 2 Stupor         0     5
## 3 Coma           2     8
# Mô hình cuối bỏ loc
model_final3 <- glm(
  outcome ~ age + can + type,
  data = icu,
  family = binomial(link = "logit")
)

# Xem coefficients
coef(model_final3)
##   (Intercept)           age        canYes typeEmergency 
##   -6.28626263    0.03604018    1.61315724    3.03773520
# Trình bày bảng
model_final3 %>%
  tbl_regression(
    exponentiate = TRUE,
    pvalue_fun = ~style_pvalue(.x, digits = 3)
  ) %>%
  bold_p(t = 0.05) %>%
  bold_labels() %>%
  modify_caption("**Mô hình cuối — age + can + type**")
Mô hình cuối — age + can + type
Characteristic OR 95% CI p-value
age 1.04 1.02, 1.06 0.001
can


    No
    Yes 5.02 1.02, 27.6 0.046
type


    Elective
    Emergency 20.9 4.84, 166 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio
# Cập nhật predicted probability
icu$pred_prob <- predict(model_final3, type = "response")

# Kiểm tra lại AUC
roc_obj <- roc(icu$outcome, icu$pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.7609
coef(model_final3)
##   (Intercept)           age        canYes typeEmergency 
##   -6.28626263    0.03604018    1.61315724    3.03773520
auc(roc_obj)
## Area under the curve: 0.7609
# Coefficients (bỏ intercept)
beta <- coef(model_final3)[-1]

# Lấy beta nhỏ nhất làm đơn vị
beta_min <- min(abs(beta))
# = 0.03604 (age)

# Tính raw score
raw_score <- beta / beta_min
print(round(raw_score, 1))
##           age        canYes typeEmergency 
##           1.0          44.8          84.3
# age     = 0.036 / 0.036 = 1.0  → 1 điểm/năm tuổi
# can     = 1.613 / 0.036 = 44.8 → quá lớn!
# type    = 3.038 / 0.036 = 84.3 → quá lớn!
# Nhóm tuổi phổ biến trong y văn ICU
icu <- icu %>%
  mutate(
    age_group = case_when(
      age < 40  ~ 0,
      age < 50  ~ 1,
      age < 60  ~ 2,
      age < 70  ~ 3,
      age >= 70 ~ 4
    )
  )

# Kiểm tra phân phối
table(icu$age_group, icu$outcome)
##    
##      0  1
##   0 35  2
##   1 19  3
##   2 20  6
##   3 40 11
##   4 46 18
model_score <- glm(
  outcome ~ age_group + can + type,
  data = icu,
  family = binomial(link = "logit")
)

coef(model_score)
##   (Intercept)     age_group        canYes typeEmergency 
##    -5.5810100     0.5183759     1.6546616     3.1562033
beta2     <- coef(model_score)[-1]
beta_min2 <- min(abs(beta2))

raw_score2     <- beta2 / beta_min2
integer_score2 <- round(raw_score2, 0)

print(integer_score2)
##     age_group        canYes typeEmergency 
##             1             3             6
icu <- icu %>%
  mutate(
    pt_age  = age_group * integer_score2["age_group"],
    pt_can  = ifelse(can == "Yes", integer_score2["canYes"], 0),
    pt_type = ifelse(type == "Emergency", integer_score2["typeEmergency"], 0),
    total_score = pt_age + pt_can + pt_type
  )

summary(icu$total_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.000   7.000   7.125   9.000  13.000
hist(icu$total_score,
     breaks = 10,
     col    = "steelblue",
     main   = "Phân phối tổng điểm",
     xlab   = "Tổng điểm")

roc_score <- roc(icu$outcome, icu$total_score)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score,
     print.auc = TRUE,
     col  = "tomato",
     lwd  = 2,
     main = "ROC: Scoring System vs Original Model")

lines(roc_obj, col = "steelblue", lwd = 2, lty = 2)

legend("bottomright",
       legend = c(
         paste("Scoring AUC =", round(auc(roc_score), 3)),
         paste("Model AUC   =", round(auc(roc_obj), 3))
       ),
       col = c("tomato", "steelblue"),
       lwd = 2)

library(pROC)

# ROC và ngưỡng tối ưu
roc_score <- roc(icu$outcome, icu$total_score)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
best_cut <- coords(
  roc_score,
  "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", 
          "specificity", "ppv", "npv")
)
print(best_cut)
##           threshold sensitivity specificity       ppv       npv
## threshold       7.5       0.825         0.6 0.3402062 0.9320388
# Hosmer-Lemeshow cho bảng điểm
hl_score <- hoslem.test(icu$outcome, 
                        fitted(model_score), 
                        g = 10)
## Warning in hoslem.test(icu$outcome, fitted(model_score), g = 10): The data did
## not allow for the requested number of bins.
print(hl_score)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  icu$outcome, fitted(model_score)
## X-squared = 3.1805, df = 7, p-value = 0.8678
# Tỷ lệ tử vong theo từng mức điểm
score_perf <- icu %>%
  group_by(total_score) %>%
  summarise(
    n          = n(),
    n_died     = sum(outcome),
    mortality  = round(mean(outcome) * 100, 1)
  ) %>%
  arrange(total_score)

print(score_perf)
## # A tibble: 13 × 4
##    total_score     n n_died mortality
##          <dbl> <int>  <dbl>     <dbl>
##  1           0     2      0       0  
##  2           2     6      0       0  
##  3           3    15      0       0  
##  4           4    20      1       5  
##  5           5     1      0       0  
##  6           6    36      2       5.6
##  7           7    23      4      17.4
##  8           8    18      5      27.8
##  9           9    34     10      29.4
## 10          10    42     15      35.7
## 11          11     1      1     100  
## 12          12     1      1     100  
## 13          13     1      1     100
# Sau khi có best_cut từ bước 5.7
# Ví dụ ngưỡng = 3 (cập nhật theo kết quả thực tế)

icu <- icu %>%
  mutate(
    risk_group = case_when(
      total_score <= 2 ~ "Thấp",
      total_score <= 5 ~ "Trung bình", 
      total_score >= 6 ~ "Cao"
    ),
    risk_group = factor(risk_group, 
                        levels = c("Thấp", "Trung bình", "Cao"))
  )

# Bảng tỷ lệ tử vong theo nhóm nguy cơ
icu %>%
  group_by(risk_group) %>%
  summarise(
    n         = n(),
    n_died    = sum(outcome),
    mortality = paste0(round(mean(outcome) * 100, 1), "%")
  )
## # A tibble: 3 × 4
##   risk_group     n n_died mortality
##   <fct>      <int>  <dbl> <chr>    
## 1 Thấp           8      0 0%       
## 2 Trung bình    36      1 2.8%     
## 3 Cao          156     39 25%
# Cập nhật ngưỡng theo Youden (≥8 = nguy cơ cao)
icu <- icu %>%
  mutate(
    risk_group2 = case_when(
      total_score < 8  ~ "Thấp — Trung bình",
      total_score >= 8 ~ "Cao"
    ),
    risk_group2 = factor(risk_group2,
                         levels = c("Thấp — Trung bình", "Cao"))
  )

# Bảng tử vong theo nhóm mới
icu %>%
  group_by(risk_group2) %>%
  summarise(
    n         = n(),
    n_died    = sum(outcome),
    mortality = paste0(round(mean(outcome) * 100, 1), "%")
  )
## # A tibble: 2 × 4
##   risk_group2           n n_died mortality
##   <fct>             <int>  <dbl> <chr>    
## 1 Thấp — Trung bình   103      7 6.8%     
## 2 Cao                  97     33 34%
#install.packages("rms")
library(rms)
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
dd <- datadist(icu)
options(datadist = "dd")

model_rms <- lrm(
  outcome ~ age_group + can + type,
  data    = icu,
  x       = TRUE,
  y       = TRUE
)

# Bootstrap 1000 lần
set.seed(123)
val <- validate(model_rms, B = 1000)
print(val)
##           index.orig training    test optimism index.corrected   Lower  Upper
## Dxy           0.5256   0.5380  0.5184   0.0197          0.5060  0.3497 0.6551
## R2            0.2393   0.2596  0.2123   0.0473          0.1920 -0.0133 0.3354
## Intercept     0.0000   0.0000 -0.1882   0.1882         -0.1882 -1.3184 0.7353
## Slope         1.0000   1.0000  0.8177   0.1823          0.8177 -0.3258 1.4806
## Emax          0.0000   0.0000  0.1646  -0.1646          0.1646 -0.0856 0.9655
## D             0.1591   0.1750  0.1395   0.0355          0.1236 -0.0289 0.2333
## U            -0.0100  -0.0100  0.0448  -0.0548          0.0448 -0.0685 0.5983
## Q             0.1691   0.1850  0.0948   0.0903          0.0788 -0.5453 0.2725
## B             0.1357   0.1330  0.1392  -0.0062          0.1420  0.1137 0.1751
## g             1.5092   2.4503  1.3960   1.0543          0.4549 -8.1767 2.4463
## gp            0.1748   0.1767  0.1579   0.0188          0.1560  0.0521 0.2253
##              n
## Dxy       1000
## R2        1000
## Intercept 1000
## Slope     1000
## Emax      1000
## D         1000
## U         1000
## Q         1000
## B         1000
## g         1000
## gp        1000
# Tính AUC hiệu chỉnh
dxy_optimism_corrected <- val["Dxy", "index.corrected"]
auc_corrected <- dxy_optimism_corrected / 2 + 0.5
cat("AUC sau bootstrap correction:", round(auc_corrected, 3), "\n")
## AUC sau bootstrap correction: 0.753
# Calibration plot chuẩn y văn
cal <- calibrate(model_rms, B = 1000)
plot(cal,
     xlab = "Predicted Probability",
     ylab = "Observed Probability",
     main = "Calibration Plot — Bootstrap 1000 lần")

## 
## n=200   Mean absolute error=0.021   Mean squared error=0.00069
## 0.9 Quantile of absolute error=0.046
library(pROC)
library(ggplot2)

# 1. ROC Curve đẹp bằng ggplot2
roc_data <- data.frame(
  fpr = 1 - roc_score$specificities,
  tpr = roc_score$sensitivities
)

ggplot(roc_data, aes(x = fpr, y = tpr)) +
  geom_line(color = "steelblue", linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray50") +
  geom_point(aes(x = 1 - 0.6, y = 0.825),  # Điểm tối ưu
             color = "tomato", size = 3) +
  annotate("text",
           x    = 0.55, y = 0.75,
           label = paste0("AUC = 0.761\nCorrected AUC = 0.753"),
           size  = 4, color = "steelblue") +
  labs(
    title    = "ROC Curve — ICU Mortality Scoring System",
    x        = "1 - Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_minimal(base_size = 13)
## Warning in geom_point(aes(x = 1 - 0.6, y = 0.825), color = "tomato", size = 3): All aesthetics have length 1, but the data has 14 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# 2. Tỷ lệ tử vong theo tổng điểm
score_perf %>%
  ggplot(aes(x = factor(total_score), y = mortality)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = paste0(mortality, "%")),
            vjust = -0.5, size = 3.5) +
  labs(
    title = "Tỷ lệ tử vong theo tổng điểm",
    x     = "Tổng điểm",
    y     = "Tỷ lệ tử vong (%)"
  ) +
  theme_minimal(base_size = 13)

# Bảng điểm hoàn chỉnh
final_score_table <- data.frame(
  "Yếu tố"      = c("Tuổi < 40", "Tuổi 40–49", "Tuổi 50–59",
                     "Tuổi 60–69", "Tuổi ≥ 70",
                     "Ung thư: Không", "Ung thư: Có",
                     "Nhập viện chương trình", "Nhập viện cấp cứu"),
  "Điểm số"     = c(0, 1, 2, 3, 4, 0, 3, 0, 6),
  check.names   = FALSE
)

print(final_score_table)
##                   Yếu tố Điểm số
## 1              Tuổi < 40       0
## 2             Tuổi 40–49       1
## 3             Tuổi 50–59       2
## 4             Tuổi 60–69       3
## 5              Tuổi ≥ 70       4
## 6         Ung thư: Không       0
## 7            Ung thư: Có       3
## 8 Nhập viện chương trình       0
## 9      Nhập viện cấp cứu       6
# Bảng phân tầng nguy cơ
risk_table <- data.frame(
  "Tổng điểm"    = c("0 – 7", "≥ 8"),
  "Nhóm nguy cơ" = c("Thấp – Trung bình", "Cao"),
  "Tỷ lệ tử vong" = c("< 5%", "~25%"),
  check.names    = FALSE
)

print(risk_table)
##   Tổng điểm      Nhóm nguy cơ Tỷ lệ tử vong
## 1     0 – 7 Thấp – Trung bình          < 5%
## 2       ≥ 8               Cao          ~25%