LAR model in sepsis patients in Vietnam Intensive care unit

Nhập dữ liệu vào phần mềm R - Đánh giá phân phối chuẩn hay không chuẩn của các biến liên tục

library(readxl)
datalar <- read_excel("~/Desktop/Developer/datalar.xlsx")
View(datalar)
attach(datalar)
shapiro.test(datalar$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$age
## W = 0.94995, p-value = 9.945e-06
shapiro.test(datalar$gcs)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$gcs
## W = 0.76095, p-value = 2.361e-15
shapiro.test(datalar$mach)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$mach
## W = 0.98003, p-value = 0.015
shapiro.test(datalar$nhietdo)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$nhietdo
## W = 0.94253, p-value = 2.307e-06
shapiro.test(datalar$nhiptho)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$nhiptho
## W = 0.96489, p-value = 0.0002728
shapiro.test(datalar$hatb)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hatb
## W = 0.95999, p-value = 8.662e-05
shapiro.test(datalar$sp02)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$sp02
## W = 0.7037, p-value < 2.2e-16
shapiro.test(datalar$bc)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$bc
## W = 0.94903, p-value = 8.249e-06
shapiro.test(datalar$hct)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hct
## W = 0.98586, p-value = 0.08341
shapiro.test(datalar$tc)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$tc
## W = 0.9027, p-value = 3.67e-09
shapiro.test(datalar$bilitp)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$bilitp
## W = 0.27918, p-value < 2.2e-16
shapiro.test(datalar$na)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$na
## W = 0.95381, p-value = 2.225e-05
shapiro.test(datalar$k)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$k
## W = 0.96519, p-value = 0.0002938
shapiro.test(datalar$ast)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ast
## W = 0.31451, p-value < 2.2e-16
shapiro.test(datalar$alt)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$alt
## W = 0.305, p-value < 2.2e-16
shapiro.test(datalar$ure)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ure
## W = 0.8503, p-value = 6.615e-12
shapiro.test(datalar$crea)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$crea
## W = 0.82095, p-value = 3.627e-13
shapiro.test(datalar$ph)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ph
## W = 0.90407, p-value = 4.443e-09
shapiro.test(datalar$pc02)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$pc02
## W = 0.85577, p-value = 1.186e-11
shapiro.test(datalar$p02)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$p02
## W = 0.71846, p-value < 2.2e-16
shapiro.test(datalar$hc03)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hc03
## W = 0.95487, p-value = 2.791e-05
shapiro.test(datalar$lactate)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$lactate
## W = 0.89633, p-value = 1.537e-09
shapiro.test(datalar$albumin)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$albumin
## W = 0.98597, p-value = 0.08636
shapiro.test(datalar$lar)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$lar
## W = 0.85129, p-value = 7.344e-12
shapiro.test(datalar$sofa)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$sofa
## W = 0.96972, p-value = 0.0009052
shapiro.test(datalar$apacheii)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$apacheii
## W = 0.97322, p-value = 0.002262
shapiro.test(datalar$ngay)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ngay
## W = 0.789, p-value = 2.19e-14
shapiro.test(datalar$pct)
## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$pct
## W = 0.75866, p-value = 3.903e-15

–> Kết quả cho thấy các biến phân phối không chuẩn, chỉ có biến albumin và biến Hct là phân phối chuẩn

Đặc điểm dân số nghiên cứu và phân tích sự khác biệt giữa hai nhóm sống và nhóm tử vong

library(gtsummary)
summary2 <- datalar %>%
  select(sex, age, dtd, tha, btm, dqn, bpm, bthm, bgm, ut, nkp, nktn, nkth,
         nkdmm, khac, gcs, mach, nhietdo, nhiptho, hatb, sp02, thongkhich, locmau, bc,
         hct, tc, bilitp, na, k, ast, alt, ure, crea, ph, pc02, p02, hc03,
         lactate, albumin, lar, pct, caymau, sofa, apacheii, ngay, tuvong) %>%
  tbl_summary(
    by = tuvong,
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{mean} ({sd})", "{median} ({p25}, {p75})", "{min}, {max}"), 
      all_categorical() ~ "{n} / {N} ({p}%)"),
    digits = all_continuous() ~ 2,
    label  = list(
      sex ~ "Giới tính",
      age ~ "Tuổi",
      dtd ~ "Đái tháo đường",
      tha ~ "Tăng huyết áp",
      btm ~ "Bệnh tim mạch",
      dqn ~ "Đột quỵ não",
      bpm ~ "Bệnh phổi mạn",
      bthm ~ "Bệnh thận mạn", 
      bgm ~ "Bệnh gan mạn", 
      ut ~ "Ung thư", 
      nkp ~ "Nhiễm khuẩn hô hấp", 
      nktn ~ "Nhiễm khuẩn tiết niệu", 
      nkth ~ "Nhiễm khuẩn tiêu hoá",
      nkdmm ~ "Nhiễm khuẩn da và mô mềm", 
      khac ~ "Khác", 
      gcs ~ "GCS", 
      mach ~ "Mạch", 
      nhietdo ~ "Nhiệt độ", 
      nhiptho ~ "Nhịp thở", 
      hatb ~ "Huyết áp trung bình", 
      sp02 ~ "Sp02", 
      thongkhich ~ "Thông khí cơ học", 
      locmau ~ "CRRT",
      bc ~ "Bạch cầu",
      hct ~ "Hct", 
      tc ~ "Tiểu cầu",
      bilitp ~ "Bilirubin toàn phần",
      na ~ "Natri", 
      k ~ "Kali", 
      ast ~ "AST", 
      alt ~ "ALT",
      ure ~ "Ure", 
      crea ~ "Creatinine", 
      ph ~ "pH", 
      pc02 ~"pC02", 
      p02 ~ "p02", 
      hc03 ~ "HC03",
      lactate ~ "Lactate máu", 
      albumin ~ "ALbumin máu", 
      lar ~"Lactate/Albumin máu",
      pct ~ "PCT",
      caymau ~ "Cấy máu dương tính", 
      sofa ~ "Thang điểm SOFA", 
      apacheii ~ "Thang điểm APACHE II", 
      ngay ~ "Số ngày nằm khoa hồi sức"),
    missing_text = "(Missing)"
  ) %>% 
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
  add_overall() %>%
  add_n() %>%
  modify_caption("**Đặc điểm dân số nghiên cứu**") %>%
  bold_labels()
summary2
Đặc điểm dân số nghiên cứu
Characteristic N Overall, N = 1701 0, N = 721 1, N = 981 p-value2
Giới tính 170 92 / 170 (54%) 43 / 72 (60%) 49 / 98 (50%) 0.209
Tuổi 170


<0.001
    Mean (SD)
70.64 (17.13) 65.88 (17.01) 74.14 (16.44)
    Median (IQR)
73.00 (62.00, 84.75) 68.50 (55.75, 78.25) 77.50 (66.00, 86.00)
    Range
16.00, 101.00 17.00, 99.00 16.00, 101.00
Đái tháo đường 170 59 / 170 (35%) 23 / 72 (32%) 36 / 98 (37%) 0.517
Tăng huyết áp 170 105 / 170 (62%) 43 / 72 (60%) 62 / 98 (63%) 0.639
Bệnh tim mạch 170 49 / 170 (29%) 24 / 72 (33%) 25 / 98 (26%) 0.266
Đột quỵ não 170 25 / 170 (15%) 7 / 72 (9.7%) 18 / 98 (18%) 0.116
Bệnh phổi mạn 170 8 / 170 (4.7%) 6 / 72 (8.3%) 2 / 98 (2.0%) 0.072
Bệnh thận mạn 170 13 / 170 (7.6%) 3 / 72 (4.2%) 10 / 98 (10%) 0.143
Bệnh gan mạn 170 4 / 170 (2.4%) 2 / 72 (2.8%) 2 / 98 (2.0%) >0.999
Ung thư 170 12 / 170 (7.1%) 5 / 72 (6.9%) 7 / 98 (7.1%) 0.960
Nhiễm khuẩn hô hấp 170 87 / 170 (51%) 26 / 72 (36%) 61 / 98 (62%) <0.001
Nhiễm khuẩn tiết niệu 170 34 / 170 (20%) 11 / 72 (15%) 23 / 98 (23%) 0.187
Nhiễm khuẩn tiêu hoá 170 44 / 170 (26%) 24 / 72 (33%) 20 / 98 (20%) 0.057
Nhiễm khuẩn da và mô mềm 170 16 / 170 (9.4%) 11 / 72 (15%) 5 / 98 (5.1%) 0.025
Khác 170 16 / 170 (9.4%) 9 / 72 (13%) 7 / 98 (7.1%) 0.237
GCS 170


<0.001
    Mean (SD)
11.98 (3.94) 13.26 (3.15) 11.03 (4.20)
    Median (IQR)
14.00 (9.00, 15.00) 15.00 (13.75, 15.00) 12.00 (8.00, 15.00)
    Range
3.00, 15.00 3.00, 15.00 3.00, 15.00
Mạch 170


0.975
    Mean (SD)
108.13 (24.10) 107.50 (23.65) 108.59 (24.53)
    Median (IQR)
105.50 (93.25, 124.00) 106.50 (93.75, 124.00) 105.00 (93.25, 126.00)
    Range
60.00, 170.00 62.00, 170.00 60.00, 166.00
Nhiệt độ 170


0.061
    Mean (SD)
37.53 (1.05) 37.68 (1.02) 37.42 (1.06)
    Median (IQR)
37.15 (36.73, 38.28) 37.65 (37.00, 38.42) 37.00 (36.56, 38.10)
    Range
36.00, 41.30 36.00, 41.30 36.00, 41.10
Nhịp thở 170


0.446
    Mean (SD)
25.81 (7.16) 25.57 (5.64) 25.99 (8.12)
    Median (IQR)
25.00 (20.00, 30.00) 24.00 (20.75, 28.00) 26.00 (20.00, 30.00)
    Range
10.00, 50.00 18.00, 40.00 10.00, 50.00
Huyết áp trung bình 170


0.155
    Mean (SD)
88.26 (26.22) 91.36 (26.56) 85.98 (25.87)
    Median (IQR)
83.00 (70.00, 103.00) 87.00 (72.75, 104.00) 80.00 (70.00, 100.00)
    Range
37.00, 163.00 43.00, 163.00 37.00, 163.00
Sp02 170


0.203
    Mean (SD)
89.28 (10.60) 90.97 (7.27) 88.04 (12.39)
    Median (IQR)
93.00 (86.25, 95.00) 93.00 (89.75, 95.00) 90.00 (85.00, 95.00)
    Range
20.00, 99.00 65.00, 99.00 20.00, 99.00
Thông khí cơ học 170 103 / 170 (61%) 20 / 72 (28%) 83 / 98 (85%) <0.001
CRRT 170 41 / 170 (24%) 11 / 72 (15%) 30 / 98 (31%) 0.021
Bạch cầu 170


0.292
    Mean (SD)
15.95 (9.01) 16.59 (8.69) 15.47 (9.25)
    Median (IQR)
14.43 (9.43, 21.46) 15.12 (10.97, 20.74) 13.96 (8.70, 21.77)
    Range
0.85, 48.30 3.00, 46.03 0.85, 48.30
Hct 170


0.868
    Mean (SD)
33.76 (7.82) 33.92 (6.67) 33.65 (8.59)
    Median (IQR)
34.40 (28.53, 39.20) 35.60 (29.50, 39.00) 33.90 (28.13, 39.90)
    Range
8.20, 57.70 14.50, 45.20 8.20, 57.70
Tiểu cầu 170


0.454
    Mean (SD)
201.91 (127.90) 209.60 (129.89) 196.27 (126.79)
    Median (IQR)
175.50 (119.00, 252.50) 193.50 (135.25, 237.00) 173.00 (104.25, 262.00)
    Range
16.00, 710.00 16.00, 691.00 23.00, 710.00
Bilirubin toàn phần 170


0.327
    Mean (SD)
35.87 (86.23) 44.27 (125.00) 29.71 (37.77)
    Median (IQR)
16.50 (11.35, 31.38) 14.95 (10.43, 27.70) 17.95 (11.79, 31.83)
    Range
3.49, 920.82 4.64, 920.82 3.49, 289.20
Natri 170


0.105
    Mean (SD)
135.60 (8.70) 134.67 (8.22) 136.28 (9.02)
    Median (IQR)
135.20 (131.00, 140.00) 135.00 (130.75, 138.00) 136.00 (132.00, 141.00)
    Range
110.90, 169.00 111.00, 163.00 110.90, 169.00
Kali 170


0.178
    Mean (SD)
3.86 (0.91) 3.75 (0.84) 3.93 (0.95)
    Median (IQR)
3.80 (3.20, 4.38) 3.70 (3.09, 4.22) 3.90 (3.30, 4.40)
    Range
1.80, 7.60 2.30, 5.84 1.80, 7.60
AST 170


0.930
    Mean (SD)
196.83 (523.31) 173.27 (302.42) 214.13 (639.90)
    Median (IQR)
57.50 (33.00, 129.25) 62.00 (32.48, 130.50) 54.55 (34.00, 125.50)
    Range
13.00, 5,903.00 14.00, 1,474.00 13.00, 5,903.00
ALT 170


0.406
    Mean (SD)
123.10 (346.45) 112.96 (300.18) 130.55 (378.19)
    Median (IQR)
38.00 (20.03, 84.00) 40.50 (22.75, 92.00) 36.55 (19.25, 74.75)
    Range
4.00, 3,117.00 9.00, 2,429.00 4.00, 3,117.00
Ure 170


0.569
    Mean (SD)
14.78 (9.99) 14.77 (10.69) 14.78 (9.50)
    Median (IQR)
12.15 (8.03, 17.08) 11.70 (7.33, 18.70) 12.35 (8.45, 16.88)
    Range
1.30, 57.10 1.30, 57.10 2.40, 52.90
Creatinine 170


0.263
    Mean (SD)
197.30 (136.93) 217.73 (158.13) 182.29 (117.59)
    Median (IQR)
154.50 (106.25, 240.50) 163.50 (111.50, 275.25) 154.00 (106.25, 224.25)
    Range
30.00, 778.00 30.00, 778.00 54.00, 701.00
pH 170


0.039
    Mean (SD)
7.38 (0.11) 7.40 (0.10) 7.36 (0.12)
    Median (IQR)
7.40 (7.34, 7.46) 7.42 (7.37, 7.46) 7.38 (7.31, 7.45)
    Range
6.98, 7.61 7.08, 7.61 6.98, 7.53
pC02 170


0.428
    Mean (SD)
30.85 (11.07) 29.99 (10.10) 31.49 (11.73)
    Median (IQR)
28.55 (23.63, 35.18) 28.20 (23.58, 32.73) 29.15 (24.05, 36.50)
    Range
14.60, 81.50 14.60, 70.00 15.80, 81.50
p02 170


0.892
    Mean (SD)
103.02 (59.30) 98.58 (53.51) 106.29 (63.29)
    Median (IQR)
82.10 (67.63, 106.65) 80.40 (71.15, 102.03) 83.50 (66.38, 107.83)
    Range
39.10, 399.10 54.50, 399.10 39.10, 334.50
HC03 170


0.138
    Mean (SD)
17.72 (5.75) 18.29 (5.91) 17.30 (5.62)
    Median (IQR)
17.35 (13.90, 21.02) 18.30 (15.48, 21.28) 16.75 (13.23, 20.48)
    Range
5.00, 46.00 5.00, 46.00 5.00, 36.40
Lactate máu 170


<0.001
    Mean (SD)
6.04 (4.04) 3.85 (2.26) 7.64 (4.31)
    Median (IQR)
4.95 (3.08, 8.03) 3.16 (2.08, 5.49) 7.14 (4.10, 10.09)
    Range
0.77, 22.37 0.77, 10.41 1.60, 22.37
ALbumin máu 170


0.009
    Mean (SD)
2.59 (0.59) 2.71 (0.50) 2.50 (0.64)
    Median (IQR)
2.57 (2.23, 2.94) 2.73 (2.36, 3.06) 2.48 (2.12, 2.81)
    Range
0.82, 4.46 1.49, 4.11 0.82, 4.46
Lactate/Albumin máu 170


<0.001
    Mean (SD)
2.50 (1.88) 1.48 (0.90) 3.26 (2.04)
    Median (IQR)
2.18 (1.06, 3.40) 1.07 (0.80, 2.20) 2.86 (1.74, 4.13)
    Range
0.29, 13.29 0.29, 3.82 0.53, 13.29
PCT 164


0.723
    Mean (SD)
32.28 (37.78) 34.14 (38.99) 31.00 (37.06)
    Median (IQR)
10.65 (2.18, 63.13) 14.90 (2.39, 75.43) 10.08 (2.03, 54.42)
    Range
0.05, 101.56 0.07, 100.00 0.05, 101.56
    (Missing)
6 5 1
Cấy máu dương tính 170 44 / 170 (26%) 17 / 72 (24%) 27 / 98 (28%) 0.562
Thang điểm SOFA 170


<0.001
    Mean (SD)
7.68 (3.51) 6.38 (2.88) 8.64 (3.62)
    Median (IQR)
8.00 (5.00, 10.00) 6.50 (4.00, 9.00) 9.00 (6.00, 11.00)
    Range
2.00, 20.00 2.00, 15.00 2.00, 20.00
Thang điểm APACHE II 170


<0.001
    Mean (SD)
19.19 (6.98) 16.89 (6.20) 20.89 (7.07)
    Median (IQR)
18.00 (14.25, 24.00) 17.00 (12.00, 21.00) 19.00 (15.00, 25.00)
    Range
5.00, 41.00 5.00, 32.00 8.00, 41.00
Số ngày nằm khoa hồi sức 170


0.027
    Mean (SD)
9.64 (10.31) 11.58 (12.26) 8.20 (8.39)
    Median (IQR)
6.00 (2.00, 14.00) 7.00 (3.00, 15.50) 5.00 (2.00, 13.00)
    Range
0.00, 69.00 1.00, 69.00 0.00, 36.00
1 n / N (%)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test

Phân tích chọn biến bằng phương pháp Lasso để đưa vào mô hình hồi quy Logistic đa biến

library(Matrix)
library(glmnet)
## Loaded glmnet 4.1-8
library(carData)
xvars <- model.matrix(tuvong ~ age + dqn + bpm + bthm
                      + nkp + nktn + nkth + nkdmm 
                      + gcs + nhietdo + hatb + na + k + ph
                      + hc03 + lactate + albumin
                      + lar + sofa + apacheii, data=datalar)
yvar <- scale(datalar$tuvong)
ridge <- cv.glmnet(xvars, yvar, alpha=0)
plot(ridge)

coef(ridge)
## 22 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  1.2215324701
## (Intercept)  .           
## age          0.0025531369
## dqn          0.0401059143
## bpm         -0.1226852832
## bthm         0.0705360815
## nkp          0.0859860995
## nktn         0.0383467920
## nkth        -0.0576417384
## nkdmm       -0.0840431301
## gcs         -0.0090827936
## nhietdo     -0.0203671811
## hatb        -0.0004490363
## na           0.0006842493
## k            0.0131111125
## ph          -0.1262646953
## hc03        -0.0006278012
## lactate      0.0194288748
## albumin     -0.0475977234
## lar          0.0404015882
## sofa         0.0126043733
## apacheii     0.0043672999
lasso <- cv.glmnet(xvars, yvar, alpha=1)
plot(lasso)

coef(lasso)
## 22 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -0.590435063
## (Intercept)  .          
## age          0.001689660
## dqn          .          
## bpm          .          
## bthm         .          
## nkp          0.107958742
## nktn         .          
## nkth         .          
## nkdmm        .          
## gcs         -0.006327289
## nhietdo      .          
## hatb         .          
## na           .          
## k            .          
## ph           .          
## hc03         .          
## lactate      0.034973320
## albumin      .          
## lar          0.084448871
## sofa         0.009000562
## apacheii     .

Phân tích hồi quy logistic đa biến

temp <- glm(tuvong ~ age + nkp + lactate
              + lar + sofa + apacheii, family="binomial", data=datalar)
library(nnet)
library(foreign)
library(survival)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
## 
##     select
library(epiDisplay)
logistic.display(temp)
## 
## Logistic regression predicting tuvong 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.03 (1.01,1.05)        1.03 (1.01,1.06)       
##                                                                      
## nkp: 1 vs 0           2.92 (1.55,5.48)        2.96 (1.29,6.79)       
##                                                                      
## lactate (cont. var.)  1.47 (1.28,1.68)        1.01 (0.7,1.45)        
##                                                                      
## lar (cont. var.)      2.72 (1.93,3.85)        2.74 (1.09,6.86)       
##                                                                      
## sofa (cont. var.)     1.23 (1.11,1.37)        1.13 (0.98,1.3)        
##                                                                      
## apacheii (cont. var.) 1.0964 (1.0422,1.1535)  1.0053 (0.9335,1.0826) 
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      0.018          0.016     
##                                                
## nkp: 1 vs 0           0.011          0.009     
##                                                
## lactate (cont. var.)  0.956          0.956     
##                                                
## lar (cont. var.)      0.032          0.017     
##                                                
## sofa (cont. var.)     0.099          0.094     
##                                                
## apacheii (cont. var.) 0.888          0.888     
##                                                
## Log-likelihood = -76.5758
## No. of observations = 170
## AIC value = 167.1515

Nếu albumin ban đầu thấp thì có phải là yếu tố gây nhiễu cho nghiên cứu –> Chia 2 nhóm albumin nhỏ hơn 25 và lớn hơn 25 để phân tích tương quan với LAR

datalar$alb_group <- ifelse(datalar$albumin < 2.5, "low", "normal")
table(datalar$alb_group)
## 
##    low normal 
##     75     95
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
results <- datalar %>%
  group_by(alb_group) %>%
  summarise(
    corr = cor(lar, tuvong, method = "spearman", use = "complete.obs"),
    p_value = cor.test(lar, tuvong, method = "spearman")$p.value,
    n = n()
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `p_value = cor.test(lar, tuvong, method = "spearman")$p.value`.
## ℹ In group 1: `alb_group = "low"`.
## Caused by warning in `cor.test.default()`:
## ! Cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
print(results)
## # A tibble: 2 × 4
##   alb_group  corr      p_value     n
##   <chr>     <dbl>        <dbl> <int>
## 1 low       0.464 0.0000274       75
## 2 normal    0.536 0.0000000218    95
logistic_results <- datalar %>%
  group_by(alb_group) %>%
  group_map(~ {
    model <- glm(tuvong ~ lar, data = .x, family = binomial)
    summary(model)
  })
logistic_results
## [[1]]
## 
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4542     0.6329  -2.298 0.021578 *  
## lar           0.8890     0.2634   3.376 0.000736 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 94.030  on 74  degrees of freedom
## Residual deviance: 74.751  on 73  degrees of freedom
## AIC: 78.751
## 
## Number of Fisher Scoring iterations: 5
## 
## 
## [[2]]
## 
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.0219     0.4879  -4.144 3.41e-05 ***
## lar           1.0475     0.2428   4.314 1.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131.69  on 94  degrees of freedom
## Residual deviance: 101.06  on 93  degrees of freedom
## AIC: 105.06
## 
## Number of Fisher Scoring iterations: 5

Vẽ đường cong AUC và tính các giá trị tiên lượng

plot(x=datalar$lar, y=datalar$tuvong)
m2 = glm(datalar$tuvong ~ datalar$lar, family=binomial)
lines(datalar$lar, m2$fitted.values)

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
larroc <- roc(datalar$tuvong, m2$fitted.values, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

par(pty = "s")
auc(larroc)
## Area under the curve: 0.8105
ci.auc(larroc)
## 95% CI: 0.7473-0.8737 (DeLong)
roc(datalar$tuvong, m2$fitted.values, plot = TRUE,
    main = "AUROC của Lactate/Albumin máu",
    xlab="1-Độ đặc hiệu", ylab="Độ nhạy", 
    col="#b8377e", ldw=4, print.auc=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE,
    print.thres = "best", print.thres.best.method = "youden")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = datalar$tuvong, predictor = m2$fitted.values,     plot = TRUE, main = "AUROC của Lactate/Albumin máu", xlab = "1-Độ đặc hiệu",     ylab = "Độ nhạy", col = "#b8377e", ldw = 4, print.auc = TRUE,     auc.polygon = TRUE, max.auc.polygon = TRUE, print.thres = "best",     print.thres.best.method = "youden")
## 
## Data: m2$fitted.values in 72 controls (datalar$tuvong 0) < 98 cases (datalar$tuvong 1).
## Area under the curve: 0.8105
par(pty = "m")

dataset <- data.frame(datalar$lar, datalar$tuvong, stringsAsFactors = TRUE)
attach(dataset)
dataset
##     datalar.lar datalar.tuvong
## 1          0.98              1
## 2          2.20              1
## 3          0.31              0
## 4          4.36              1
## 5          3.53              1
## 6          1.68              1
## 7          0.95              0
## 8          6.37              1
## 9          3.01              1
## 10         2.69              1
## 11         2.86              1
## 12         3.89              1
## 13         2.79              0
## 14         3.47              1
## 15         2.86              1
## 16         1.70              0
## 17         4.55              1
## 18         5.46              1
## 19         2.41              0
## 20         2.62              1
## 21         0.81              0
## 22         1.28              1
## 23         2.08              0
## 24         5.00              1
## 25         3.79              1
## 26         2.36              0
## 27         4.29              1
## 28         0.53              0
## 29         3.23              1
## 30         1.00              0
## 31         1.05              0
## 32         5.05              1
## 33         4.74              1
## 34         2.60              1
## 35         0.59              0
## 36         3.45              1
## 37         5.07              1
## 38         5.40              1
## 39         0.70              0
## 40         1.45              0
## 41         2.70              1
## 42         0.65              0
## 43         0.77              0
## 44         0.72              0
## 45         0.85              0
## 46         1.54              0
## 47         2.25              1
## 48         3.80              1
## 49         0.62              0
## 50         2.20              0
## 51         0.62              0
## 52         0.78              0
## 53         2.46              0
## 54         1.00              0
## 55         2.86              1
## 56         2.88              0
## 57         1.44              1
## 58         0.56              0
## 59         2.60              0
## 60         0.93              0
## 61         0.90              0
## 62         0.55              1
## 63         2.21              0
## 64         0.87              0
## 65         2.62              0
## 66         1.36              1
## 67         0.50              0
## 68         1.29              1
## 69         2.24              0
## 70         3.88              1
## 71         3.04              1
## 72         1.94              0
## 73         0.94              0
## 74         4.98              1
## 75         0.53              1
## 76         4.01              1
## 77         4.14              1
## 78         3.41              1
## 79         1.44              0
## 80         2.48              0
## 81         1.74              1
## 82         1.52              1
## 83         2.48              0
## 84         0.97              0
## 85         2.99              0
## 86         5.67              1
## 87         1.54              1
## 88         2.75              1
## 89         1.08              1
## 90         2.01              0
## 91         0.82              0
## 92         2.28              1
## 93         1.76              1
## 94         4.62              1
## 95         1.84              1
## 96         7.41              1
## 97         1.76              0
## 98         2.86              1
## 99         3.52              1
## 100        2.27              1
## 101        1.11              1
## 102        2.43              1
## 103        1.75              1
## 104        6.65              1
## 105        4.06              1
## 106        1.30              1
## 107        2.95              1
## 108        1.20              1
## 109        1.00              1
## 110        1.93              0
## 111        1.84              1
## 112        2.17              1
## 113        2.47              1
## 114        2.86              1
## 115        7.07              1
## 116        3.19              1
## 117        2.06              1
## 118        1.75              1
## 119        1.47              1
## 120        0.43              0
## 121        1.74              1
## 122        1.47              1
## 123        0.62              1
## 124        3.69              1
## 125        1.86              1
## 126        4.65              1
## 127        9.20              1
## 128        1.20              1
## 129        0.86              1
## 130        3.96              1
## 131        3.45              0
## 132        1.13              0
## 133        0.29              0
## 134        0.96              1
## 135        1.46              1
## 136        3.36              1
## 137        0.64              0
## 138        1.05              0
## 139        1.11              0
## 140        1.09              0
## 141        0.60              0
## 142        3.15              1
## 143        7.58              1
## 144        3.15              0
## 145        2.81              0
## 146        0.32              0
## 147        4.09              1
## 148       13.29              1
## 149        4.02              1
## 150        6.15              1
## 151        1.52              1
## 152        5.33              1
## 153        0.88              0
## 154        3.82              0
## 155        6.10              1
## 156        0.81              0
## 157        5.17              1
## 158        1.85              0
## 159        3.56              0
## 160        1.00              0
## 161        1.13              0
## 162        0.96              0
## 163        2.32              1
## 164        1.11              0
## 165        2.46              0
## 166        2.18              0
## 167        0.99              0
## 168        0.40              0
## 169        2.60              1
## 170        2.01              0
dataset$datalar.tuvong <- factor(dataset$datalar.tuvong,labels = c("Song", "Tuvong"))
library(OptimalCutpoints)
result <- optimal.cutpoints(X = "datalar.lar", status = "datalar.tuvong", tag.healthy ="Song",
                            methods = "Youden", data = dataset)
summary(result)
## 
## Call:
## optimal.cutpoints.default(X = "datalar.lar", status = "datalar.tuvong", 
##     tag.healthy = "Song", methods = "Youden", data = dataset)
## 
## Area under the ROC curve (AUC):  0.811 (0.747, 0.874) 
## 
## CRITERION: Youden
## Number of optimal cutoffs: 1
## 
##                     Estimate
## cutoff             1.2000000
## Se                 0.9081633
## Sp                 0.5694444
## PPV                0.7416667
## NPV                0.8200000
## DLR.Positive       2.1092824
## DLR.Negative       0.1612743
## FP                31.0000000
## FN                 9.0000000
## Optimal criterion  0.4776077
plot(result)

## Press return for next page....

Dùng phương pháp BMA để tìm ra mô hình phù hợp trên lâm sàng

library(dominanceanalysis)
temp1 <- glm(tuvong ~ age + nkp + lactate
            + lar + sofa + apacheii, family="binomial", data=datalar)
dom <- dominanceAnalysis(temp1)
plot(dom, which.graph ="general",fit.function ="r2.m")

df <- data.frame(datalar$age, datalar$lactate, datalar$lar, datalar$sofa, datalar$apacheii,
                 datalar$nkp, datalar$tuvong)
head(df)
##   datalar.age datalar.lactate datalar.lar datalar.sofa datalar.apacheii
## 1          87             2.1        0.98           10               32
## 2          70             4.5        2.20            4               18
## 3          69             0.9        0.31            9               22
## 4          71            10.9        4.36            2               15
## 5          66             6.1        3.53            6               23
## 6          78             4.7        1.68            6               18
##   datalar.nkp datalar.tuvong
## 1           1              1
## 2           1              1
## 3           0              0
## 4           0              1
## 5           0              1
## 6           1              1
attach(df)
## The following objects are masked from dataset:
## 
##     datalar.lar, datalar.tuvong
names(df)
## [1] "datalar.age"      "datalar.lactate"  "datalar.lar"      "datalar.sofa"    
## [5] "datalar.apacheii" "datalar.nkp"      "datalar.tuvong"
xvars <- df[,1:6]
y <- datalar$tuvong
library(survival)
library(leaps)
library(robustbase)
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
library(inline)
library(rrcov)
## Warning: package 'rrcov' was built under R version 4.4.1
## Scalable Robust Estimators with High Breakdown Point (version 1.7-6)
library(BMA)
bma.search1 <- bic.glm(xvars, y, strict=F, OR=20, glm.family="binomial")
summary(bma.search1)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = y, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   9  models were selected
##  Best  5  models (cumulative posterior probability =  0.8532 ): 
## 
##                   p!=0    EV         SD       model 1     model 2   
## Intercept         100    -4.3408487  1.35289    -4.75641    -2.70978
## datalar.age        68.2   0.0209587  0.01758     0.02999       .    
## datalar.lactate     2.8  -0.0001623  0.02941       .           .    
## datalar.lar       100.0   1.0544229  0.20271     1.08211     1.06073
## datalar.sofa       30.1   0.0383765  0.06910       .           .    
## datalar.apacheii    7.9   0.0029351  0.01401       .           .    
## datalar.nkp        90.5   1.1228499  0.54006     1.22839     1.39740
##                                                                     
## nVar                                               3           2    
## BIC                                           -695.89463  -694.60728
## post prob                                        0.362       0.190  
##                   model 3     model 4     model 5   
## Intercept           -5.46903    -3.39075    -5.40209
## datalar.age          0.03010       .         0.03553
## datalar.lactate        .           .           .    
## datalar.lar          1.03355     1.01078     0.98710
## datalar.sofa         0.12271     0.12103     0.15165
## datalar.apacheii       .           .           .    
## datalar.nkp          1.09660     1.25729       .    
##                                                     
## nVar                   4           3           3    
## BIC               -694.23342  -693.06694  -692.12947
## post prob            0.158       0.088       0.055
imageplot.bma(bma.search1)

Phân tích tính chính xác của mô hình bằng Calibration

library(rms)
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Registered S3 method overwritten by 'rms':
##   method       from      
##   print.lrtest epiDisplay
## 
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
## 
##     lrtest
model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
model2 <- lrm(tuvong ~ lar + nkp, x=TRUE,y=TRUE, data=datalar)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
model1
## Logistic Regression Model
## 
## lrm(formula = tuvong ~ age + lar + nkp, data = datalar, x = TRUE, 
##     y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      75.03      R2       0.480    C       0.855    
##  0             72    d.f.             3      R2(3,170)0.345    Dxy     0.709    
##  1             98    Pr(> chi2) <0.0001    R2(3,124.5)0.439    gamma   0.709    
## max |deriv| 8e-11                            Brier    0.153    tau-a   0.348    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -4.7564 1.0158 -4.68  <0.0001 
## age        0.0300 0.0122  2.47  0.0136  
## lar        1.0821 0.1902  5.69  <0.0001 
## nkp        1.2284 0.4088  3.00  0.0027
model2
## Logistic Regression Model
## 
## lrm(formula = tuvong ~ lar + nkp, data = datalar, x = TRUE, y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      68.61      R2       0.446    C       0.839    
##  0             72    d.f.             2      R2(2,170)0.324    Dxy     0.678    
##  1             98    Pr(> chi2) <0.0001    R2(2,124.5)0.414    gamma   0.679    
## max |deriv| 5e-06                            Brier    0.161    tau-a   0.333    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -2.7098 0.5053 -5.36  <0.0001 
## lar        1.0607 0.1842  5.76  <0.0001 
## nkp        1.3974 0.3981  3.51  0.0004
model3
## Logistic Regression Model
## 
## lrm(formula = tuvong ~ age + lar + nkp + sofa, data = datalar, 
##     x = TRUE, y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      78.50      R2       0.497    C       0.864    
##  0             72    d.f.             4      R2(4,170)0.355    Dxy     0.728    
##  1             98    Pr(> chi2) <0.0001    R2(4,124.5)0.450    gamma   0.728    
## max |deriv| 4e-10                            Brier    0.148    tau-a   0.357    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -5.4690 1.1221 -4.87  <0.0001 
## age        0.0301 0.0123  2.45  0.0142  
## lar        1.0336 0.1940  5.33  <0.0001 
## nkp        1.0966 0.4154  2.64  0.0083  
## sofa       0.1227 0.0671  1.83  0.0673
library(rms)
model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
pred.logit <- predict(model1)
phat <- 1/(1+ exp(-pred.logit))
val.prob(phat, datalar$tuvong, m=10, cex=.5)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  7.090420e-01  8.545210e-01  4.795776e-01  4.354707e-01  7.503001e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA -1.176471e-02 -5.684342e-14  1.000000e+00  4.472354e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  1.531266e-01  2.069542e-14  1.000000e+00  5.176247e-02  4.402857e-02 
##          Eavg           S:z           S:p 
##  2.718828e-02  1.262687e-01  8.995192e-01
library(rms)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
pred.logit1 <- predict(model3)
phat1 <- 1/(1+ exp(-pred.logit1))
val.prob(phat1, datalar$tuvong, m=10, cex=.5)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  7.277494e-01  8.638747e-01  4.970655e-01  4.559094e-01  7.850460e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA -1.176471e-02 -2.842171e-14  1.000000e+00  4.676741e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  1.484784e-01  9.276900e-14  1.000000e+00  1.007307e-01  8.852993e-02 
##          Eavg           S:z           S:p 
##  4.910426e-02  5.790848e-02  9.538215e-01

Vẽ Nomogram cho mô hình tiên lượng bao gồm lactate - tuổi - nhiễm khuẩn hô hấp

set.seed(123)
data <- data.frame(
  age = datalar$age,
  lar = datalar$lar,
  nkp = datalar$nkp,
  tuvong = datalar$tuvong
)
options(datadist = "ddist")
ddist <- datadist(data)
m1 <- lrm(tuvong ~ lar + age + nkp, data = data)
nom <- nomogram(m1, fun = plogis, fun.at = c(0.1, 0.25, 0.5, 0.75, 0.9),
                lp = FALSE, funlabel = "Probability of Death")
plot(nom, xfrac = 0.2,  label.every = list(age = 1))

Phân tích hồi quy rừng ngẫu nhiên để kiểm tra cho mô hình

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
str(datalar)
## tibble [170 × 49] (S3: tbl_df/tbl/data.frame)
##  $ stt       : num [1:170] 1 2 3 4 5 6 7 8 9 10 ...
##  $ sex       : num [1:170] 0 1 1 1 0 1 1 0 0 1 ...
##  $ age       : num [1:170] 87 70 69 71 66 78 73 87 63 67 ...
##  $ ngay      : num [1:170] 18 6 20 12 13 4 8 1 1 11 ...
##  $ tha       : num [1:170] 1 0 1 1 1 1 0 0 0 1 ...
##  $ dtd       : num [1:170] 0 0 1 1 1 0 0 0 0 0 ...
##  $ btm       : num [1:170] 0 0 0 1 0 1 0 0 0 1 ...
##  $ dqn       : num [1:170] 0 0 0 0 0 0 0 1 1 0 ...
##  $ bpm       : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ bthm      : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ bgm       : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ut        : num [1:170] 0 1 0 0 0 0 0 0 1 0 ...
##  $ nkp       : num [1:170] 1 1 0 0 0 1 1 1 1 1 ...
##  $ nktn      : num [1:170] 0 0 0 0 1 0 0 0 1 0 ...
##  $ nkth      : num [1:170] 0 0 0 1 0 1 0 0 0 0 ...
##  $ nkdmm     : num [1:170] 0 0 0 0 0 0 0 0 1 0 ...
##  $ khac      : num [1:170] 0 0 1 0 0 0 0 0 0 0 ...
##  $ gcs       : num [1:170] 8 15 15 14 8 12 15 6 14 15 ...
##  $ hatb      : num [1:170] 60 102 123 163 60 63 87 83 88 88 ...
##  $ mach      : num [1:170] 103 90 62 98 112 130 66 160 135 82 ...
##  $ nhietdo   : num [1:170] 36.8 38.5 37 36 36.5 36.4 38.2 39 37.6 38.1 ...
##  $ nhiptho   : num [1:170] 30 35 20 25 33 32 19 29 30 28 ...
##  $ sp02      : num [1:170] 94 90 82 97 85 85 95 78 85 98 ...
##  $ caymau    : num [1:170] 0 0 0 1 1 0 0 1 0 0 ...
##  $ thongkhich: num [1:170] 1 1 0 1 1 1 0 1 0 1 ...
##  $ locmau    : num [1:170] 1 0 1 1 1 1 0 1 0 1 ...
##  $ albumin   : num [1:170] 2.15 2.05 2.94 2.5 1.73 2.8 2.11 2.01 2.06 3.2 ...
##  $ lar       : num [1:170] 0.98 2.2 0.31 4.36 3.53 1.68 0.95 6.37 3.01 2.69 ...
##  $ lactate   : num [1:170] 2.1 4.5 0.9 10.9 6.1 4.7 2 12.8 6.2 8.6 ...
##  $ pct       : num [1:170] 3.33 2.03 5.02 100 1.56 ...
##  $ tuvong    : num [1:170] 1 1 0 1 1 1 0 1 1 1 ...
##  $ apacheii  : num [1:170] 32 18 22 15 23 18 19 30 15 14 ...
##  $ sofa      : num [1:170] 10 4 9 2 6 6 2 12 4 7 ...
##  $ bc        : num [1:170] 25 14.2 26.4 15.7 16.2 ...
##  $ hct       : num [1:170] 40.1 42.3 37.1 39.1 28.3 39.2 28.9 45.2 23.3 36.7 ...
##  $ tc        : num [1:170] 202 399 162 119 390 293 339 238 135 76 ...
##  $ bilitp    : num [1:170] 12.8 20.1 564 11.1 9.9 10.8 15.8 24.3 14.2 39.3 ...
##  $ na        : num [1:170] 161 121 123 136 119 138 129 152 123 137 ...
##  $ k         : num [1:170] 4.4 4.9 5.6 3.3 3.8 5 3.9 3.3 3.6 4.5 ...
##  $ ast       : num [1:170] 70 56 1270 33 26 30 35 27 47 238 ...
##  $ alt       : num [1:170] 75 44 2429 19 26 ...
##  $ ure       : num [1:170] 44.7 7.7 38.5 7.3 7.6 9.6 7.9 14.8 11.2 14.6 ...
##  $ crea      : num [1:170] 335 123 632 107 102 144 87 114 74 314 ...
##  $ egfr      : num [1:170] 11.98 53.76 8.16 46.6 50.14 ...
##  $ ph        : num [1:170] 7.45 7.31 7.38 7.3 7.47 ...
##  $ pc02      : num [1:170] 32 21 31.4 32.7 27.6 32.8 64.6 19.9 46.2 18.5 ...
##  $ p02       : num [1:170] 89.4 134.1 73.9 84.4 81.8 ...
##  $ hc03      : num [1:170] 21.9 10.4 18.2 15.5 19.6 20.1 21.9 13.3 21.2 10.3 ...
##  $ alb_group : chr [1:170] "low" "low" "normal" "normal" ...
model_rf <- randomForest(tuvong ~ age + lar + nkp, data = datalar, importance = TRUE, ntree = 500)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
print(model_rf)
## 
## Call:
##  randomForest(formula = tuvong ~ age + lar + nkp, data = datalar,      importance = TRUE, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1729648
##                     % Var explained: 29.16
predictions <- predict(model_rf, datalar)
mse <- mean((predictions - datalar$tuvong)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 0.1015607
rsq <- 1 - sum((predictions - datalar$tuvong)^2) / sum((mean(datalar$tuvong) - datalar$tuvong)^2)
cat("R²:", rsq, "\n")
## R²: 0.5840273
# Tính RMSE
rmse <- sqrt(mean((predictions - datalar$tuvong)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.3186858
# Tính MAE
mae <- mean(abs(predictions - datalar$tuvong))
cat("MAE:", mae, "\n")
## MAE: 0.2770989
# Tính R² điều chỉnh
n <- nrow(datalar)
p <- length(coef(model_rf)) - 1  # Số biến trong mô hình (cộng cả intercept)
r2_adj <- 1 - (1 - rsq) * (n - 1) / (n - p - 1)
cat("Adjusted R²:", r2_adj, "\n")
## Adjusted R²: 0.5864742
# Vẽ biểu đồ sai số
residuals <- predictions - datalar$tuvong
plot(residuals, main = "Residuals Plot", xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

# Ví dụ với cross-validation (n-folds)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## The following object is masked from 'package:epiDisplay':
## 
##     alpha
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
train_control <- trainControl(method = "cv", number = 10)  # 10-fold CV
cv_model <- train(tuvong ~ age + lar + nkp, data = datalar, method = "rf", trControl = train_control)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
print(cv_model)
## Random Forest 
## 
## 170 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 153, 153, 153, 153, 153, 153, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##   2     0.4269553  0.3043584  0.3280763
##   3     0.4314351  0.2970112  0.3268998
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Tính MAPE
mape <- mean(abs((predictions - datalar$tuvong) / datalar$tuvong)) * 100
cat("MAPE:", mape, "%\n")
## MAPE: Inf %
# Loại bỏ các giá trị thực tế bằng 0 trước khi tính MAPE
non_zero_data <- datalar[datalar$tuvong != 0, ]
predictions_non_zero <- predictions[datalar$tuvong != 0]
# Tính lại MAPE
mape <- mean(abs((predictions_non_zero - non_zero_data$tuvong) / non_zero_data$tuvong)) * 100
cat("MAPE:", mape, "%\n")
## MAPE: 24.2408 %

Với R² = 57.49% và MSE = 0.1038, mô hình hiện tại đang hoạt động tốt. R² = 0.5749125: Hệ số xác định (R²) cho biết mô hình có thể giải thích khoảng 57.49% biến động của biến kết quả (tuvong). Đây là một mức độ giải thích khá tốt, đặc biệt đối với các mô hình phức tạp như hồi quy rừng ngẫu nhiên. MSE = 0.1037861: MSE cho thấy sai số bình phương trung bình của mô hình là 0.1038. Một giá trị MSE thấp cho thấy mô hình của bạn có độ chính xác cao trong việc dự đoán các giá trị thực tế.