Mô hình tiên lượng LAR

LAR model in sepsis patients in Vietnam Intensive care unit

Nhập dữ liệu vào phần mềm R - Đánh giá phân phối chuẩn hay không chuẩn của các biến liên tục

library(readxl)
datalar <- read_excel("~/Desktop/Developer/datalar.xlsx")
View(datalar)
attach(datalar)
shapiro.test(datalar$age)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$age
## W = 0.94995, p-value = 9.945e-06

shapiro.test(datalar$gcs)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$gcs
## W = 0.76095, p-value = 2.361e-15

shapiro.test(datalar$mach)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$mach
## W = 0.98003, p-value = 0.015

shapiro.test(datalar$nhietdo)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$nhietdo
## W = 0.94253, p-value = 2.307e-06

shapiro.test(datalar$nhiptho)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$nhiptho
## W = 0.96489, p-value = 0.0002728

shapiro.test(datalar$hatb)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hatb
## W = 0.95999, p-value = 8.662e-05

shapiro.test(datalar$sp02)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$sp02
## W = 0.7037, p-value < 2.2e-16

shapiro.test(datalar$bc)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$bc
## W = 0.94903, p-value = 8.249e-06

shapiro.test(datalar$hct)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hct
## W = 0.98586, p-value = 0.08341

shapiro.test(datalar$tc)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$tc
## W = 0.9027, p-value = 3.67e-09

shapiro.test(datalar$bilitp)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$bilitp
## W = 0.27918, p-value < 2.2e-16

shapiro.test(datalar$na)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$na
## W = 0.95381, p-value = 2.225e-05

shapiro.test(datalar$k)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$k
## W = 0.96519, p-value = 0.0002938

shapiro.test(datalar$ast)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ast
## W = 0.31451, p-value < 2.2e-16

shapiro.test(datalar$alt)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$alt
## W = 0.305, p-value < 2.2e-16

shapiro.test(datalar$ure)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ure
## W = 0.8503, p-value = 6.615e-12

shapiro.test(datalar$crea)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$crea
## W = 0.82095, p-value = 3.627e-13

shapiro.test(datalar$ph)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ph
## W = 0.90407, p-value = 4.443e-09

shapiro.test(datalar$pc02)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$pc02
## W = 0.85577, p-value = 1.186e-11

shapiro.test(datalar$p02)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$p02
## W = 0.71846, p-value < 2.2e-16

shapiro.test(datalar$hc03)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$hc03
## W = 0.95487, p-value = 2.791e-05

shapiro.test(datalar$lactate)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$lactate
## W = 0.89633, p-value = 1.537e-09

shapiro.test(datalar$albumin)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$albumin
## W = 0.98597, p-value = 0.08636

shapiro.test(datalar$lar)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$lar
## W = 0.85129, p-value = 7.344e-12

shapiro.test(datalar$sofa)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$sofa
## W = 0.96972, p-value = 0.0009052

shapiro.test(datalar$apacheii)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$apacheii
## W = 0.97322, p-value = 0.002262

shapiro.test(datalar$ngay)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$ngay
## W = 0.789, p-value = 2.19e-14

shapiro.test(datalar$pct)

## 
##  Shapiro-Wilk normality test
## 
## data:  datalar$pct
## W = 0.75866, p-value = 3.903e-15

–> Kết quả cho thấy các biến phân phối không chuẩn, chỉ có biến albumin và biến Hct là phân phối chuẩn

Đặc điểm dân số nghiên cứu và phân tích sự khác biệt giữa hai nhóm sống và nhóm tử vong

library(gtsummary)
summary2 <- datalar %>%
  select(sex, age, dtd, tha, btm, dqn, bpm, bthm, bgm, ut, nkp, nktn, nkth,
         nkdmm, khac, gcs, mach, nhietdo, nhiptho, hatb, sp02, thongkhich, locmau, bc,
         hct, tc, bilitp, na, k, ast, alt, ure, crea, ph, pc02, p02, hc03,
         lactate, albumin, lar, pct, caymau, sofa, apacheii, ngay, tuvong) %>%
  tbl_summary(
    by = tuvong,
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{mean} ({sd})", "{median} ({p25}, {p75})", "{min}, {max}"), 
      all_categorical() ~ "{n} / {N} ({p}%)"),
    digits = all_continuous() ~ 2,
    label  = list(
      sex ~ "Giới tính",
      age ~ "Tuổi",
      dtd ~ "Đái tháo đường",
      tha ~ "Tăng huyết áp",
      btm ~ "Bệnh tim mạch",
      dqn ~ "Đột quỵ não",
      bpm ~ "Bệnh phổi mạn",
      bthm ~ "Bệnh thận mạn", 
      bgm ~ "Bệnh gan mạn", 
      ut ~ "Ung thư", 
      nkp ~ "Nhiễm khuẩn hô hấp", 
      nktn ~ "Nhiễm khuẩn tiết niệu", 
      nkth ~ "Nhiễm khuẩn tiêu hoá",
      nkdmm ~ "Nhiễm khuẩn da và mô mềm", 
      khac ~ "Khác", 
      gcs ~ "GCS", 
      mach ~ "Mạch", 
      nhietdo ~ "Nhiệt độ", 
      nhiptho ~ "Nhịp thở", 
      hatb ~ "Huyết áp trung bình", 
      sp02 ~ "Sp02", 
      thongkhich ~ "Thông khí cơ học", 
      locmau ~ "CRRT",
      bc ~ "Bạch cầu",
      hct ~ "Hct", 
      tc ~ "Tiểu cầu",
      bilitp ~ "Bilirubin toàn phần",
      na ~ "Natri", 
      k ~ "Kali", 
      ast ~ "AST", 
      alt ~ "ALT",
      ure ~ "Ure", 
      crea ~ "Creatinine", 
      ph ~ "pH", 
      pc02 ~"pC02", 
      p02 ~ "p02", 
      hc03 ~ "HC03",
      lactate ~ "Lactate máu", 
      albumin ~ "ALbumin máu", 
      lar ~"Lactate/Albumin máu",
      pct ~ "PCT",
      caymau ~ "Cấy máu dương tính", 
      sofa ~ "Thang điểm SOFA", 
      apacheii ~ "Thang điểm APACHE II", 
      ngay ~ "Số ngày nằm khoa hồi sức"),
    missing_text = "(Missing)"
  ) %>% 
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
  add_overall() %>%
  add_n() %>%
  modify_caption("**Đặc điểm dân số nghiên cứu**") %>%
  bold_labels()
summary2

**Đặc điểm dân số nghiên cứu**
Characteristic	N	Overall, N = 170¹	0, N = 72¹	1, N = 98¹	p-value²
Giới tính	170	92 / 170 (54%)	43 / 72 (60%)	49 / 98 (50%)	0.209
Tuổi	170				<0.001
Mean (SD)		70.64 (17.13)	65.88 (17.01)	74.14 (16.44)
Median (IQR)		73.00 (62.00, 84.75)	68.50 (55.75, 78.25)	77.50 (66.00, 86.00)
Range		16.00, 101.00	17.00, 99.00	16.00, 101.00
Đái tháo đường	170	59 / 170 (35%)	23 / 72 (32%)	36 / 98 (37%)	0.517
Tăng huyết áp	170	105 / 170 (62%)	43 / 72 (60%)	62 / 98 (63%)	0.639
Bệnh tim mạch	170	49 / 170 (29%)	24 / 72 (33%)	25 / 98 (26%)	0.266
Đột quỵ não	170	25 / 170 (15%)	7 / 72 (9.7%)	18 / 98 (18%)	0.116
Bệnh phổi mạn	170	8 / 170 (4.7%)	6 / 72 (8.3%)	2 / 98 (2.0%)	0.072
Bệnh thận mạn	170	13 / 170 (7.6%)	3 / 72 (4.2%)	10 / 98 (10%)	0.143
Bệnh gan mạn	170	4 / 170 (2.4%)	2 / 72 (2.8%)	2 / 98 (2.0%)	>0.999
Ung thư	170	12 / 170 (7.1%)	5 / 72 (6.9%)	7 / 98 (7.1%)	0.960
Nhiễm khuẩn hô hấp	170	87 / 170 (51%)	26 / 72 (36%)	61 / 98 (62%)	<0.001
Nhiễm khuẩn tiết niệu	170	34 / 170 (20%)	11 / 72 (15%)	23 / 98 (23%)	0.187
Nhiễm khuẩn tiêu hoá	170	44 / 170 (26%)	24 / 72 (33%)	20 / 98 (20%)	0.057
Nhiễm khuẩn da và mô mềm	170	16 / 170 (9.4%)	11 / 72 (15%)	5 / 98 (5.1%)	0.025
Khác	170	16 / 170 (9.4%)	9 / 72 (13%)	7 / 98 (7.1%)	0.237
GCS	170				<0.001
Mean (SD)		11.98 (3.94)	13.26 (3.15)	11.03 (4.20)
Median (IQR)		14.00 (9.00, 15.00)	15.00 (13.75, 15.00)	12.00 (8.00, 15.00)
Range		3.00, 15.00	3.00, 15.00	3.00, 15.00
Mạch	170				0.975
Mean (SD)		108.13 (24.10)	107.50 (23.65)	108.59 (24.53)
Median (IQR)		105.50 (93.25, 124.00)	106.50 (93.75, 124.00)	105.00 (93.25, 126.00)
Range		60.00, 170.00	62.00, 170.00	60.00, 166.00
Nhiệt độ	170				0.061
Mean (SD)		37.53 (1.05)	37.68 (1.02)	37.42 (1.06)
Median (IQR)		37.15 (36.73, 38.28)	37.65 (37.00, 38.42)	37.00 (36.56, 38.10)
Range		36.00, 41.30	36.00, 41.30	36.00, 41.10
Nhịp thở	170				0.446
Mean (SD)		25.81 (7.16)	25.57 (5.64)	25.99 (8.12)
Median (IQR)		25.00 (20.00, 30.00)	24.00 (20.75, 28.00)	26.00 (20.00, 30.00)
Range		10.00, 50.00	18.00, 40.00	10.00, 50.00
Huyết áp trung bình	170				0.155
Mean (SD)		88.26 (26.22)	91.36 (26.56)	85.98 (25.87)
Median (IQR)		83.00 (70.00, 103.00)	87.00 (72.75, 104.00)	80.00 (70.00, 100.00)
Range		37.00, 163.00	43.00, 163.00	37.00, 163.00
Sp02	170				0.203
Mean (SD)		89.28 (10.60)	90.97 (7.27)	88.04 (12.39)
Median (IQR)		93.00 (86.25, 95.00)	93.00 (89.75, 95.00)	90.00 (85.00, 95.00)
Range		20.00, 99.00	65.00, 99.00	20.00, 99.00
Thông khí cơ học	170	103 / 170 (61%)	20 / 72 (28%)	83 / 98 (85%)	<0.001
CRRT	170	41 / 170 (24%)	11 / 72 (15%)	30 / 98 (31%)	0.021
Bạch cầu	170				0.292
Mean (SD)		15.95 (9.01)	16.59 (8.69)	15.47 (9.25)
Median (IQR)		14.43 (9.43, 21.46)	15.12 (10.97, 20.74)	13.96 (8.70, 21.77)
Range		0.85, 48.30	3.00, 46.03	0.85, 48.30
Hct	170				0.868
Mean (SD)		33.76 (7.82)	33.92 (6.67)	33.65 (8.59)
Median (IQR)		34.40 (28.53, 39.20)	35.60 (29.50, 39.00)	33.90 (28.13, 39.90)
Range		8.20, 57.70	14.50, 45.20	8.20, 57.70
Tiểu cầu	170				0.454
Mean (SD)		201.91 (127.90)	209.60 (129.89)	196.27 (126.79)
Median (IQR)		175.50 (119.00, 252.50)	193.50 (135.25, 237.00)	173.00 (104.25, 262.00)
Range		16.00, 710.00	16.00, 691.00	23.00, 710.00
Bilirubin toàn phần	170				0.327
Mean (SD)		35.87 (86.23)	44.27 (125.00)	29.71 (37.77)
Median (IQR)		16.50 (11.35, 31.38)	14.95 (10.43, 27.70)	17.95 (11.79, 31.83)
Range		3.49, 920.82	4.64, 920.82	3.49, 289.20
Natri	170				0.105
Mean (SD)		135.60 (8.70)	134.67 (8.22)	136.28 (9.02)
Median (IQR)		135.20 (131.00, 140.00)	135.00 (130.75, 138.00)	136.00 (132.00, 141.00)
Range		110.90, 169.00	111.00, 163.00	110.90, 169.00
Kali	170				0.178
Mean (SD)		3.86 (0.91)	3.75 (0.84)	3.93 (0.95)
Median (IQR)		3.80 (3.20, 4.38)	3.70 (3.09, 4.22)	3.90 (3.30, 4.40)
Range		1.80, 7.60	2.30, 5.84	1.80, 7.60
AST	170				0.930
Mean (SD)		196.83 (523.31)	173.27 (302.42)	214.13 (639.90)
Median (IQR)		57.50 (33.00, 129.25)	62.00 (32.48, 130.50)	54.55 (34.00, 125.50)
Range		13.00, 5,903.00	14.00, 1,474.00	13.00, 5,903.00
ALT	170				0.406
Mean (SD)		123.10 (346.45)	112.96 (300.18)	130.55 (378.19)
Median (IQR)		38.00 (20.03, 84.00)	40.50 (22.75, 92.00)	36.55 (19.25, 74.75)
Range		4.00, 3,117.00	9.00, 2,429.00	4.00, 3,117.00
Ure	170				0.569
Mean (SD)		14.78 (9.99)	14.77 (10.69)	14.78 (9.50)
Median (IQR)		12.15 (8.03, 17.08)	11.70 (7.33, 18.70)	12.35 (8.45, 16.88)
Range		1.30, 57.10	1.30, 57.10	2.40, 52.90
Creatinine	170				0.263
Mean (SD)		197.30 (136.93)	217.73 (158.13)	182.29 (117.59)
Median (IQR)		154.50 (106.25, 240.50)	163.50 (111.50, 275.25)	154.00 (106.25, 224.25)
Range		30.00, 778.00	30.00, 778.00	54.00, 701.00
pH	170				0.039
Mean (SD)		7.38 (0.11)	7.40 (0.10)	7.36 (0.12)
Median (IQR)		7.40 (7.34, 7.46)	7.42 (7.37, 7.46)	7.38 (7.31, 7.45)
Range		6.98, 7.61	7.08, 7.61	6.98, 7.53
pC02	170				0.428
Mean (SD)		30.85 (11.07)	29.99 (10.10)	31.49 (11.73)
Median (IQR)		28.55 (23.63, 35.18)	28.20 (23.58, 32.73)	29.15 (24.05, 36.50)
Range		14.60, 81.50	14.60, 70.00	15.80, 81.50
p02	170				0.892
Mean (SD)		103.02 (59.30)	98.58 (53.51)	106.29 (63.29)
Median (IQR)		82.10 (67.63, 106.65)	80.40 (71.15, 102.03)	83.50 (66.38, 107.83)
Range		39.10, 399.10	54.50, 399.10	39.10, 334.50
HC03	170				0.138
Mean (SD)		17.72 (5.75)	18.29 (5.91)	17.30 (5.62)
Median (IQR)		17.35 (13.90, 21.02)	18.30 (15.48, 21.28)	16.75 (13.23, 20.48)
Range		5.00, 46.00	5.00, 46.00	5.00, 36.40
Lactate máu	170				<0.001
Mean (SD)		6.04 (4.04)	3.85 (2.26)	7.64 (4.31)
Median (IQR)		4.95 (3.08, 8.03)	3.16 (2.08, 5.49)	7.14 (4.10, 10.09)
Range		0.77, 22.37	0.77, 10.41	1.60, 22.37
ALbumin máu	170				0.009
Mean (SD)		2.59 (0.59)	2.71 (0.50)	2.50 (0.64)
Median (IQR)		2.57 (2.23, 2.94)	2.73 (2.36, 3.06)	2.48 (2.12, 2.81)
Range		0.82, 4.46	1.49, 4.11	0.82, 4.46
Lactate/Albumin máu	170				<0.001
Mean (SD)		2.50 (1.88)	1.48 (0.90)	3.26 (2.04)
Median (IQR)		2.18 (1.06, 3.40)	1.07 (0.80, 2.20)	2.86 (1.74, 4.13)
Range		0.29, 13.29	0.29, 3.82	0.53, 13.29
PCT	164				0.723
Mean (SD)		32.28 (37.78)	34.14 (38.99)	31.00 (37.06)
Median (IQR)		10.65 (2.18, 63.13)	14.90 (2.39, 75.43)	10.08 (2.03, 54.42)
Range		0.05, 101.56	0.07, 100.00	0.05, 101.56
(Missing)		6	5	1
Cấy máu dương tính	170	44 / 170 (26%)	17 / 72 (24%)	27 / 98 (28%)	0.562
Thang điểm SOFA	170				<0.001
Mean (SD)		7.68 (3.51)	6.38 (2.88)	8.64 (3.62)
Median (IQR)		8.00 (5.00, 10.00)	6.50 (4.00, 9.00)	9.00 (6.00, 11.00)
Range		2.00, 20.00	2.00, 15.00	2.00, 20.00
Thang điểm APACHE II	170				<0.001
Mean (SD)		19.19 (6.98)	16.89 (6.20)	20.89 (7.07)
Median (IQR)		18.00 (14.25, 24.00)	17.00 (12.00, 21.00)	19.00 (15.00, 25.00)
Range		5.00, 41.00	5.00, 32.00	8.00, 41.00
Số ngày nằm khoa hồi sức	170				0.027
Mean (SD)		9.64 (10.31)	11.58 (12.26)	8.20 (8.39)
Median (IQR)		6.00 (2.00, 14.00)	7.00 (3.00, 15.50)	5.00 (2.00, 13.00)
Range		0.00, 69.00	1.00, 69.00	0.00, 36.00
¹ n / N (%)
² Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test

Phân tích chọn biến bằng phương pháp Lasso để đưa vào mô hình hồi quy Logistic đa biến

library(Matrix)
library(glmnet)

## Loaded glmnet 4.1-8

library(carData)
xvars <- model.matrix(tuvong ~ age + dqn + bpm + bthm
                      + nkp + nktn + nkth + nkdmm 
                      + gcs + nhietdo + hatb + na + k + ph
                      + hc03 + lactate + albumin
                      + lar + sofa + apacheii, data=datalar)
yvar <- scale(datalar$tuvong)
ridge <- cv.glmnet(xvars, yvar, alpha=0)
plot(ridge)

coef(ridge)

## 22 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  1.2215324701
## (Intercept)  .           
## age          0.0025531369
## dqn          0.0401059143
## bpm         -0.1226852832
## bthm         0.0705360815
## nkp          0.0859860995
## nktn         0.0383467920
## nkth        -0.0576417384
## nkdmm       -0.0840431301
## gcs         -0.0090827936
## nhietdo     -0.0203671811
## hatb        -0.0004490363
## na           0.0006842493
## k            0.0131111125
## ph          -0.1262646953
## hc03        -0.0006278012
## lactate      0.0194288748
## albumin     -0.0475977234
## lar          0.0404015882
## sofa         0.0126043733
## apacheii     0.0043672999

lasso <- cv.glmnet(xvars, yvar, alpha=1)
plot(lasso)

coef(lasso)

## 22 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -0.590435063
## (Intercept)  .          
## age          0.001689660
## dqn          .          
## bpm          .          
## bthm         .          
## nkp          0.107958742
## nktn         .          
## nkth         .          
## nkdmm        .          
## gcs         -0.006327289
## nhietdo      .          
## hatb         .          
## na           .          
## k            .          
## ph           .          
## hc03         .          
## lactate      0.034973320
## albumin      .          
## lar          0.084448871
## sofa         0.009000562
## apacheii     .

Phân tích hồi quy logistic đa biến

temp <- glm(tuvong ~ age + nkp + lactate
              + lar + sofa + apacheii, family="binomial", data=datalar)
library(nnet)
library(foreign)
library(survival)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:gtsummary':
## 
##     select

library(epiDisplay)
logistic.display(temp)

## 
## Logistic regression predicting tuvong 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.03 (1.01,1.05)        1.03 (1.01,1.06)       
##                                                                      
## nkp: 1 vs 0           2.92 (1.55,5.48)        2.96 (1.29,6.79)       
##                                                                      
## lactate (cont. var.)  1.47 (1.28,1.68)        1.01 (0.7,1.45)        
##                                                                      
## lar (cont. var.)      2.72 (1.93,3.85)        2.74 (1.09,6.86)       
##                                                                      
## sofa (cont. var.)     1.23 (1.11,1.37)        1.13 (0.98,1.3)        
##                                                                      
## apacheii (cont. var.) 1.0964 (1.0422,1.1535)  1.0053 (0.9335,1.0826) 
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      0.018          0.016     
##                                                
## nkp: 1 vs 0           0.011          0.009     
##                                                
## lactate (cont. var.)  0.956          0.956     
##                                                
## lar (cont. var.)      0.032          0.017     
##                                                
## sofa (cont. var.)     0.099          0.094     
##                                                
## apacheii (cont. var.) 0.888          0.888     
##                                                
## Log-likelihood = -76.5758
## No. of observations = 170
## AIC value = 167.1515

Nếu albumin ban đầu thấp thì có phải là yếu tố gây nhiễu cho nghiên cứu –> Chia 2 nhóm albumin nhỏ hơn 25 và lớn hơn 25 để phân tích tương quan với LAR

datalar$alb_group <- ifelse(datalar$albumin < 2.5, "low", "normal")
table(datalar$alb_group)

## 
##    low normal 
##     75     95

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

results <- datalar %>%
  group_by(alb_group) %>%
  summarise(
    corr = cor(lar, tuvong, method = "spearman", use = "complete.obs"),
    p_value = cor.test(lar, tuvong, method = "spearman")$p.value,
    n = n()
  )

## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `p_value = cor.test(lar, tuvong, method = "spearman")$p.value`.
## ℹ In group 1: `alb_group = "low"`.
## Caused by warning in `cor.test.default()`:
## ! Cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

print(results)

## # A tibble: 2 × 4
##   alb_group  corr      p_value     n
##   <chr>     <dbl>        <dbl> <int>
## 1 low       0.464 0.0000274       75
## 2 normal    0.536 0.0000000218    95

logistic_results <- datalar %>%
  group_by(alb_group) %>%
  group_map(~ {
    model <- glm(tuvong ~ lar, data = .x, family = binomial)
    summary(model)
  })
logistic_results

## [[1]]
## 
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4542     0.6329  -2.298 0.021578 *  
## lar           0.8890     0.2634   3.376 0.000736 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 94.030  on 74  degrees of freedom
## Residual deviance: 74.751  on 73  degrees of freedom
## AIC: 78.751
## 
## Number of Fisher Scoring iterations: 5
## 
## 
## [[2]]
## 
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.0219     0.4879  -4.144 3.41e-05 ***
## lar           1.0475     0.2428   4.314 1.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131.69  on 94  degrees of freedom
## Residual deviance: 101.06  on 93  degrees of freedom
## AIC: 105.06
## 
## Number of Fisher Scoring iterations: 5

Vẽ đường cong AUC và tính các giá trị tiên lượng

plot(x=datalar$lar, y=datalar$tuvong)
m2 = glm(datalar$tuvong ~ datalar$lar, family=binomial)
lines(datalar$lar, m2$fitted.values)

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:epiDisplay':
## 
##     ci

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

larroc <- roc(datalar$tuvong, m2$fitted.values, plot = TRUE)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

par(pty = "s")
auc(larroc)

## Area under the curve: 0.8105

ci.auc(larroc)

## 95% CI: 0.7473-0.8737 (DeLong)

roc(datalar$tuvong, m2$fitted.values, plot = TRUE,
    main = "AUROC của Lactate/Albumin máu",
    xlab="1-Độ đặc hiệu", ylab="Độ nhạy", 
    col="#b8377e", ldw=4, print.auc=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE,
    print.thres = "best", print.thres.best.method = "youden")

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = datalar$tuvong, predictor = m2$fitted.values,     plot = TRUE, main = "AUROC của Lactate/Albumin máu", xlab = "1-Độ đặc hiệu",     ylab = "Độ nhạy", col = "#b8377e", ldw = 4, print.auc = TRUE,     auc.polygon = TRUE, max.auc.polygon = TRUE, print.thres = "best",     print.thres.best.method = "youden")
## 
## Data: m2$fitted.values in 72 controls (datalar$tuvong 0) < 98 cases (datalar$tuvong 1).
## Area under the curve: 0.8105

par(pty = "m")

dataset <- data.frame(datalar$lar, datalar$tuvong, stringsAsFactors = TRUE)
attach(dataset)
dataset

##     datalar.lar datalar.tuvong
## 1          0.98              1
## 2          2.20              1
## 3          0.31              0
## 4          4.36              1
## 5          3.53              1
## 6          1.68              1
## 7          0.95              0
## 8          6.37              1
## 9          3.01              1
## 10         2.69              1
## 11         2.86              1
## 12         3.89              1
## 13         2.79              0
## 14         3.47              1
## 15         2.86              1
## 16         1.70              0
## 17         4.55              1
## 18         5.46              1
## 19         2.41              0
## 20         2.62              1
## 21         0.81              0
## 22         1.28              1
## 23         2.08              0
## 24         5.00              1
## 25         3.79              1
## 26         2.36              0
## 27         4.29              1
## 28         0.53              0
## 29         3.23              1
## 30         1.00              0
## 31         1.05              0
## 32         5.05              1
## 33         4.74              1
## 34         2.60              1
## 35         0.59              0
## 36         3.45              1
## 37         5.07              1
## 38         5.40              1
## 39         0.70              0
## 40         1.45              0
## 41         2.70              1
## 42         0.65              0
## 43         0.77              0
## 44         0.72              0
## 45         0.85              0
## 46         1.54              0
## 47         2.25              1
## 48         3.80              1
## 49         0.62              0
## 50         2.20              0
## 51         0.62              0
## 52         0.78              0
## 53         2.46              0
## 54         1.00              0
## 55         2.86              1
## 56         2.88              0
## 57         1.44              1
## 58         0.56              0
## 59         2.60              0
## 60         0.93              0
## 61         0.90              0
## 62         0.55              1
## 63         2.21              0
## 64         0.87              0
## 65         2.62              0
## 66         1.36              1
## 67         0.50              0
## 68         1.29              1
## 69         2.24              0
## 70         3.88              1
## 71         3.04              1
## 72         1.94              0
## 73         0.94              0
## 74         4.98              1
## 75         0.53              1
## 76         4.01              1
## 77         4.14              1
## 78         3.41              1
## 79         1.44              0
## 80         2.48              0
## 81         1.74              1
## 82         1.52              1
## 83         2.48              0
## 84         0.97              0
## 85         2.99              0
## 86         5.67              1
## 87         1.54              1
## 88         2.75              1
## 89         1.08              1
## 90         2.01              0
## 91         0.82              0
## 92         2.28              1
## 93         1.76              1
## 94         4.62              1
## 95         1.84              1
## 96         7.41              1
## 97         1.76              0
## 98         2.86              1
## 99         3.52              1
## 100        2.27              1
## 101        1.11              1
## 102        2.43              1
## 103        1.75              1
## 104        6.65              1
## 105        4.06              1
## 106        1.30              1
## 107        2.95              1
## 108        1.20              1
## 109        1.00              1
## 110        1.93              0
## 111        1.84              1
## 112        2.17              1
## 113        2.47              1
## 114        2.86              1
## 115        7.07              1
## 116        3.19              1
## 117        2.06              1
## 118        1.75              1
## 119        1.47              1
## 120        0.43              0
## 121        1.74              1
## 122        1.47              1
## 123        0.62              1
## 124        3.69              1
## 125        1.86              1
## 126        4.65              1
## 127        9.20              1
## 128        1.20              1
## 129        0.86              1
## 130        3.96              1
## 131        3.45              0
## 132        1.13              0
## 133        0.29              0
## 134        0.96              1
## 135        1.46              1
## 136        3.36              1
## 137        0.64              0
## 138        1.05              0
## 139        1.11              0
## 140        1.09              0
## 141        0.60              0
## 142        3.15              1
## 143        7.58              1
## 144        3.15              0
## 145        2.81              0
## 146        0.32              0
## 147        4.09              1
## 148       13.29              1
## 149        4.02              1
## 150        6.15              1
## 151        1.52              1
## 152        5.33              1
## 153        0.88              0
## 154        3.82              0
## 155        6.10              1
## 156        0.81              0
## 157        5.17              1
## 158        1.85              0
## 159        3.56              0
## 160        1.00              0
## 161        1.13              0
## 162        0.96              0
## 163        2.32              1
## 164        1.11              0
## 165        2.46              0
## 166        2.18              0
## 167        0.99              0
## 168        0.40              0
## 169        2.60              1
## 170        2.01              0

dataset$datalar.tuvong <- factor(dataset$datalar.tuvong,labels = c("Song", "Tuvong"))
library(OptimalCutpoints)
result <- optimal.cutpoints(X = "datalar.lar", status = "datalar.tuvong", tag.healthy ="Song",
                            methods = "Youden", data = dataset)
summary(result)

## 
## Call:
## optimal.cutpoints.default(X = "datalar.lar", status = "datalar.tuvong", 
##     tag.healthy = "Song", methods = "Youden", data = dataset)
## 
## Area under the ROC curve (AUC):  0.811 (0.747, 0.874) 
## 
## CRITERION: Youden
## Number of optimal cutoffs: 1
## 
##                     Estimate
## cutoff             1.2000000
## Se                 0.9081633
## Sp                 0.5694444
## PPV                0.7416667
## NPV                0.8200000
## DLR.Positive       2.1092824
## DLR.Negative       0.1612743
## FP                31.0000000
## FN                 9.0000000
## Optimal criterion  0.4776077

plot(result)

## Press return for next page....

Dùng phương pháp BMA để tìm ra mô hình phù hợp trên lâm sàng

library(dominanceanalysis)
temp1 <- glm(tuvong ~ age + nkp + lactate
            + lar + sofa + apacheii, family="binomial", data=datalar)
dom <- dominanceAnalysis(temp1)
plot(dom, which.graph ="general",fit.function ="r2.m")

df <- data.frame(datalar$age, datalar$lactate, datalar$lar, datalar$sofa, datalar$apacheii,
                 datalar$nkp, datalar$tuvong)
head(df)

##   datalar.age datalar.lactate datalar.lar datalar.sofa datalar.apacheii
## 1          87             2.1        0.98           10               32
## 2          70             4.5        2.20            4               18
## 3          69             0.9        0.31            9               22
## 4          71            10.9        4.36            2               15
## 5          66             6.1        3.53            6               23
## 6          78             4.7        1.68            6               18
##   datalar.nkp datalar.tuvong
## 1           1              1
## 2           1              1
## 3           0              0
## 4           0              1
## 5           0              1
## 6           1              1

attach(df)

## The following objects are masked from dataset:
## 
##     datalar.lar, datalar.tuvong

names(df)

## [1] "datalar.age"      "datalar.lactate"  "datalar.lar"      "datalar.sofa"    
## [5] "datalar.apacheii" "datalar.nkp"      "datalar.tuvong"

xvars <- df[,1:6]
y <- datalar$tuvong
library(survival)
library(leaps)
library(robustbase)

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

library(inline)
library(rrcov)

## Warning: package 'rrcov' was built under R version 4.4.1

## Scalable Robust Estimators with High Breakdown Point (version 1.7-6)

library(BMA)
bma.search1 <- bic.glm(xvars, y, strict=F, OR=20, glm.family="binomial")
summary(bma.search1)

## 
## Call:
## bic.glm.data.frame(x = xvars, y = y, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   9  models were selected
##  Best  5  models (cumulative posterior probability =  0.8532 ): 
## 
##                   p!=0    EV         SD       model 1     model 2   
## Intercept         100    -4.3408487  1.35289    -4.75641    -2.70978
## datalar.age        68.2   0.0209587  0.01758     0.02999       .    
## datalar.lactate     2.8  -0.0001623  0.02941       .           .    
## datalar.lar       100.0   1.0544229  0.20271     1.08211     1.06073
## datalar.sofa       30.1   0.0383765  0.06910       .           .    
## datalar.apacheii    7.9   0.0029351  0.01401       .           .    
## datalar.nkp        90.5   1.1228499  0.54006     1.22839     1.39740
##                                                                     
## nVar                                               3           2    
## BIC                                           -695.89463  -694.60728
## post prob                                        0.362       0.190  
##                   model 3     model 4     model 5   
## Intercept           -5.46903    -3.39075    -5.40209
## datalar.age          0.03010       .         0.03553
## datalar.lactate        .           .           .    
## datalar.lar          1.03355     1.01078     0.98710
## datalar.sofa         0.12271     0.12103     0.15165
## datalar.apacheii       .           .           .    
## datalar.nkp          1.09660     1.25729       .    
##                                                     
## nVar                   4           3           3    
## BIC               -694.23342  -693.06694  -692.12947
## post prob            0.158       0.088       0.055

imageplot.bma(bma.search1)

Phân tích tính chính xác của mô hình bằng Calibration

library(rms)

## Loading required package: Hmisc

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Registered S3 method overwritten by 'rms':
##   method       from      
##   print.lrtest epiDisplay

## 
## Attaching package: 'rms'

## The following object is masked from 'package:epiDisplay':
## 
##     lrtest

model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
model2 <- lrm(tuvong ~ lar + nkp, x=TRUE,y=TRUE, data=datalar)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
model1

## Logistic Regression Model
## 
## lrm(formula = tuvong ~ age + lar + nkp, data = datalar, x = TRUE, 
##     y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      75.03      R2       0.480    C       0.855    
##  0             72    d.f.             3      R2(3,170)0.345    Dxy     0.709    
##  1             98    Pr(> chi2) <0.0001    R2(3,124.5)0.439    gamma   0.709    
## max |deriv| 8e-11                            Brier    0.153    tau-a   0.348    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -4.7564 1.0158 -4.68  <0.0001 
## age        0.0300 0.0122  2.47  0.0136  
## lar        1.0821 0.1902  5.69  <0.0001 
## nkp        1.2284 0.4088  3.00  0.0027

model2

## Logistic Regression Model
## 
## lrm(formula = tuvong ~ lar + nkp, data = datalar, x = TRUE, y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      68.61      R2       0.446    C       0.839    
##  0             72    d.f.             2      R2(2,170)0.324    Dxy     0.678    
##  1             98    Pr(> chi2) <0.0001    R2(2,124.5)0.414    gamma   0.679    
## max |deriv| 5e-06                            Brier    0.161    tau-a   0.333    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -2.7098 0.5053 -5.36  <0.0001 
## lar        1.0607 0.1842  5.76  <0.0001 
## nkp        1.3974 0.3981  3.51  0.0004

model3

## Logistic Regression Model
## 
## lrm(formula = tuvong ~ age + lar + nkp + sofa, data = datalar, 
##     x = TRUE, y = TRUE)
## 
##                        Model Likelihood      Discrimination    Rank Discrim.    
##                              Ratio Test             Indexes          Indexes    
## Obs           170    LR chi2      78.50      R2       0.497    C       0.864    
##  0             72    d.f.             4      R2(4,170)0.355    Dxy     0.728    
##  1             98    Pr(> chi2) <0.0001    R2(4,124.5)0.450    gamma   0.728    
## max |deriv| 4e-10                            Brier    0.148    tau-a   0.357    
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept -5.4690 1.1221 -4.87  <0.0001 
## age        0.0301 0.0123  2.45  0.0142  
## lar        1.0336 0.1940  5.33  <0.0001 
## nkp        1.0966 0.4154  2.64  0.0083  
## sofa       0.1227 0.0671  1.83  0.0673

library(rms)
model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
pred.logit <- predict(model1)
phat <- 1/(1+ exp(-pred.logit))
val.prob(phat, datalar$tuvong, m=10, cex=.5)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  7.090420e-01  8.545210e-01  4.795776e-01  4.354707e-01  7.503001e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA -1.176471e-02 -5.684342e-14  1.000000e+00  4.472354e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  1.531266e-01  2.069542e-14  1.000000e+00  5.176247e-02  4.402857e-02 
##          Eavg           S:z           S:p 
##  2.718828e-02  1.262687e-01  8.995192e-01

library(rms)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
pred.logit1 <- predict(model3)
phat1 <- 1/(1+ exp(-pred.logit1))
val.prob(phat1, datalar$tuvong, m=10, cex=.5)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  7.277494e-01  8.638747e-01  4.970655e-01  4.559094e-01  7.850460e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA -1.176471e-02 -2.842171e-14  1.000000e+00  4.676741e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  1.484784e-01  9.276900e-14  1.000000e+00  1.007307e-01  8.852993e-02 
##          Eavg           S:z           S:p 
##  4.910426e-02  5.790848e-02  9.538215e-01

Vẽ Nomogram cho mô hình tiên lượng bao gồm lactate - tuổi - nhiễm khuẩn hô hấp

set.seed(123)
data <- data.frame(
  age = datalar$age,
  lar = datalar$lar,
  nkp = datalar$nkp,
  tuvong = datalar$tuvong
)
options(datadist = "ddist")
ddist <- datadist(data)
m1 <- lrm(tuvong ~ lar + age + nkp, data = data)
nom <- nomogram(m1, fun = plogis, fun.at = c(0.1, 0.25, 0.5, 0.75, 0.9),
                lp = FALSE, funlabel = "Probability of Death")
plot(nom, xfrac = 0.2,  label.every = list(age = 1))

Phân tích hồi quy rừng ngẫu nhiên để kiểm tra cho mô hình

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.1

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

str(datalar)

## tibble [170 × 49] (S3: tbl_df/tbl/data.frame)
##  $ stt       : num [1:170] 1 2 3 4 5 6 7 8 9 10 ...
##  $ sex       : num [1:170] 0 1 1 1 0 1 1 0 0 1 ...
##  $ age       : num [1:170] 87 70 69 71 66 78 73 87 63 67 ...
##  $ ngay      : num [1:170] 18 6 20 12 13 4 8 1 1 11 ...
##  $ tha       : num [1:170] 1 0 1 1 1 1 0 0 0 1 ...
##  $ dtd       : num [1:170] 0 0 1 1 1 0 0 0 0 0 ...
##  $ btm       : num [1:170] 0 0 0 1 0 1 0 0 0 1 ...
##  $ dqn       : num [1:170] 0 0 0 0 0 0 0 1 1 0 ...
##  $ bpm       : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ bthm      : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ bgm       : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ut        : num [1:170] 0 1 0 0 0 0 0 0 1 0 ...
##  $ nkp       : num [1:170] 1 1 0 0 0 1 1 1 1 1 ...
##  $ nktn      : num [1:170] 0 0 0 0 1 0 0 0 1 0 ...
##  $ nkth      : num [1:170] 0 0 0 1 0 1 0 0 0 0 ...
##  $ nkdmm     : num [1:170] 0 0 0 0 0 0 0 0 1 0 ...
##  $ khac      : num [1:170] 0 0 1 0 0 0 0 0 0 0 ...
##  $ gcs       : num [1:170] 8 15 15 14 8 12 15 6 14 15 ...
##  $ hatb      : num [1:170] 60 102 123 163 60 63 87 83 88 88 ...
##  $ mach      : num [1:170] 103 90 62 98 112 130 66 160 135 82 ...
##  $ nhietdo   : num [1:170] 36.8 38.5 37 36 36.5 36.4 38.2 39 37.6 38.1 ...
##  $ nhiptho   : num [1:170] 30 35 20 25 33 32 19 29 30 28 ...
##  $ sp02      : num [1:170] 94 90 82 97 85 85 95 78 85 98 ...
##  $ caymau    : num [1:170] 0 0 0 1 1 0 0 1 0 0 ...
##  $ thongkhich: num [1:170] 1 1 0 1 1 1 0 1 0 1 ...
##  $ locmau    : num [1:170] 1 0 1 1 1 1 0 1 0 1 ...
##  $ albumin   : num [1:170] 2.15 2.05 2.94 2.5 1.73 2.8 2.11 2.01 2.06 3.2 ...
##  $ lar       : num [1:170] 0.98 2.2 0.31 4.36 3.53 1.68 0.95 6.37 3.01 2.69 ...
##  $ lactate   : num [1:170] 2.1 4.5 0.9 10.9 6.1 4.7 2 12.8 6.2 8.6 ...
##  $ pct       : num [1:170] 3.33 2.03 5.02 100 1.56 ...
##  $ tuvong    : num [1:170] 1 1 0 1 1 1 0 1 1 1 ...
##  $ apacheii  : num [1:170] 32 18 22 15 23 18 19 30 15 14 ...
##  $ sofa      : num [1:170] 10 4 9 2 6 6 2 12 4 7 ...
##  $ bc        : num [1:170] 25 14.2 26.4 15.7 16.2 ...
##  $ hct       : num [1:170] 40.1 42.3 37.1 39.1 28.3 39.2 28.9 45.2 23.3 36.7 ...
##  $ tc        : num [1:170] 202 399 162 119 390 293 339 238 135 76 ...
##  $ bilitp    : num [1:170] 12.8 20.1 564 11.1 9.9 10.8 15.8 24.3 14.2 39.3 ...
##  $ na        : num [1:170] 161 121 123 136 119 138 129 152 123 137 ...
##  $ k         : num [1:170] 4.4 4.9 5.6 3.3 3.8 5 3.9 3.3 3.6 4.5 ...
##  $ ast       : num [1:170] 70 56 1270 33 26 30 35 27 47 238 ...
##  $ alt       : num [1:170] 75 44 2429 19 26 ...
##  $ ure       : num [1:170] 44.7 7.7 38.5 7.3 7.6 9.6 7.9 14.8 11.2 14.6 ...
##  $ crea      : num [1:170] 335 123 632 107 102 144 87 114 74 314 ...
##  $ egfr      : num [1:170] 11.98 53.76 8.16 46.6 50.14 ...
##  $ ph        : num [1:170] 7.45 7.31 7.38 7.3 7.47 ...
##  $ pc02      : num [1:170] 32 21 31.4 32.7 27.6 32.8 64.6 19.9 46.2 18.5 ...
##  $ p02       : num [1:170] 89.4 134.1 73.9 84.4 81.8 ...
##  $ hc03      : num [1:170] 21.9 10.4 18.2 15.5 19.6 20.1 21.9 13.3 21.2 10.3 ...
##  $ alb_group : chr [1:170] "low" "low" "normal" "normal" ...

model_rf <- randomForest(tuvong ~ age + lar + nkp, data = datalar, importance = TRUE, ntree = 500)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

print(model_rf)

## 
## Call:
##  randomForest(formula = tuvong ~ age + lar + nkp, data = datalar,      importance = TRUE, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1729648
##                     % Var explained: 29.16

predictions <- predict(model_rf, datalar)
mse <- mean((predictions - datalar$tuvong)^2)
cat("Mean Squared Error (MSE):", mse, "\n")

## Mean Squared Error (MSE): 0.1015607

rsq <- 1 - sum((predictions - datalar$tuvong)^2) / sum((mean(datalar$tuvong) - datalar$tuvong)^2)
cat("R²:", rsq, "\n")

## R²: 0.5840273

# Tính RMSE
rmse <- sqrt(mean((predictions - datalar$tuvong)^2))
cat("RMSE:", rmse, "\n")

## RMSE: 0.3186858

# Tính MAE
mae <- mean(abs(predictions - datalar$tuvong))
cat("MAE:", mae, "\n")

## MAE: 0.2770989

# Tính R² điều chỉnh
n <- nrow(datalar)
p <- length(coef(model_rf)) - 1  # Số biến trong mô hình (cộng cả intercept)
r2_adj <- 1 - (1 - rsq) * (n - 1) / (n - p - 1)
cat("Adjusted R²:", r2_adj, "\n")

## Adjusted R²: 0.5864742

# Vẽ biểu đồ sai số
residuals <- predictions - datalar$tuvong
plot(residuals, main = "Residuals Plot", xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

# Ví dụ với cross-validation (n-folds)
library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## The following object is masked from 'package:epiDisplay':
## 
##     alpha

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:epiDisplay':
## 
##     dotplot

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

train_control <- trainControl(method = "cv", number = 10)  # 10-fold CV
cv_model <- train(tuvong ~ age + lar + nkp, data = datalar, method = "rf", trControl = train_control)

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values.  Are you sure you want to do regression?

print(cv_model)

## Random Forest 
## 
## 170 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 153, 153, 153, 153, 153, 153, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##   2     0.4269553  0.3043584  0.3280763
##   3     0.4314351  0.2970112  0.3268998
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.

# Tính MAPE
mape <- mean(abs((predictions - datalar$tuvong) / datalar$tuvong)) * 100
cat("MAPE:", mape, "%\n")

## MAPE: Inf %

# Loại bỏ các giá trị thực tế bằng 0 trước khi tính MAPE
non_zero_data <- datalar[datalar$tuvong != 0, ]
predictions_non_zero <- predictions[datalar$tuvong != 0]
# Tính lại MAPE
mape <- mean(abs((predictions_non_zero - non_zero_data$tuvong) / non_zero_data$tuvong)) * 100
cat("MAPE:", mape, "%\n")

## MAPE: 24.2408 %

Với R² = 57.49% và MSE = 0.1038, mô hình hiện tại đang hoạt động tốt. R² = 0.5749125: Hệ số xác định (R²) cho biết mô hình có thể giải thích khoảng 57.49% biến động của biến kết quả (tuvong). Đây là một mức độ giải thích khá tốt, đặc biệt đối với các mô hình phức tạp như hồi quy rừng ngẫu nhiên. MSE = 0.1037861: MSE cho thấy sai số bình phương trung bình của mô hình là 0.1038. Một giá trị MSE thấp cho thấy mô hình của bạn có độ chính xác cao trong việc dự đoán các giá trị thực tế.