library(readxl)
datalar <- read_excel("~/Desktop/Developer/datalar.xlsx")
View(datalar)
attach(datalar)
shapiro.test(datalar$age)
##
## Shapiro-Wilk normality test
##
## data: datalar$age
## W = 0.94995, p-value = 9.945e-06
shapiro.test(datalar$gcs)
##
## Shapiro-Wilk normality test
##
## data: datalar$gcs
## W = 0.76095, p-value = 2.361e-15
shapiro.test(datalar$mach)
##
## Shapiro-Wilk normality test
##
## data: datalar$mach
## W = 0.98003, p-value = 0.015
shapiro.test(datalar$nhietdo)
##
## Shapiro-Wilk normality test
##
## data: datalar$nhietdo
## W = 0.94253, p-value = 2.307e-06
shapiro.test(datalar$nhiptho)
##
## Shapiro-Wilk normality test
##
## data: datalar$nhiptho
## W = 0.96489, p-value = 0.0002728
shapiro.test(datalar$hatb)
##
## Shapiro-Wilk normality test
##
## data: datalar$hatb
## W = 0.95999, p-value = 8.662e-05
shapiro.test(datalar$sp02)
##
## Shapiro-Wilk normality test
##
## data: datalar$sp02
## W = 0.7037, p-value < 2.2e-16
shapiro.test(datalar$bc)
##
## Shapiro-Wilk normality test
##
## data: datalar$bc
## W = 0.94903, p-value = 8.249e-06
shapiro.test(datalar$hct)
##
## Shapiro-Wilk normality test
##
## data: datalar$hct
## W = 0.98586, p-value = 0.08341
shapiro.test(datalar$tc)
##
## Shapiro-Wilk normality test
##
## data: datalar$tc
## W = 0.9027, p-value = 3.67e-09
shapiro.test(datalar$bilitp)
##
## Shapiro-Wilk normality test
##
## data: datalar$bilitp
## W = 0.27918, p-value < 2.2e-16
shapiro.test(datalar$na)
##
## Shapiro-Wilk normality test
##
## data: datalar$na
## W = 0.95381, p-value = 2.225e-05
shapiro.test(datalar$k)
##
## Shapiro-Wilk normality test
##
## data: datalar$k
## W = 0.96519, p-value = 0.0002938
shapiro.test(datalar$ast)
##
## Shapiro-Wilk normality test
##
## data: datalar$ast
## W = 0.31451, p-value < 2.2e-16
shapiro.test(datalar$alt)
##
## Shapiro-Wilk normality test
##
## data: datalar$alt
## W = 0.305, p-value < 2.2e-16
shapiro.test(datalar$ure)
##
## Shapiro-Wilk normality test
##
## data: datalar$ure
## W = 0.8503, p-value = 6.615e-12
shapiro.test(datalar$crea)
##
## Shapiro-Wilk normality test
##
## data: datalar$crea
## W = 0.82095, p-value = 3.627e-13
shapiro.test(datalar$ph)
##
## Shapiro-Wilk normality test
##
## data: datalar$ph
## W = 0.90407, p-value = 4.443e-09
shapiro.test(datalar$pc02)
##
## Shapiro-Wilk normality test
##
## data: datalar$pc02
## W = 0.85577, p-value = 1.186e-11
shapiro.test(datalar$p02)
##
## Shapiro-Wilk normality test
##
## data: datalar$p02
## W = 0.71846, p-value < 2.2e-16
shapiro.test(datalar$hc03)
##
## Shapiro-Wilk normality test
##
## data: datalar$hc03
## W = 0.95487, p-value = 2.791e-05
shapiro.test(datalar$lactate)
##
## Shapiro-Wilk normality test
##
## data: datalar$lactate
## W = 0.89633, p-value = 1.537e-09
shapiro.test(datalar$albumin)
##
## Shapiro-Wilk normality test
##
## data: datalar$albumin
## W = 0.98597, p-value = 0.08636
shapiro.test(datalar$lar)
##
## Shapiro-Wilk normality test
##
## data: datalar$lar
## W = 0.85129, p-value = 7.344e-12
shapiro.test(datalar$sofa)
##
## Shapiro-Wilk normality test
##
## data: datalar$sofa
## W = 0.96972, p-value = 0.0009052
shapiro.test(datalar$apacheii)
##
## Shapiro-Wilk normality test
##
## data: datalar$apacheii
## W = 0.97322, p-value = 0.002262
shapiro.test(datalar$ngay)
##
## Shapiro-Wilk normality test
##
## data: datalar$ngay
## W = 0.789, p-value = 2.19e-14
shapiro.test(datalar$pct)
##
## Shapiro-Wilk normality test
##
## data: datalar$pct
## W = 0.75866, p-value = 3.903e-15
–> Kết quả cho thấy các biến phân phối không chuẩn, chỉ có biến albumin và biến Hct là phân phối chuẩn
library(gtsummary)
summary2 <- datalar %>%
select(sex, age, dtd, tha, btm, dqn, bpm, bthm, bgm, ut, nkp, nktn, nkth,
nkdmm, khac, gcs, mach, nhietdo, nhiptho, hatb, sp02, thongkhich, locmau, bc,
hct, tc, bilitp, na, k, ast, alt, ure, crea, ph, pc02, p02, hc03,
lactate, albumin, lar, pct, caymau, sofa, apacheii, ngay, tuvong) %>%
tbl_summary(
by = tuvong,
type = all_continuous() ~ "continuous2",
statistic = list(
all_continuous() ~ c("{mean} ({sd})", "{median} ({p25}, {p75})", "{min}, {max}"),
all_categorical() ~ "{n} / {N} ({p}%)"),
digits = all_continuous() ~ 2,
label = list(
sex ~ "Giới tính",
age ~ "Tuổi",
dtd ~ "Đái tháo đường",
tha ~ "Tăng huyết áp",
btm ~ "Bệnh tim mạch",
dqn ~ "Đột quỵ não",
bpm ~ "Bệnh phổi mạn",
bthm ~ "Bệnh thận mạn",
bgm ~ "Bệnh gan mạn",
ut ~ "Ung thư",
nkp ~ "Nhiễm khuẩn hô hấp",
nktn ~ "Nhiễm khuẩn tiết niệu",
nkth ~ "Nhiễm khuẩn tiêu hoá",
nkdmm ~ "Nhiễm khuẩn da và mô mềm",
khac ~ "Khác",
gcs ~ "GCS",
mach ~ "Mạch",
nhietdo ~ "Nhiệt độ",
nhiptho ~ "Nhịp thở",
hatb ~ "Huyết áp trung bình",
sp02 ~ "Sp02",
thongkhich ~ "Thông khí cơ học",
locmau ~ "CRRT",
bc ~ "Bạch cầu",
hct ~ "Hct",
tc ~ "Tiểu cầu",
bilitp ~ "Bilirubin toàn phần",
na ~ "Natri",
k ~ "Kali",
ast ~ "AST",
alt ~ "ALT",
ure ~ "Ure",
crea ~ "Creatinine",
ph ~ "pH",
pc02 ~"pC02",
p02 ~ "p02",
hc03 ~ "HC03",
lactate ~ "Lactate máu",
albumin ~ "ALbumin máu",
lar ~"Lactate/Albumin máu",
pct ~ "PCT",
caymau ~ "Cấy máu dương tính",
sofa ~ "Thang điểm SOFA",
apacheii ~ "Thang điểm APACHE II",
ngay ~ "Số ngày nằm khoa hồi sức"),
missing_text = "(Missing)"
) %>%
add_p(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
add_overall() %>%
add_n() %>%
modify_caption("**Đặc điểm dân số nghiên cứu**") %>%
bold_labels()
summary2
| Characteristic | N | Overall, N = 1701 | 0, N = 721 | 1, N = 981 | p-value2 |
|---|---|---|---|---|---|
| Giới tính | 170 | 92 / 170 (54%) | 43 / 72 (60%) | 49 / 98 (50%) | 0.209 |
| Tuổi | 170 | <0.001 | |||
| Mean (SD) | 70.64 (17.13) | 65.88 (17.01) | 74.14 (16.44) | ||
| Median (IQR) | 73.00 (62.00, 84.75) | 68.50 (55.75, 78.25) | 77.50 (66.00, 86.00) | ||
| Range | 16.00, 101.00 | 17.00, 99.00 | 16.00, 101.00 | ||
| Đái tháo đường | 170 | 59 / 170 (35%) | 23 / 72 (32%) | 36 / 98 (37%) | 0.517 |
| Tăng huyết áp | 170 | 105 / 170 (62%) | 43 / 72 (60%) | 62 / 98 (63%) | 0.639 |
| Bệnh tim mạch | 170 | 49 / 170 (29%) | 24 / 72 (33%) | 25 / 98 (26%) | 0.266 |
| Đột quỵ não | 170 | 25 / 170 (15%) | 7 / 72 (9.7%) | 18 / 98 (18%) | 0.116 |
| Bệnh phổi mạn | 170 | 8 / 170 (4.7%) | 6 / 72 (8.3%) | 2 / 98 (2.0%) | 0.072 |
| Bệnh thận mạn | 170 | 13 / 170 (7.6%) | 3 / 72 (4.2%) | 10 / 98 (10%) | 0.143 |
| Bệnh gan mạn | 170 | 4 / 170 (2.4%) | 2 / 72 (2.8%) | 2 / 98 (2.0%) | >0.999 |
| Ung thư | 170 | 12 / 170 (7.1%) | 5 / 72 (6.9%) | 7 / 98 (7.1%) | 0.960 |
| Nhiễm khuẩn hô hấp | 170 | 87 / 170 (51%) | 26 / 72 (36%) | 61 / 98 (62%) | <0.001 |
| Nhiễm khuẩn tiết niệu | 170 | 34 / 170 (20%) | 11 / 72 (15%) | 23 / 98 (23%) | 0.187 |
| Nhiễm khuẩn tiêu hoá | 170 | 44 / 170 (26%) | 24 / 72 (33%) | 20 / 98 (20%) | 0.057 |
| Nhiễm khuẩn da và mô mềm | 170 | 16 / 170 (9.4%) | 11 / 72 (15%) | 5 / 98 (5.1%) | 0.025 |
| Khác | 170 | 16 / 170 (9.4%) | 9 / 72 (13%) | 7 / 98 (7.1%) | 0.237 |
| GCS | 170 | <0.001 | |||
| Mean (SD) | 11.98 (3.94) | 13.26 (3.15) | 11.03 (4.20) | ||
| Median (IQR) | 14.00 (9.00, 15.00) | 15.00 (13.75, 15.00) | 12.00 (8.00, 15.00) | ||
| Range | 3.00, 15.00 | 3.00, 15.00 | 3.00, 15.00 | ||
| Mạch | 170 | 0.975 | |||
| Mean (SD) | 108.13 (24.10) | 107.50 (23.65) | 108.59 (24.53) | ||
| Median (IQR) | 105.50 (93.25, 124.00) | 106.50 (93.75, 124.00) | 105.00 (93.25, 126.00) | ||
| Range | 60.00, 170.00 | 62.00, 170.00 | 60.00, 166.00 | ||
| Nhiệt độ | 170 | 0.061 | |||
| Mean (SD) | 37.53 (1.05) | 37.68 (1.02) | 37.42 (1.06) | ||
| Median (IQR) | 37.15 (36.73, 38.28) | 37.65 (37.00, 38.42) | 37.00 (36.56, 38.10) | ||
| Range | 36.00, 41.30 | 36.00, 41.30 | 36.00, 41.10 | ||
| Nhịp thở | 170 | 0.446 | |||
| Mean (SD) | 25.81 (7.16) | 25.57 (5.64) | 25.99 (8.12) | ||
| Median (IQR) | 25.00 (20.00, 30.00) | 24.00 (20.75, 28.00) | 26.00 (20.00, 30.00) | ||
| Range | 10.00, 50.00 | 18.00, 40.00 | 10.00, 50.00 | ||
| Huyết áp trung bình | 170 | 0.155 | |||
| Mean (SD) | 88.26 (26.22) | 91.36 (26.56) | 85.98 (25.87) | ||
| Median (IQR) | 83.00 (70.00, 103.00) | 87.00 (72.75, 104.00) | 80.00 (70.00, 100.00) | ||
| Range | 37.00, 163.00 | 43.00, 163.00 | 37.00, 163.00 | ||
| Sp02 | 170 | 0.203 | |||
| Mean (SD) | 89.28 (10.60) | 90.97 (7.27) | 88.04 (12.39) | ||
| Median (IQR) | 93.00 (86.25, 95.00) | 93.00 (89.75, 95.00) | 90.00 (85.00, 95.00) | ||
| Range | 20.00, 99.00 | 65.00, 99.00 | 20.00, 99.00 | ||
| Thông khí cơ học | 170 | 103 / 170 (61%) | 20 / 72 (28%) | 83 / 98 (85%) | <0.001 |
| CRRT | 170 | 41 / 170 (24%) | 11 / 72 (15%) | 30 / 98 (31%) | 0.021 |
| Bạch cầu | 170 | 0.292 | |||
| Mean (SD) | 15.95 (9.01) | 16.59 (8.69) | 15.47 (9.25) | ||
| Median (IQR) | 14.43 (9.43, 21.46) | 15.12 (10.97, 20.74) | 13.96 (8.70, 21.77) | ||
| Range | 0.85, 48.30 | 3.00, 46.03 | 0.85, 48.30 | ||
| Hct | 170 | 0.868 | |||
| Mean (SD) | 33.76 (7.82) | 33.92 (6.67) | 33.65 (8.59) | ||
| Median (IQR) | 34.40 (28.53, 39.20) | 35.60 (29.50, 39.00) | 33.90 (28.13, 39.90) | ||
| Range | 8.20, 57.70 | 14.50, 45.20 | 8.20, 57.70 | ||
| Tiểu cầu | 170 | 0.454 | |||
| Mean (SD) | 201.91 (127.90) | 209.60 (129.89) | 196.27 (126.79) | ||
| Median (IQR) | 175.50 (119.00, 252.50) | 193.50 (135.25, 237.00) | 173.00 (104.25, 262.00) | ||
| Range | 16.00, 710.00 | 16.00, 691.00 | 23.00, 710.00 | ||
| Bilirubin toàn phần | 170 | 0.327 | |||
| Mean (SD) | 35.87 (86.23) | 44.27 (125.00) | 29.71 (37.77) | ||
| Median (IQR) | 16.50 (11.35, 31.38) | 14.95 (10.43, 27.70) | 17.95 (11.79, 31.83) | ||
| Range | 3.49, 920.82 | 4.64, 920.82 | 3.49, 289.20 | ||
| Natri | 170 | 0.105 | |||
| Mean (SD) | 135.60 (8.70) | 134.67 (8.22) | 136.28 (9.02) | ||
| Median (IQR) | 135.20 (131.00, 140.00) | 135.00 (130.75, 138.00) | 136.00 (132.00, 141.00) | ||
| Range | 110.90, 169.00 | 111.00, 163.00 | 110.90, 169.00 | ||
| Kali | 170 | 0.178 | |||
| Mean (SD) | 3.86 (0.91) | 3.75 (0.84) | 3.93 (0.95) | ||
| Median (IQR) | 3.80 (3.20, 4.38) | 3.70 (3.09, 4.22) | 3.90 (3.30, 4.40) | ||
| Range | 1.80, 7.60 | 2.30, 5.84 | 1.80, 7.60 | ||
| AST | 170 | 0.930 | |||
| Mean (SD) | 196.83 (523.31) | 173.27 (302.42) | 214.13 (639.90) | ||
| Median (IQR) | 57.50 (33.00, 129.25) | 62.00 (32.48, 130.50) | 54.55 (34.00, 125.50) | ||
| Range | 13.00, 5,903.00 | 14.00, 1,474.00 | 13.00, 5,903.00 | ||
| ALT | 170 | 0.406 | |||
| Mean (SD) | 123.10 (346.45) | 112.96 (300.18) | 130.55 (378.19) | ||
| Median (IQR) | 38.00 (20.03, 84.00) | 40.50 (22.75, 92.00) | 36.55 (19.25, 74.75) | ||
| Range | 4.00, 3,117.00 | 9.00, 2,429.00 | 4.00, 3,117.00 | ||
| Ure | 170 | 0.569 | |||
| Mean (SD) | 14.78 (9.99) | 14.77 (10.69) | 14.78 (9.50) | ||
| Median (IQR) | 12.15 (8.03, 17.08) | 11.70 (7.33, 18.70) | 12.35 (8.45, 16.88) | ||
| Range | 1.30, 57.10 | 1.30, 57.10 | 2.40, 52.90 | ||
| Creatinine | 170 | 0.263 | |||
| Mean (SD) | 197.30 (136.93) | 217.73 (158.13) | 182.29 (117.59) | ||
| Median (IQR) | 154.50 (106.25, 240.50) | 163.50 (111.50, 275.25) | 154.00 (106.25, 224.25) | ||
| Range | 30.00, 778.00 | 30.00, 778.00 | 54.00, 701.00 | ||
| pH | 170 | 0.039 | |||
| Mean (SD) | 7.38 (0.11) | 7.40 (0.10) | 7.36 (0.12) | ||
| Median (IQR) | 7.40 (7.34, 7.46) | 7.42 (7.37, 7.46) | 7.38 (7.31, 7.45) | ||
| Range | 6.98, 7.61 | 7.08, 7.61 | 6.98, 7.53 | ||
| pC02 | 170 | 0.428 | |||
| Mean (SD) | 30.85 (11.07) | 29.99 (10.10) | 31.49 (11.73) | ||
| Median (IQR) | 28.55 (23.63, 35.18) | 28.20 (23.58, 32.73) | 29.15 (24.05, 36.50) | ||
| Range | 14.60, 81.50 | 14.60, 70.00 | 15.80, 81.50 | ||
| p02 | 170 | 0.892 | |||
| Mean (SD) | 103.02 (59.30) | 98.58 (53.51) | 106.29 (63.29) | ||
| Median (IQR) | 82.10 (67.63, 106.65) | 80.40 (71.15, 102.03) | 83.50 (66.38, 107.83) | ||
| Range | 39.10, 399.10 | 54.50, 399.10 | 39.10, 334.50 | ||
| HC03 | 170 | 0.138 | |||
| Mean (SD) | 17.72 (5.75) | 18.29 (5.91) | 17.30 (5.62) | ||
| Median (IQR) | 17.35 (13.90, 21.02) | 18.30 (15.48, 21.28) | 16.75 (13.23, 20.48) | ||
| Range | 5.00, 46.00 | 5.00, 46.00 | 5.00, 36.40 | ||
| Lactate máu | 170 | <0.001 | |||
| Mean (SD) | 6.04 (4.04) | 3.85 (2.26) | 7.64 (4.31) | ||
| Median (IQR) | 4.95 (3.08, 8.03) | 3.16 (2.08, 5.49) | 7.14 (4.10, 10.09) | ||
| Range | 0.77, 22.37 | 0.77, 10.41 | 1.60, 22.37 | ||
| ALbumin máu | 170 | 0.009 | |||
| Mean (SD) | 2.59 (0.59) | 2.71 (0.50) | 2.50 (0.64) | ||
| Median (IQR) | 2.57 (2.23, 2.94) | 2.73 (2.36, 3.06) | 2.48 (2.12, 2.81) | ||
| Range | 0.82, 4.46 | 1.49, 4.11 | 0.82, 4.46 | ||
| Lactate/Albumin máu | 170 | <0.001 | |||
| Mean (SD) | 2.50 (1.88) | 1.48 (0.90) | 3.26 (2.04) | ||
| Median (IQR) | 2.18 (1.06, 3.40) | 1.07 (0.80, 2.20) | 2.86 (1.74, 4.13) | ||
| Range | 0.29, 13.29 | 0.29, 3.82 | 0.53, 13.29 | ||
| PCT | 164 | 0.723 | |||
| Mean (SD) | 32.28 (37.78) | 34.14 (38.99) | 31.00 (37.06) | ||
| Median (IQR) | 10.65 (2.18, 63.13) | 14.90 (2.39, 75.43) | 10.08 (2.03, 54.42) | ||
| Range | 0.05, 101.56 | 0.07, 100.00 | 0.05, 101.56 | ||
| (Missing) | 6 | 5 | 1 | ||
| Cấy máu dương tính | 170 | 44 / 170 (26%) | 17 / 72 (24%) | 27 / 98 (28%) | 0.562 |
| Thang điểm SOFA | 170 | <0.001 | |||
| Mean (SD) | 7.68 (3.51) | 6.38 (2.88) | 8.64 (3.62) | ||
| Median (IQR) | 8.00 (5.00, 10.00) | 6.50 (4.00, 9.00) | 9.00 (6.00, 11.00) | ||
| Range | 2.00, 20.00 | 2.00, 15.00 | 2.00, 20.00 | ||
| Thang điểm APACHE II | 170 | <0.001 | |||
| Mean (SD) | 19.19 (6.98) | 16.89 (6.20) | 20.89 (7.07) | ||
| Median (IQR) | 18.00 (14.25, 24.00) | 17.00 (12.00, 21.00) | 19.00 (15.00, 25.00) | ||
| Range | 5.00, 41.00 | 5.00, 32.00 | 8.00, 41.00 | ||
| Số ngày nằm khoa hồi sức | 170 | 0.027 | |||
| Mean (SD) | 9.64 (10.31) | 11.58 (12.26) | 8.20 (8.39) | ||
| Median (IQR) | 6.00 (2.00, 14.00) | 7.00 (3.00, 15.50) | 5.00 (2.00, 13.00) | ||
| Range | 0.00, 69.00 | 1.00, 69.00 | 0.00, 36.00 | ||
| 1 n / N (%) | |||||
| 2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test | |||||
library(Matrix)
library(glmnet)
## Loaded glmnet 4.1-8
library(carData)
xvars <- model.matrix(tuvong ~ age + dqn + bpm + bthm
+ nkp + nktn + nkth + nkdmm
+ gcs + nhietdo + hatb + na + k + ph
+ hc03 + lactate + albumin
+ lar + sofa + apacheii, data=datalar)
yvar <- scale(datalar$tuvong)
ridge <- cv.glmnet(xvars, yvar, alpha=0)
plot(ridge)
coef(ridge)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.2215324701
## (Intercept) .
## age 0.0025531369
## dqn 0.0401059143
## bpm -0.1226852832
## bthm 0.0705360815
## nkp 0.0859860995
## nktn 0.0383467920
## nkth -0.0576417384
## nkdmm -0.0840431301
## gcs -0.0090827936
## nhietdo -0.0203671811
## hatb -0.0004490363
## na 0.0006842493
## k 0.0131111125
## ph -0.1262646953
## hc03 -0.0006278012
## lactate 0.0194288748
## albumin -0.0475977234
## lar 0.0404015882
## sofa 0.0126043733
## apacheii 0.0043672999
lasso <- cv.glmnet(xvars, yvar, alpha=1)
plot(lasso)
coef(lasso)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.590435063
## (Intercept) .
## age 0.001689660
## dqn .
## bpm .
## bthm .
## nkp 0.107958742
## nktn .
## nkth .
## nkdmm .
## gcs -0.006327289
## nhietdo .
## hatb .
## na .
## k .
## ph .
## hc03 .
## lactate 0.034973320
## albumin .
## lar 0.084448871
## sofa 0.009000562
## apacheii .
temp <- glm(tuvong ~ age + nkp + lactate
+ lar + sofa + apacheii, family="binomial", data=datalar)
library(nnet)
library(foreign)
library(survival)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
##
## select
library(epiDisplay)
logistic.display(temp)
##
## Logistic regression predicting tuvong
##
## crude OR(95%CI) adj. OR(95%CI)
## age (cont. var.) 1.03 (1.01,1.05) 1.03 (1.01,1.06)
##
## nkp: 1 vs 0 2.92 (1.55,5.48) 2.96 (1.29,6.79)
##
## lactate (cont. var.) 1.47 (1.28,1.68) 1.01 (0.7,1.45)
##
## lar (cont. var.) 2.72 (1.93,3.85) 2.74 (1.09,6.86)
##
## sofa (cont. var.) 1.23 (1.11,1.37) 1.13 (0.98,1.3)
##
## apacheii (cont. var.) 1.0964 (1.0422,1.1535) 1.0053 (0.9335,1.0826)
##
## P(Wald's test) P(LR-test)
## age (cont. var.) 0.018 0.016
##
## nkp: 1 vs 0 0.011 0.009
##
## lactate (cont. var.) 0.956 0.956
##
## lar (cont. var.) 0.032 0.017
##
## sofa (cont. var.) 0.099 0.094
##
## apacheii (cont. var.) 0.888 0.888
##
## Log-likelihood = -76.5758
## No. of observations = 170
## AIC value = 167.1515
datalar$alb_group <- ifelse(datalar$albumin < 2.5, "low", "normal")
table(datalar$alb_group)
##
## low normal
## 75 95
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
results <- datalar %>%
group_by(alb_group) %>%
summarise(
corr = cor(lar, tuvong, method = "spearman", use = "complete.obs"),
p_value = cor.test(lar, tuvong, method = "spearman")$p.value,
n = n()
)
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `p_value = cor.test(lar, tuvong, method = "spearman")$p.value`.
## ℹ In group 1: `alb_group = "low"`.
## Caused by warning in `cor.test.default()`:
## ! Cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
print(results)
## # A tibble: 2 × 4
## alb_group corr p_value n
## <chr> <dbl> <dbl> <int>
## 1 low 0.464 0.0000274 75
## 2 normal 0.536 0.0000000218 95
logistic_results <- datalar %>%
group_by(alb_group) %>%
group_map(~ {
model <- glm(tuvong ~ lar, data = .x, family = binomial)
summary(model)
})
logistic_results
## [[1]]
##
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4542 0.6329 -2.298 0.021578 *
## lar 0.8890 0.2634 3.376 0.000736 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 94.030 on 74 degrees of freedom
## Residual deviance: 74.751 on 73 degrees of freedom
## AIC: 78.751
##
## Number of Fisher Scoring iterations: 5
##
##
## [[2]]
##
## Call:
## glm(formula = tuvong ~ lar, family = binomial, data = .x)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0219 0.4879 -4.144 3.41e-05 ***
## lar 1.0475 0.2428 4.314 1.61e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 131.69 on 94 degrees of freedom
## Residual deviance: 101.06 on 93 degrees of freedom
## AIC: 105.06
##
## Number of Fisher Scoring iterations: 5
plot(x=datalar$lar, y=datalar$tuvong)
m2 = glm(datalar$tuvong ~ datalar$lar, family=binomial)
lines(datalar$lar, m2$fitted.values)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
larroc <- roc(datalar$tuvong, m2$fitted.values, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
par(pty = "s")
auc(larroc)
## Area under the curve: 0.8105
ci.auc(larroc)
## 95% CI: 0.7473-0.8737 (DeLong)
roc(datalar$tuvong, m2$fitted.values, plot = TRUE,
main = "AUROC của Lactate/Albumin máu",
xlab="1-Độ đặc hiệu", ylab="Độ nhạy",
col="#b8377e", ldw=4, print.auc=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE,
print.thres = "best", print.thres.best.method = "youden")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = datalar$tuvong, predictor = m2$fitted.values, plot = TRUE, main = "AUROC của Lactate/Albumin máu", xlab = "1-Độ đặc hiệu", ylab = "Độ nhạy", col = "#b8377e", ldw = 4, print.auc = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE, print.thres = "best", print.thres.best.method = "youden")
##
## Data: m2$fitted.values in 72 controls (datalar$tuvong 0) < 98 cases (datalar$tuvong 1).
## Area under the curve: 0.8105
par(pty = "m")
dataset <- data.frame(datalar$lar, datalar$tuvong, stringsAsFactors = TRUE)
attach(dataset)
dataset
## datalar.lar datalar.tuvong
## 1 0.98 1
## 2 2.20 1
## 3 0.31 0
## 4 4.36 1
## 5 3.53 1
## 6 1.68 1
## 7 0.95 0
## 8 6.37 1
## 9 3.01 1
## 10 2.69 1
## 11 2.86 1
## 12 3.89 1
## 13 2.79 0
## 14 3.47 1
## 15 2.86 1
## 16 1.70 0
## 17 4.55 1
## 18 5.46 1
## 19 2.41 0
## 20 2.62 1
## 21 0.81 0
## 22 1.28 1
## 23 2.08 0
## 24 5.00 1
## 25 3.79 1
## 26 2.36 0
## 27 4.29 1
## 28 0.53 0
## 29 3.23 1
## 30 1.00 0
## 31 1.05 0
## 32 5.05 1
## 33 4.74 1
## 34 2.60 1
## 35 0.59 0
## 36 3.45 1
## 37 5.07 1
## 38 5.40 1
## 39 0.70 0
## 40 1.45 0
## 41 2.70 1
## 42 0.65 0
## 43 0.77 0
## 44 0.72 0
## 45 0.85 0
## 46 1.54 0
## 47 2.25 1
## 48 3.80 1
## 49 0.62 0
## 50 2.20 0
## 51 0.62 0
## 52 0.78 0
## 53 2.46 0
## 54 1.00 0
## 55 2.86 1
## 56 2.88 0
## 57 1.44 1
## 58 0.56 0
## 59 2.60 0
## 60 0.93 0
## 61 0.90 0
## 62 0.55 1
## 63 2.21 0
## 64 0.87 0
## 65 2.62 0
## 66 1.36 1
## 67 0.50 0
## 68 1.29 1
## 69 2.24 0
## 70 3.88 1
## 71 3.04 1
## 72 1.94 0
## 73 0.94 0
## 74 4.98 1
## 75 0.53 1
## 76 4.01 1
## 77 4.14 1
## 78 3.41 1
## 79 1.44 0
## 80 2.48 0
## 81 1.74 1
## 82 1.52 1
## 83 2.48 0
## 84 0.97 0
## 85 2.99 0
## 86 5.67 1
## 87 1.54 1
## 88 2.75 1
## 89 1.08 1
## 90 2.01 0
## 91 0.82 0
## 92 2.28 1
## 93 1.76 1
## 94 4.62 1
## 95 1.84 1
## 96 7.41 1
## 97 1.76 0
## 98 2.86 1
## 99 3.52 1
## 100 2.27 1
## 101 1.11 1
## 102 2.43 1
## 103 1.75 1
## 104 6.65 1
## 105 4.06 1
## 106 1.30 1
## 107 2.95 1
## 108 1.20 1
## 109 1.00 1
## 110 1.93 0
## 111 1.84 1
## 112 2.17 1
## 113 2.47 1
## 114 2.86 1
## 115 7.07 1
## 116 3.19 1
## 117 2.06 1
## 118 1.75 1
## 119 1.47 1
## 120 0.43 0
## 121 1.74 1
## 122 1.47 1
## 123 0.62 1
## 124 3.69 1
## 125 1.86 1
## 126 4.65 1
## 127 9.20 1
## 128 1.20 1
## 129 0.86 1
## 130 3.96 1
## 131 3.45 0
## 132 1.13 0
## 133 0.29 0
## 134 0.96 1
## 135 1.46 1
## 136 3.36 1
## 137 0.64 0
## 138 1.05 0
## 139 1.11 0
## 140 1.09 0
## 141 0.60 0
## 142 3.15 1
## 143 7.58 1
## 144 3.15 0
## 145 2.81 0
## 146 0.32 0
## 147 4.09 1
## 148 13.29 1
## 149 4.02 1
## 150 6.15 1
## 151 1.52 1
## 152 5.33 1
## 153 0.88 0
## 154 3.82 0
## 155 6.10 1
## 156 0.81 0
## 157 5.17 1
## 158 1.85 0
## 159 3.56 0
## 160 1.00 0
## 161 1.13 0
## 162 0.96 0
## 163 2.32 1
## 164 1.11 0
## 165 2.46 0
## 166 2.18 0
## 167 0.99 0
## 168 0.40 0
## 169 2.60 1
## 170 2.01 0
dataset$datalar.tuvong <- factor(dataset$datalar.tuvong,labels = c("Song", "Tuvong"))
library(OptimalCutpoints)
result <- optimal.cutpoints(X = "datalar.lar", status = "datalar.tuvong", tag.healthy ="Song",
methods = "Youden", data = dataset)
summary(result)
##
## Call:
## optimal.cutpoints.default(X = "datalar.lar", status = "datalar.tuvong",
## tag.healthy = "Song", methods = "Youden", data = dataset)
##
## Area under the ROC curve (AUC): 0.811 (0.747, 0.874)
##
## CRITERION: Youden
## Number of optimal cutoffs: 1
##
## Estimate
## cutoff 1.2000000
## Se 0.9081633
## Sp 0.5694444
## PPV 0.7416667
## NPV 0.8200000
## DLR.Positive 2.1092824
## DLR.Negative 0.1612743
## FP 31.0000000
## FN 9.0000000
## Optimal criterion 0.4776077
plot(result)
## Press return for next page....
library(dominanceanalysis)
temp1 <- glm(tuvong ~ age + nkp + lactate
+ lar + sofa + apacheii, family="binomial", data=datalar)
dom <- dominanceAnalysis(temp1)
plot(dom, which.graph ="general",fit.function ="r2.m")
df <- data.frame(datalar$age, datalar$lactate, datalar$lar, datalar$sofa, datalar$apacheii,
datalar$nkp, datalar$tuvong)
head(df)
## datalar.age datalar.lactate datalar.lar datalar.sofa datalar.apacheii
## 1 87 2.1 0.98 10 32
## 2 70 4.5 2.20 4 18
## 3 69 0.9 0.31 9 22
## 4 71 10.9 4.36 2 15
## 5 66 6.1 3.53 6 23
## 6 78 4.7 1.68 6 18
## datalar.nkp datalar.tuvong
## 1 1 1
## 2 1 1
## 3 0 0
## 4 0 1
## 5 0 1
## 6 1 1
attach(df)
## The following objects are masked from dataset:
##
## datalar.lar, datalar.tuvong
names(df)
## [1] "datalar.age" "datalar.lactate" "datalar.lar" "datalar.sofa"
## [5] "datalar.apacheii" "datalar.nkp" "datalar.tuvong"
xvars <- df[,1:6]
y <- datalar$tuvong
library(survival)
library(leaps)
library(robustbase)
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
library(inline)
library(rrcov)
## Warning: package 'rrcov' was built under R version 4.4.1
## Scalable Robust Estimators with High Breakdown Point (version 1.7-6)
library(BMA)
bma.search1 <- bic.glm(xvars, y, strict=F, OR=20, glm.family="binomial")
summary(bma.search1)
##
## Call:
## bic.glm.data.frame(x = xvars, y = y, glm.family = "binomial", strict = F, OR = 20)
##
##
## 9 models were selected
## Best 5 models (cumulative posterior probability = 0.8532 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -4.3408487 1.35289 -4.75641 -2.70978
## datalar.age 68.2 0.0209587 0.01758 0.02999 .
## datalar.lactate 2.8 -0.0001623 0.02941 . .
## datalar.lar 100.0 1.0544229 0.20271 1.08211 1.06073
## datalar.sofa 30.1 0.0383765 0.06910 . .
## datalar.apacheii 7.9 0.0029351 0.01401 . .
## datalar.nkp 90.5 1.1228499 0.54006 1.22839 1.39740
##
## nVar 3 2
## BIC -695.89463 -694.60728
## post prob 0.362 0.190
## model 3 model 4 model 5
## Intercept -5.46903 -3.39075 -5.40209
## datalar.age 0.03010 . 0.03553
## datalar.lactate . . .
## datalar.lar 1.03355 1.01078 0.98710
## datalar.sofa 0.12271 0.12103 0.15165
## datalar.apacheii . . .
## datalar.nkp 1.09660 1.25729 .
##
## nVar 4 3 3
## BIC -694.23342 -693.06694 -692.12947
## post prob 0.158 0.088 0.055
imageplot.bma(bma.search1)
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
model2 <- lrm(tuvong ~ lar + nkp, x=TRUE,y=TRUE, data=datalar)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
model1
## Logistic Regression Model
##
## lrm(formula = tuvong ~ age + lar + nkp, data = datalar, x = TRUE,
## y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 170 LR chi2 75.03 R2 0.480 C 0.855
## 0 72 d.f. 3 R2(3,170)0.345 Dxy 0.709
## 1 98 Pr(> chi2) <0.0001 R2(3,124.5)0.439 gamma 0.709
## max |deriv| 8e-11 Brier 0.153 tau-a 0.348
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -4.7564 1.0158 -4.68 <0.0001
## age 0.0300 0.0122 2.47 0.0136
## lar 1.0821 0.1902 5.69 <0.0001
## nkp 1.2284 0.4088 3.00 0.0027
model2
## Logistic Regression Model
##
## lrm(formula = tuvong ~ lar + nkp, data = datalar, x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 170 LR chi2 68.61 R2 0.446 C 0.839
## 0 72 d.f. 2 R2(2,170)0.324 Dxy 0.678
## 1 98 Pr(> chi2) <0.0001 R2(2,124.5)0.414 gamma 0.679
## max |deriv| 5e-06 Brier 0.161 tau-a 0.333
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -2.7098 0.5053 -5.36 <0.0001
## lar 1.0607 0.1842 5.76 <0.0001
## nkp 1.3974 0.3981 3.51 0.0004
model3
## Logistic Regression Model
##
## lrm(formula = tuvong ~ age + lar + nkp + sofa, data = datalar,
## x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 170 LR chi2 78.50 R2 0.497 C 0.864
## 0 72 d.f. 4 R2(4,170)0.355 Dxy 0.728
## 1 98 Pr(> chi2) <0.0001 R2(4,124.5)0.450 gamma 0.728
## max |deriv| 4e-10 Brier 0.148 tau-a 0.357
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -5.4690 1.1221 -4.87 <0.0001
## age 0.0301 0.0123 2.45 0.0142
## lar 1.0336 0.1940 5.33 <0.0001
## nkp 1.0966 0.4154 2.64 0.0083
## sofa 0.1227 0.0671 1.83 0.0673
library(rms)
model1 <- lrm(tuvong ~ age + lar + nkp, x=TRUE,y=TRUE, data=datalar)
pred.logit <- predict(model1)
phat <- 1/(1+ exp(-pred.logit))
val.prob(phat, datalar$tuvong, m=10, cex=.5)
## Dxy C (ROC) R2 D D:Chi-sq
## 7.090420e-01 8.545210e-01 4.795776e-01 4.354707e-01 7.503001e+01
## D:p U U:Chi-sq U:p Q
## NA -1.176471e-02 -5.684342e-14 1.000000e+00 4.472354e-01
## Brier Intercept Slope Emax E90
## 1.531266e-01 2.069542e-14 1.000000e+00 5.176247e-02 4.402857e-02
## Eavg S:z S:p
## 2.718828e-02 1.262687e-01 8.995192e-01
library(rms)
model3 <- lrm(tuvong ~ age + lar + nkp + sofa, x=TRUE,y=TRUE, data=datalar)
pred.logit1 <- predict(model3)
phat1 <- 1/(1+ exp(-pred.logit1))
val.prob(phat1, datalar$tuvong, m=10, cex=.5)
## Dxy C (ROC) R2 D D:Chi-sq
## 7.277494e-01 8.638747e-01 4.970655e-01 4.559094e-01 7.850460e+01
## D:p U U:Chi-sq U:p Q
## NA -1.176471e-02 -2.842171e-14 1.000000e+00 4.676741e-01
## Brier Intercept Slope Emax E90
## 1.484784e-01 9.276900e-14 1.000000e+00 1.007307e-01 8.852993e-02
## Eavg S:z S:p
## 4.910426e-02 5.790848e-02 9.538215e-01
set.seed(123)
data <- data.frame(
age = datalar$age,
lar = datalar$lar,
nkp = datalar$nkp,
tuvong = datalar$tuvong
)
options(datadist = "ddist")
ddist <- datadist(data)
m1 <- lrm(tuvong ~ lar + age + nkp, data = data)
nom <- nomogram(m1, fun = plogis, fun.at = c(0.1, 0.25, 0.5, 0.75, 0.9),
lp = FALSE, funlabel = "Probability of Death")
plot(nom, xfrac = 0.2, label.every = list(age = 1))
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
str(datalar)
## tibble [170 × 49] (S3: tbl_df/tbl/data.frame)
## $ stt : num [1:170] 1 2 3 4 5 6 7 8 9 10 ...
## $ sex : num [1:170] 0 1 1 1 0 1 1 0 0 1 ...
## $ age : num [1:170] 87 70 69 71 66 78 73 87 63 67 ...
## $ ngay : num [1:170] 18 6 20 12 13 4 8 1 1 11 ...
## $ tha : num [1:170] 1 0 1 1 1 1 0 0 0 1 ...
## $ dtd : num [1:170] 0 0 1 1 1 0 0 0 0 0 ...
## $ btm : num [1:170] 0 0 0 1 0 1 0 0 0 1 ...
## $ dqn : num [1:170] 0 0 0 0 0 0 0 1 1 0 ...
## $ bpm : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
## $ bthm : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
## $ bgm : num [1:170] 0 0 0 0 0 0 0 0 0 0 ...
## $ ut : num [1:170] 0 1 0 0 0 0 0 0 1 0 ...
## $ nkp : num [1:170] 1 1 0 0 0 1 1 1 1 1 ...
## $ nktn : num [1:170] 0 0 0 0 1 0 0 0 1 0 ...
## $ nkth : num [1:170] 0 0 0 1 0 1 0 0 0 0 ...
## $ nkdmm : num [1:170] 0 0 0 0 0 0 0 0 1 0 ...
## $ khac : num [1:170] 0 0 1 0 0 0 0 0 0 0 ...
## $ gcs : num [1:170] 8 15 15 14 8 12 15 6 14 15 ...
## $ hatb : num [1:170] 60 102 123 163 60 63 87 83 88 88 ...
## $ mach : num [1:170] 103 90 62 98 112 130 66 160 135 82 ...
## $ nhietdo : num [1:170] 36.8 38.5 37 36 36.5 36.4 38.2 39 37.6 38.1 ...
## $ nhiptho : num [1:170] 30 35 20 25 33 32 19 29 30 28 ...
## $ sp02 : num [1:170] 94 90 82 97 85 85 95 78 85 98 ...
## $ caymau : num [1:170] 0 0 0 1 1 0 0 1 0 0 ...
## $ thongkhich: num [1:170] 1 1 0 1 1 1 0 1 0 1 ...
## $ locmau : num [1:170] 1 0 1 1 1 1 0 1 0 1 ...
## $ albumin : num [1:170] 2.15 2.05 2.94 2.5 1.73 2.8 2.11 2.01 2.06 3.2 ...
## $ lar : num [1:170] 0.98 2.2 0.31 4.36 3.53 1.68 0.95 6.37 3.01 2.69 ...
## $ lactate : num [1:170] 2.1 4.5 0.9 10.9 6.1 4.7 2 12.8 6.2 8.6 ...
## $ pct : num [1:170] 3.33 2.03 5.02 100 1.56 ...
## $ tuvong : num [1:170] 1 1 0 1 1 1 0 1 1 1 ...
## $ apacheii : num [1:170] 32 18 22 15 23 18 19 30 15 14 ...
## $ sofa : num [1:170] 10 4 9 2 6 6 2 12 4 7 ...
## $ bc : num [1:170] 25 14.2 26.4 15.7 16.2 ...
## $ hct : num [1:170] 40.1 42.3 37.1 39.1 28.3 39.2 28.9 45.2 23.3 36.7 ...
## $ tc : num [1:170] 202 399 162 119 390 293 339 238 135 76 ...
## $ bilitp : num [1:170] 12.8 20.1 564 11.1 9.9 10.8 15.8 24.3 14.2 39.3 ...
## $ na : num [1:170] 161 121 123 136 119 138 129 152 123 137 ...
## $ k : num [1:170] 4.4 4.9 5.6 3.3 3.8 5 3.9 3.3 3.6 4.5 ...
## $ ast : num [1:170] 70 56 1270 33 26 30 35 27 47 238 ...
## $ alt : num [1:170] 75 44 2429 19 26 ...
## $ ure : num [1:170] 44.7 7.7 38.5 7.3 7.6 9.6 7.9 14.8 11.2 14.6 ...
## $ crea : num [1:170] 335 123 632 107 102 144 87 114 74 314 ...
## $ egfr : num [1:170] 11.98 53.76 8.16 46.6 50.14 ...
## $ ph : num [1:170] 7.45 7.31 7.38 7.3 7.47 ...
## $ pc02 : num [1:170] 32 21 31.4 32.7 27.6 32.8 64.6 19.9 46.2 18.5 ...
## $ p02 : num [1:170] 89.4 134.1 73.9 84.4 81.8 ...
## $ hc03 : num [1:170] 21.9 10.4 18.2 15.5 19.6 20.1 21.9 13.3 21.2 10.3 ...
## $ alb_group : chr [1:170] "low" "low" "normal" "normal" ...
model_rf <- randomForest(tuvong ~ age + lar + nkp, data = datalar, importance = TRUE, ntree = 500)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(model_rf)
##
## Call:
## randomForest(formula = tuvong ~ age + lar + nkp, data = datalar, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1729648
## % Var explained: 29.16
predictions <- predict(model_rf, datalar)
mse <- mean((predictions - datalar$tuvong)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 0.1015607
rsq <- 1 - sum((predictions - datalar$tuvong)^2) / sum((mean(datalar$tuvong) - datalar$tuvong)^2)
cat("R²:", rsq, "\n")
## R²: 0.5840273
# Tính RMSE
rmse <- sqrt(mean((predictions - datalar$tuvong)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.3186858
# Tính MAE
mae <- mean(abs(predictions - datalar$tuvong))
cat("MAE:", mae, "\n")
## MAE: 0.2770989
# Tính R² điều chỉnh
n <- nrow(datalar)
p <- length(coef(model_rf)) - 1 # Số biến trong mô hình (cộng cả intercept)
r2_adj <- 1 - (1 - rsq) * (n - 1) / (n - p - 1)
cat("Adjusted R²:", r2_adj, "\n")
## Adjusted R²: 0.5864742
# Vẽ biểu đồ sai số
residuals <- predictions - datalar$tuvong
plot(residuals, main = "Residuals Plot", xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")
# Ví dụ với cross-validation (n-folds)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:epiDisplay':
##
## alpha
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
train_control <- trainControl(method = "cv", number = 10) # 10-fold CV
cv_model <- train(tuvong ~ age + lar + nkp, data = datalar, method = "rf", trControl = train_control)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): The response has
## five or fewer unique values. Are you sure you want to do regression?
print(cv_model)
## Random Forest
##
## 170 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 153, 153, 153, 153, 153, 153, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.4269553 0.3043584 0.3280763
## 3 0.4314351 0.2970112 0.3268998
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Tính MAPE
mape <- mean(abs((predictions - datalar$tuvong) / datalar$tuvong)) * 100
cat("MAPE:", mape, "%\n")
## MAPE: Inf %
# Loại bỏ các giá trị thực tế bằng 0 trước khi tính MAPE
non_zero_data <- datalar[datalar$tuvong != 0, ]
predictions_non_zero <- predictions[datalar$tuvong != 0]
# Tính lại MAPE
mape <- mean(abs((predictions_non_zero - non_zero_data$tuvong) / non_zero_data$tuvong)) * 100
cat("MAPE:", mape, "%\n")
## MAPE: 24.2408 %
Với R² = 57.49% và MSE = 0.1038, mô hình hiện tại đang hoạt động tốt. R² = 0.5749125: Hệ số xác định (R²) cho biết mô hình có thể giải thích khoảng 57.49% biến động của biến kết quả (tuvong). Đây là một mức độ giải thích khá tốt, đặc biệt đối với các mô hình phức tạp như hồi quy rừng ngẫu nhiên. MSE = 0.1037861: MSE cho thấy sai số bình phương trung bình của mô hình là 0.1038. Một giá trị MSE thấp cho thấy mô hình của bạn có độ chính xác cao trong việc dự đoán các giá trị thực tế.