Xử lý giá trị trống - BV Thống Nhất (29-30/12/2025)
Libraries
library(VIM)
library(mice)
Dữ liệu sleep
# Load the mammal `sleep` data set from VIM explicitly. Base R's `datasets`
# package ships a different data set with the same name, so an unqualified
# data(sleep) depends on the order in which packages were attached.
data(sleep, package = "VIM")
# Number of rows (observations) and columns (variables).
dim(sleep)
## [1] 62 10
# Per-variable summary statistics; the "NA's" row in the output gives the
# number of missing values in each variable that has any.
summary(sleep)
## BodyWgt BrainWgt NonD Dream
## Min. : 0.005 Min. : 0.14 Min. : 2.100 Min. :0.000
## 1st Qu.: 0.600 1st Qu.: 4.25 1st Qu.: 6.250 1st Qu.:0.900
## Median : 3.342 Median : 17.25 Median : 8.350 Median :1.800
## Mean : 198.790 Mean : 283.13 Mean : 8.673 Mean :1.972
## 3rd Qu.: 48.202 3rd Qu.: 166.00 3rd Qu.:11.000 3rd Qu.:2.550
## Max. :6654.000 Max. :5712.00 Max. :17.900 Max. :6.600
## NA's :14 NA's :12
## Sleep Span Gest Pred
## Min. : 2.60 Min. : 2.000 Min. : 12.00 Min. :1.000
## 1st Qu.: 8.05 1st Qu.: 6.625 1st Qu.: 35.75 1st Qu.:2.000
## Median :10.45 Median : 15.100 Median : 79.00 Median :3.000
## Mean :10.53 Mean : 19.878 Mean :142.35 Mean :2.871
## 3rd Qu.:13.20 3rd Qu.: 27.750 3rd Qu.:207.50 3rd Qu.:4.000
## Max. :19.90 Max. :100.000 Max. :645.00 Max. :5.000
## NA's :4 NA's :4 NA's :4
## Exp Danger
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :2.000
## Mean :2.419 Mean :2.613
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
##
Đánh giá dữ liệu trống
Tóm tắt
# Tabulate the missing-data patterns. In the printed table each row is one
# pattern (1 = observed, 0 = missing), the left margin is the number of rows
# showing that pattern, the right margin is the number of missing variables in
# the pattern, and the bottom row is the number of missing values per variable.
md.pattern(sleep)

## BodyWgt BrainWgt Pred Exp Danger Sleep Span Gest Dream NonD
## 42 1 1 1 1 1 1 1 1 1 1 0
## 9 1 1 1 1 1 1 1 1 0 0 2
## 3 1 1 1 1 1 1 1 0 1 1 1
## 2 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 0 1 0 0 3
## 1 1 1 1 1 1 1 0 0 1 1 2
## 2 1 1 1 1 1 0 1 1 1 0 2
## 2 1 1 1 1 1 0 1 1 0 0 3
## 0 0 0 0 0 4 4 4 12 14 38
Biểu đồ:
# Aggregation plot of the missing values per variable and per combination of
# variables; `prop = TRUE` requests proportions instead of counts and
# `numbers = TRUE` prints them next to the combinations.
aggr(sleep, prop = TRUE, numbers = TRUE)

# Matrix plot of all observations with the rows ordered by `Sleep`.
# NOTE(review): `interactive = TRUE` presumably has no effect when the document
# is knitted (no interactive graphics device) -- confirm.
matrixplot(sleep, interactive = TRUE, sortby = "Sleep")

Thực hiện imputation
# Multiple imputation by chained equations using predictive mean matching.
# `seed` makes the imputations reproducible, `maxit = 5` runs five iterations
# of the chained-equations sampler, and `printFlag = FALSE` silences the
# iteration log (spelled out because `F` is an ordinary, reassignable variable).
sleep.i <- mice(
  sleep,
  seed = 1234,
  maxit = 5,
  defaultMethod = c("pmm"),
  printFlag = FALSE
)
# Imputed values for `Dream`: one row per originally missing observation,
# one column per imputed data set.
sleep.i$imp$Dream
## 1 2 3 4 5
## 1 0.5 1.4 0.5 0.5 0.6
## 3 1.4 1.9 1.8 2.0 1.3
## 4 2.7 3.4 3.1 4.1 2.7
## 14 0.3 1.0 0.5 0.0 0.0
## 24 3.6 1.0 1.4 1.2 1.4
## 26 2.7 0.5 3.9 3.1 1.2
## 30 2.2 0.6 2.7 2.4 3.4
## 31 0.9 0.5 0.9 1.4 2.4
## 47 1.3 3.1 1.8 2.8 3.9
## 53 1.0 0.5 0.6 0.5 0.5
## 55 2.6 3.4 2.4 2.0 0.5
## 62 2.2 3.6 2.6 3.4 2.8
Xem 1 tập dữ liệu được imputed
# Extract the third imputed data set as an ordinary, fully observed data frame.
sleep.3 <- complete(sleep.i, action = 3)
# First rows of the original data (still containing NA) for comparison.
head(sleep)
## BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
## 1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3
## 2 1.000 6.6 6.3 2.0 8.3 4.5 42 3 1 3
## 3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1
## 4 0.920 5.7 NA NA 16.5 NA 25 5 2 3
## 5 2547.000 4603.0 2.1 1.8 3.9 69.0 624 3 5 4
## 6 10.550 179.5 9.1 0.7 9.8 27.0 180 4 4 4
# Same rows after imputation: the NA cells are now filled in.
head(sleep.3)
## BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
## 1 6654.000 5712.0 3.2 0.5 3.3 38.6 645 3 5 3
## 2 1.000 6.6 6.3 2.0 8.3 4.5 42 3 1 3
## 3 3.385 44.5 11.0 1.8 12.5 14.0 60 1 1 1
## 4 0.920 5.7 13.2 3.1 16.5 2.0 25 5 2 3
## 5 2547.000 4603.0 2.1 1.8 3.9 69.0 624 3 5 4
## 6 10.550 179.5 9.1 0.7 9.8 27.0 180 4 4 4
Thực hiện phân tích
# Fit the same linear model on every imputed data set. The argument is named
# `expr`; the previous `exp =` only worked through partial argument matching.
m.mi <- with(data = sleep.i, expr = lm(Dream ~ Span + Gest))
# Pool the per-imputation estimates into one set of coefficients and
# standard errors, then print the pooled coefficient table.
summary(pool(m.mi))
## term estimate std.error statistic df p.value
## 1 (Intercept) 2.598553331 0.247119369 10.515377 51.61960 1.949165e-14
## 2 Span -0.005256987 0.011726809 -0.448288 53.36003 6.557604e-01
## 3 Gest -0.004050236 0.001495123 -2.708965 48.20381 9.316284e-03
So sánh các kết quả
# Complete-case analysis for comparison: lm() on the original data drops every
# row with a missing value in Dream, Span or Gest (see the "observations
# deleted due to missingness" line in the output).
m.complete <- lm(Dream ~ Span + Gest, data = sleep)
summary(m.complete)
##
## Call:
## lm(formula = Dream ~ Span + Gest, data = sleep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3135 -0.8576 -0.2179 0.4079 4.1844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4787482 0.2906617 8.528 1.26e-10 ***
## Span -0.0008873 0.0123100 -0.072 0.9429
## Gest -0.0043187 0.0017557 -2.460 0.0182 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.271 on 41 degrees of freedom
## (18 observations deleted due to missingness)
## Multiple R-squared: 0.1953, Adjusted R-squared: 0.156
## F-statistic: 4.975 on 2 and 41 DF, p-value: 0.01163
# Single-imputation analysis for comparison: the model is fitted on one imputed
# data set only, so its standard errors treat the imputed values as if they
# had been observed and ignore the between-imputation variability.
m.single <- lm(Dream ~ Span + Gest, data = sleep.3)
summary(m.single)
##
## Call:
## lm(formula = Dream ~ Span + Gest, data = sleep.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3066 -0.7156 -0.1811 0.3891 4.0520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.619976 0.233530 11.219 2.9e-16 ***
## Span -0.003851 0.011302 -0.341 0.73452
## Gest -0.004316 0.001416 -3.048 0.00345 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.225 on 59 degrees of freedom
## Multiple R-squared: 0.2357, Adjusted R-squared: 0.2098
## F-statistic: 9.098 on 2 and 59 DF, p-value: 0.0003599