Xử lý giá trị trống - BV Thống Nhất (29-30/12/2025)

Libraries

library(VIM)
library(mice)

Dữ liệu sleep

# Load the mammal sleep data used throughout this analysis. The package is
# named explicitly because `sleep` is also the name of an unrelated dataset in
# base R's `datasets` package; without it, which one is loaded depends on the
# order of the search path.
data(sleep, package = "VIM")
dim(sleep)
## [1] 62 10
summary(sleep)
##     BodyWgt            BrainWgt            NonD            Dream      
##  Min.   :   0.005   Min.   :   0.14   Min.   : 2.100   Min.   :0.000  
##  1st Qu.:   0.600   1st Qu.:   4.25   1st Qu.: 6.250   1st Qu.:0.900  
##  Median :   3.342   Median :  17.25   Median : 8.350   Median :1.800  
##  Mean   : 198.790   Mean   : 283.13   Mean   : 8.673   Mean   :1.972  
##  3rd Qu.:  48.202   3rd Qu.: 166.00   3rd Qu.:11.000   3rd Qu.:2.550  
##  Max.   :6654.000   Max.   :5712.00   Max.   :17.900   Max.   :6.600  
##                                       NA's   :14       NA's   :12     
##      Sleep            Span              Gest             Pred      
##  Min.   : 2.60   Min.   :  2.000   Min.   : 12.00   Min.   :1.000  
##  1st Qu.: 8.05   1st Qu.:  6.625   1st Qu.: 35.75   1st Qu.:2.000  
##  Median :10.45   Median : 15.100   Median : 79.00   Median :3.000  
##  Mean   :10.53   Mean   : 19.878   Mean   :142.35   Mean   :2.871  
##  3rd Qu.:13.20   3rd Qu.: 27.750   3rd Qu.:207.50   3rd Qu.:4.000  
##  Max.   :19.90   Max.   :100.000   Max.   :645.00   Max.   :5.000  
##  NA's   :4       NA's   :4         NA's   :4                       
##       Exp            Danger     
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.000   Median :2.000  
##  Mean   :2.419   Mean   :2.613  
##  3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000  
## 

Đánh giá dữ liệu trống

Tóm tắt

# Tabulate the missing-data patterns: one row per pattern, with a 0 marking a
# missing variable and a 1 marking an observed one.
md.pattern(x = sleep)

##    BodyWgt BrainWgt Pred Exp Danger Sleep Span Gest Dream NonD   
## 42       1        1    1   1      1     1    1    1     1    1  0
## 9        1        1    1   1      1     1    1    1     0    0  2
## 3        1        1    1   1      1     1    1    0     1    1  1
## 2        1        1    1   1      1     1    0    1     1    1  1
## 1        1        1    1   1      1     1    0    1     0    0  3
## 1        1        1    1   1      1     1    0    0     1    1  2
## 2        1        1    1   1      1     0    1    1     1    0  2
## 2        1        1    1   1      1     0    1    1     0    0  3
##          0        0    0   0      0     4    4    4    12   14 38

Biểu đồ:

# Aggregation plot of missingness, shown as proportions with the numbers
# printed on the plot.
aggr(x = sleep, numbers = TRUE, prop = TRUE)

# Matrix plot of the whole dataset, with observations sorted by the Sleep
# column.
matrixplot(x = sleep, sortby = "Sleep", interactive = TRUE)

Thực hiện imputation

# Multiple imputation with mice: predictive mean matching ("pmm") as the
# default method, 5 iterations, and a fixed seed so the imputations are
# reproducible. The number of imputed datasets is not set here, so the mice
# default applies.
sleep.i <- mice(
  sleep,
  seed = 1234,
  maxit = 5,
  defaultMethod = c("pmm"),
  printFlag = FALSE  # spelled out: `F` is an ordinary variable that can be reassigned
)

# Show the imputed Dream values: one row per observation that was missing
# Dream, one column per imputed dataset.
sleep.i[["imp"]][["Dream"]]
##      1   2   3   4   5
## 1  0.5 1.4 0.5 0.5 0.6
## 3  1.4 1.9 1.8 2.0 1.3
## 4  2.7 3.4 3.1 4.1 2.7
## 14 0.3 1.0 0.5 0.0 0.0
## 24 3.6 1.0 1.4 1.2 1.4
## 26 2.7 0.5 3.9 3.1 1.2
## 30 2.2 0.6 2.7 2.4 3.4
## 31 0.9 0.5 0.9 1.4 2.4
## 47 1.3 3.1 1.8 2.8 3.9
## 53 1.0 0.5 0.6 0.5 0.5
## 55 2.6 3.4 2.4 2.0 0.5
## 62 2.2 3.6 2.6 3.4 2.8

Xem 1 tập dữ liệu được imputed

# Extract the third completed dataset from the imputation object, then show
# the first rows of the original data (still containing NA) for comparison.
sleep.3 <- complete(sleep.i, action = 3)
head(sleep, n = 6L)
##    BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
## 1 6654.000   5712.0   NA    NA   3.3 38.6  645    3   5      3
## 2    1.000      6.6  6.3   2.0   8.3  4.5   42    3   1      3
## 3    3.385     44.5   NA    NA  12.5 14.0   60    1   1      1
## 4    0.920      5.7   NA    NA  16.5   NA   25    5   2      3
## 5 2547.000   4603.0  2.1   1.8   3.9 69.0  624    3   5      4
## 6   10.550    179.5  9.1   0.7   9.8 27.0  180    4   4      4
# First rows of the completed dataset: the NA cells are now filled in.
head(sleep.3, n = 6L)
##    BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
## 1 6654.000   5712.0  3.2   0.5   3.3 38.6  645    3   5      3
## 2    1.000      6.6  6.3   2.0   8.3  4.5   42    3   1      3
## 3    3.385     44.5 11.0   1.8  12.5 14.0   60    1   1      1
## 4    0.920      5.7 13.2   3.1  16.5  2.0   25    5   2      3
## 5 2547.000   4603.0  2.1   1.8   3.9 69.0  624    3   5      4
## 6   10.550    179.5  9.1   0.7   9.8 27.0  180    4   4      4

Thực hiện phân tích

# Fit the same linear model on every imputed dataset, then pool the
# per-dataset fits into one table of estimates.
# The argument is written out as `expr`: the previous spelling `exp` only
# worked through partial argument matching.
m.mi <- with(data = sleep.i, expr = lm(Dream ~ Span + Gest))
summary(pool(m.mi))
##          term     estimate   std.error statistic       df      p.value
## 1 (Intercept)  2.598553331 0.247119369 10.515377 51.61960 1.949165e-14
## 2        Span -0.005256987 0.011726809 -0.448288 53.36003 6.557604e-01
## 3        Gest -0.004050236 0.001495123 -2.708965 48.20381 9.316284e-03

So sánh các kết quả

# Reference fit on the original data. Rows with missing values in the model
# variables are dropped by lm (see the "observations deleted due to
# missingness" line in the summary).
m.complete <- lm(formula = Dream ~ Span + Gest, data = sleep)
summary(m.complete)
## 
## Call:
## lm(formula = Dream ~ Span + Gest, data = sleep)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3135 -0.8576 -0.2179  0.4079  4.1844 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.4787482  0.2906617   8.528 1.26e-10 ***
## Span        -0.0008873  0.0123100  -0.072   0.9429    
## Gest        -0.0043187  0.0017557  -2.460   0.0182 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.271 on 41 degrees of freedom
##   (18 observations deleted due to missingness)
## Multiple R-squared:  0.1953, Adjusted R-squared:  0.156 
## F-statistic: 4.975 on 2 and 41 DF,  p-value: 0.01163
# Fit on a single completed dataset (the third imputation), for comparison
# with the pooled and original-data fits.
m.single <- lm(formula = Dream ~ Span + Gest, data = sleep.3)
summary(m.single)
## 
## Call:
## lm(formula = Dream ~ Span + Gest, data = sleep.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3066 -0.7156 -0.1811  0.3891  4.0520 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.619976   0.233530  11.219  2.9e-16 ***
## Span        -0.003851   0.011302  -0.341  0.73452    
## Gest        -0.004316   0.001416  -3.048  0.00345 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.225 on 59 degrees of freedom
## Multiple R-squared:  0.2357, Adjusted R-squared:  0.2098 
## F-statistic: 9.098 on 2 and 59 DF,  p-value: 0.0003599