Xử lý giá trị trống theo phương pháp hiện đại: IMPUTATION * Multiple imputation (MI), giả định MAR (Missing At Random – giá trị trống một cách ngẫu nhiên) * Thực hiện imputation 5 - 20 lần * Dùng mô hình, và tham số của mô hình được ước tính 5 - 20 lần (mỗi lần trên một bộ dữ liệu đã impute) * Tính trung bình các tham số * Xem xét đến phương sai, phản ánh tính bất định của quá trình impute.

Kỹ thuật Multiple Imputation * Giả định MAR (Missing At Random) * Tạo ra nhiều dữ liệu mới từ dữ liệu gốc (ngẫu nhiên hóa) * Mỗi dữ liệu hay dataset sẽ khác nhau vì quá trình chọn ngẫu nhiên * Ước tính các tham số của mô hình cho từng dataset * Tính trung bình tham số

# Load the libraries used to handle missing data.
# VIM is resolved from the default library paths: a hard-coded,
# R-version-specific lib.loc breaks on any other machine or R version.
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# mice is resolved from the default library paths (no version-specific lib.loc).
library(mice)
# List every file in the project data directory with full paths, then print
# the listing so the available inputs are visible in the report.
path <- dir("C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data", full.names = TRUE)
path
## [1] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/DuLieuXuLyDuLieuTrongXong.csv"       
## [2] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/GiaiDoan016.csv"                     
## [3] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/GiaiDoan612.csv"                     
## [4] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanNhom-TheoGiaiDoan.xlsx"          
## [5] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan-ChuyenDoc.csv"
## [6] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan.csv"          
## [7] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan.xlsx"
# Disabled alternative: read the input by its position in `path`.
# NhomTong = read.csv(path[4], header = TRUE)
# Read the analysis dataset from its explicit location; the first row of the
# CSV holds the column names.
NhomTong <- read.csv(
  file = "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan-ChuyenDoc.csv",
  header = TRUE
)
# Quick sanity checks of NhomTong (disabled)
# head(NhomTong)
# str(NhomTong)
# NhomTong
# Step 1: inspect the data and its missing-value pattern
md.pattern(x = NhomTong)
##     IDBenhNhan time MI.0 BI.0 RLEP.0 RnaDna16s.0 RnaDna18k.0    
## 202          1    1    1    1      1           1           1   0
##  22          1    1    0    0      0           0           0   5
##              0    0   22   22     22          22          22 110

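Qua kết quả trên ta thấy có 22/224 dòng (tỉ lệ 0,098, tức khoảng 9,8%) chứa giá trị trống; mỗi dòng này thiếu cả 5 biến MI.0, BI.0, RLEP.0, RnaDna16s.0 và RnaDna18k.0.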

# Draw the matrix plot of NhomTong with the interactive mode switched off.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
matrixplot(NhomTong, interactive = FALSE)

Bước 2: Impute giá trị trống

# Multiply impute NhomTong with the mice defaults. seed = 2410 fixes the
# random-number seed so the run is reproducible; printFlag = FALSE suppresses
# the iteration log (FALSE spelled out because `F` can be reassigned).
mNhomTong <- mice(NhomTong, seed = 2410, printFlag = FALSE)

Bước 3: kiểm tra giá trị imputed

# Imputed MI.0 values: one row per originally missing observation,
# one column per imputation.
mNhomTong$imp[["MI.0"]]
##      1  2  3  4  5
## 66  34  2 33  0  2
## 67  24 19 33  0 32
## 68  11  0 36  0  1
## 69  31  2  0  0  2
## 70   2  2  4  2 23
## 71  18  2  0 10  0
## 72   2  8 19  2  7
## 73  11  4  4 12  0
## 74   1 11  7 11 29
## 75   4  2  0  0  0
## 76   6  0  0 10  5
## 77   0  2 23  0  6
## 78  14 23  1  4  4
## 169  0  0  0  0  0
## 170  0  0  0  0  0
## 171  0  0  0  0  0
## 172  2  0  0  0  0
## 173  0  0  0  0  0
## 174  0  0  0  0  0
## 175  0  0  0  0  0
## 176  0  0  0  0  0
## 177  0  0  0  0  0
# Imputed BI.0 values: one row per originally missing observation,
# one column per imputation.
mNhomTong$imp[["BI.0"]]
##     1 2 3 4 5
## 66  6 3 5 4 4
## 67  6 4 5 3 3
## 68  6 3 5 5 4
## 69  4 4 4 4 4
## 70  4 5 4 4 5
## 71  6 5 5 5 4
## 72  3 5 3 4 3
## 73  4 5 3 5 4
## 74  3 5 4 3 5
## 75  5 4 3 5 3
## 76  6 4 3 5 6
## 77  3 5 5 4 5
## 78  5 5 4 4 5
## 169 5 3 3 5 4
## 170 4 4 3 4 3
## 171 2 4 5 4 1
## 172 4 4 2 3 4
## 173 3 4 4 3 1
## 174 5 1 3 2 1
## 175 3 3 5 3 2
## 176 3 3 5 2 2
## 177 3 1 3 5 2
# Imputed RLEP.0 values: one row per originally missing observation,
# one column per imputation.
mNhomTong$imp[["RLEP.0"]]
##           1         2       3        4      5
## 66  2420000      27.6 7540000   4520.0   9245
## 67   332000  168000.0  117000    135.0 332000
## 68   117000      87.0  117000   4040.0 211000
## 69  7540000   80000.0    2870    789.0   1900
## 70    54381  253000.0  326000   7650.0 332000
## 71   764302   99400.0     129   6970.0   3240
## 72    87600  117000.0  666600  44400.0   3870
## 73    35200  546301.0    6120  15700.0  45321
## 74    13250   46301.0   12700 503000.0 332000
## 75    15700   76543.0     567    153.0    186
## 76  4800000    3080.0    8170 253000.0  97200
## 77       54  523000.0  523000     54.0 764310
## 78   434000 2910000.0    3987  10000.0    153
## 169 4610000    9245.0    3240 101000.0     54
## 170 1200000 4610000.0  264000  70500.0    765
## 171     761   96800.0   54321   6970.0    242
## 172  656540   76500.0   24500  45321.0    226
## 173    3289   30600.0   35200     24.3   7240
## 174   67890     152.0    4780  13250.0    242
## 175    7650    1900.0  272000     45.1   7240
## 176     239     987.0 4800000    567.0   1580
## 177    3870    1580.0  264000   3080.0   7240
# Imputed RnaDna16s.0 values: one row per originally missing observation,
# one column per imputation.
mNhomTong$imp[["RnaDna16s.0"]]
##             1         2         3         4         5
## 66  0.7489405 1.0042201 0.7664103 0.7664103 1.0129067
## 67  1.0000000 0.4222044 2.3403119 0.9463608 0.7413186
## 68  1.2371603 0.6879172 1.2149860 0.7761954 0.7236891
## 69  1.0298951 0.7546582 0.6780385 1.0896362 0.9031621
## 70  1.1924612 1.0001313 1.0895081 0.7138568 0.7047716
## 71  0.7422513 0.8331555 1.0941767 1.1924612 0.8428145
## 72  1.0001313 1.0341532 0.5390218 1.1924612 0.7546582
## 73  0.6070761 0.8331555 0.7656988 0.7761954 0.8428145
## 74  0.9662293 1.2877231 0.9031621 1.1785694 1.1095624
## 75  0.7791702 1.0895081 0.8038947 0.6914069 1.1049442
## 76  0.4630914 1.0466421 0.8038947 0.8825880 0.8825880
## 77  0.7219633 0.9276746 1.2149860 0.9031621 1.0945120
## 78  0.7761954 0.8513429 0.7047716 1.0165241 1.3446611
## 169 0.5036371 0.5036371 0.0000000 0.6267415 0.4169918
## 170 1.4327026 0.0000000 0.0000000 0.4169918 0.5602715
## 171 0.0000000 0.3958349 0.3387598 0.5308539 0.0000000
## 172 0.6526269 0.5653620 0.3997562 0.0000000 0.6885706
## 173 0.0000000 0.0000000 0.4169918 0.5602715 0.0000000
## 174 0.0000000 0.2856312 0.5602715 0.0000000 0.0000000
## 175 0.0000000 0.5602715 0.0000000 0.2219920 1.0308191
## 176 0.3636024 0.5602715 0.0000000 0.0000000 0.0000000
## 177 0.4683268 0.0000000 0.0000000 0.6267415 0.0000000
# Imputed RnaDna18k.0 values: one row per originally missing observation,
# one column per imputation.
mNhomTong$imp[["RnaDna18k.0"]]
##             1         2         3          4         5
## 66  1.1464661 0.4470152 0.3864323 0.00000000 1.3225832
## 67  0.0000000 0.6960050 0.6043562 0.35105605 0.3822505
## 68  0.2905415 0.2513088 0.7762126 0.57764071 0.0000000
## 69  0.4244201 0.0000000 0.0000000 0.00000000 0.2471131
## 70  0.5776407 0.0000000 0.6960050 0.00000000 0.2149637
## 71  0.9584373 0.0000000 0.0000000 1.28235895 0.0000000
## 72  0.0000000 0.9065027 0.0000000 1.32258322 0.4838407
## 73  0.0000000 0.7285392 0.3489503 0.40156631 0.0000000
## 74  0.0000000 0.8720276 0.2317003 0.46583084 0.0000000
## 75  0.7698667 0.0000000 0.0000000 0.49082777 0.0000000
## 76  0.0000000 0.6635444 0.3334165 0.07817380 0.7698667
## 77  0.0000000 0.0000000 0.0000000 0.00000000 0.4848191
## 78  0.0000000 0.0000000 0.0000000 0.03108466 0.0000000
## 169 0.0000000 0.2513088 0.0000000 0.00000000 0.2591279
## 170 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 171 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 172 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 173 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 174 0.0000000 0.0000000 0.2591279 0.00000000 0.0000000
## 175 0.0000000 0.0000000 0.0000000 0.00000000 0.6563490
## 176 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 177 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000

Bước 4: Tạo ra dataset mới (imputed data)

# Materialise each of the five completed datasets (observed values plus the
# imputed values from imputation 1..5) as its own data frame.
mNhomTong1 <- complete(mNhomTong, action = 1)
mNhomTong2 <- complete(mNhomTong, action = 2)
mNhomTong3 <- complete(mNhomTong, action = 3)
mNhomTong4 <- complete(mNhomTong, action = 4)
mNhomTong5 <- complete(mNhomTong, action = 5)

Bước 5: Phân tích bằng mô hình

# Baseline on the original data: regress RnaDna16s.0 on time. Rows with a
# missing response are dropped by lm (see "observations deleted" below).
# Columns are named bare so they are resolved through `data`; writing
# NhomTong$... inside the formula bypasses `data` and breaks predict(newdata).
model0 <- lm(RnaDna16s.0 ~ time, data = NhomTong)
summary(model0)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = NhomTong)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8518 -0.2796 -0.0310  0.2213  1.4365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.903826   0.039049  23.146  < 2e-16 ***
## time        -0.052022   0.005907  -8.807 6.17e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3929 on 200 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.2794, Adjusted R-squared:  0.2758 
## F-statistic: 77.55 on 1 and 200 DF,  p-value: 6.174e-16
# Same regression on completed dataset 1. Columns are named bare so they are
# resolved through `data` rather than via mNhomTong1$... inside the formula.
model1 <- lm(RnaDna16s.0 ~ time, data = mNhomTong1)
summary(model1)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = mNhomTong1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85238 -0.29279 -0.03733  0.20754  1.43706 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.903257   0.036577  24.694   <2e-16 ***
## time        -0.050872   0.005438  -9.356   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3876 on 222 degrees of freedom
## Multiple R-squared:  0.2828, Adjusted R-squared:  0.2795 
## F-statistic: 87.53 on 1 and 222 DF,  p-value: < 2.2e-16
# Same regression on completed dataset 2. Columns are named bare so they are
# resolved through `data` rather than via mNhomTong2$... inside the formula.
model2 <- lm(RnaDna16s.0 ~ time, data = mNhomTong2)
summary(model2)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = mNhomTong2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85654 -0.28420 -0.02339  0.21944  1.43174 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.908576   0.035827  25.360   <2e-16 ***
## time        -0.052031   0.005326  -9.769   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3797 on 222 degrees of freedom
## Multiple R-squared:  0.3007, Adjusted R-squared:  0.2975 
## F-statistic: 95.44 on 1 and 222 DF,  p-value: < 2.2e-16
# Same regression on completed dataset 3. Columns are named bare so they are
# resolved through `data` rather than via mNhomTong3$... inside the formula.
model3 <- lm(RnaDna16s.0 ~ time, data = mNhomTong3)
summary(model3)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = mNhomTong3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86586 -0.26608 -0.05062  0.22482  1.47445 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.920382   0.036991  24.881   <2e-16 ***
## time        -0.054525   0.005499  -9.915   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.392 on 222 degrees of freedom
## Multiple R-squared:  0.3069, Adjusted R-squared:  0.3038 
## F-statistic: 98.31 on 1 and 222 DF,  p-value: < 2.2e-16
# Same regression on completed dataset 4. Columns are named bare so they are
# resolved through `data` rather than via mNhomTong4$... inside the formula.
model4 <- lm(RnaDna16s.0 ~ time, data = mNhomTong4)
summary(model4)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = mNhomTong4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85900 -0.28555 -0.03384  0.23771  1.42918 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.911131   0.035830  25.429   <2e-16 ***
## time        -0.052131   0.005326  -9.787   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3797 on 222 degrees of freedom
## Multiple R-squared:  0.3014, Adjusted R-squared:  0.2983 
## F-statistic: 95.79 on 1 and 222 DF,  p-value: < 2.2e-16
# Same regression on completed dataset 5. Columns are named bare so they are
# resolved through `data` rather than via mNhomTong5$... inside the formula.
model5 <- lm(RnaDna16s.0 ~ time, data = mNhomTong5)
summary(model5)
## 
## Call:
## lm(formula = RnaDna16s.0 ~ time, data = mNhomTong5)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85880 -0.28136 -0.03291  0.23547  1.42902 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.911289   0.036168  25.196   <2e-16 ***
## time        -0.052494   0.005377  -9.763   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3833 on 222 degrees of freedom
## Multiple R-squared:  0.3004, Adjusted R-squared:  0.2972 
## F-statistic: 95.32 on 1 and 222 DF,  p-value: < 2.2e-16

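Ta chọn mô hình model1 (intercept 0,903257) vì có intercept gần với model0 (0,903826) nhất để thực hiện bước tiếp theo. Lưu ý: theo đúng nguyên tắc MI nêu ở đầu bài (tính trung bình tham số và xét đến phương sai giữa các lần impute), kết quả suy luận cuối cùng nên được gộp từ cả 5 dataset, ví dụ bằng `pool(with(mNhomTong, lm(RnaDna16s.0 ~ time)))` của gói mice, thay vì chỉ dựa vào một dataset.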

Xuất dữ liệu sang file csv để lưu trữ

# Save completed dataset 1 to DuLieu.csv in the current working directory.
write.csv(mNhomTong1, file = "DuLieu.csv")