Xử lý giá trị trống theo phương pháp hiện đại: IMPUTATION * Multiple imputation (MI), giả định MAR (Missing At Random – giá trị trống ngẫu nhiên) * Imputation 5 - 20 lần * Dùng mô hình, và tham số của mô hình được ước tính 5 - 20 lần * Trung bình các tham số * Xem xét đến phương sai, phản ánh quá trình bất định.
Kỹ thuật Multiple Imputation * Giả định MAR (Missing At Random) * Tạo ra nhiều dữ liệu mới từ dữ liệu gốc (ngẫu nhiên hóa) * Mỗi dữ liệu hay dataset sẽ khác nhau vì quá trình chọn ngẫu nhiên * Ước tính các tham số của mô hình cho từng dataset * Tính trung bình tham số
# Gọi thư viện để xử lý dữ liệu trống
library("VIM", lib.loc="~/R/win-library/3.4")
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library("mice", lib.loc="~/R/win-library/3.4")
# List the entries of the project data folder, each returned with its
# directory prefix, then show the listing.
path <- list.files(
  "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data",
  full.names = TRUE
)
print(path)
## [1] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/DuLieuXuLyDuLieuTrongXong.csv"
## [2] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/GiaiDoan016.csv"
## [3] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/GiaiDoan612.csv"
## [4] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanNhom-TheoGiaiDoan.xlsx"
## [5] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan-ChuyenDoc.csv"
## [6] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan.csv"
## [7] "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan.xlsx"
# Load the patient data set from its literal path (the same file that the
# directory listing reports as path[5]); the first row holds column names.
NhomTong <- read.csv(
  file = "C:\\Users\\tran\\Google Drive\\20170526 Chi Ha SHPT\\R\\Lan2\\data/PhanTich-TatCaBenhNhan-ChuyenDoc.csv",
  header = TRUE
)
# Optional sanity checks on NhomTong (left disabled):
# head(NhomTong)
# str(NhomTong)
# NhomTong
# Step 1: inspect the data and its missing-value pattern
md.pattern(NhomTong)
## IDBenhNhan time MI.0 BI.0 RLEP.0 RnaDna16s.0 RnaDna18k.0
## 202 1 1 1 1 1 1 1 0
## 22 1 1 0 0 0 0 0 5
## 0 0 22 22 22 22 22 110
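Qua kết quả trên ta thấy có 22/224 dòng (tỉ lệ 0,098) chứa dữ liệu trống; mỗi dòng này thiếu cả 5 biến đo lường (tổng cộng 110 giá trị trống).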
# Draw the VIM matrix plot of NhomTong with interactive mode switched off.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently flip this option.
matrixplot(NhomTong, interactive = FALSE)
Bước 2: Impute giá trị trống
# Impute the missing values of NhomTong with mice. The seed is fixed at 2410
# so the run can be repeated, and the iteration log is not printed.
# `FALSE` is spelled out because `F` can be reassigned.
mNhomTong <- mice(NhomTong, seed = 2410, printFlag = FALSE)
Bước 3: kiểm tra giá trị imputed
# Imputed values for MI.0: one row per originally missing entry,
# one column per imputation.
mNhomTong[["imp"]][["MI.0"]]
## 1 2 3 4 5
## 66 34 2 33 0 2
## 67 24 19 33 0 32
## 68 11 0 36 0 1
## 69 31 2 0 0 2
## 70 2 2 4 2 23
## 71 18 2 0 10 0
## 72 2 8 19 2 7
## 73 11 4 4 12 0
## 74 1 11 7 11 29
## 75 4 2 0 0 0
## 76 6 0 0 10 5
## 77 0 2 23 0 6
## 78 14 23 1 4 4
## 169 0 0 0 0 0
## 170 0 0 0 0 0
## 171 0 0 0 0 0
## 172 2 0 0 0 0
## 173 0 0 0 0 0
## 174 0 0 0 0 0
## 175 0 0 0 0 0
## 176 0 0 0 0 0
## 177 0 0 0 0 0
# Imputed values for BI.0: one row per originally missing entry,
# one column per imputation.
mNhomTong[["imp"]][["BI.0"]]
## 1 2 3 4 5
## 66 6 3 5 4 4
## 67 6 4 5 3 3
## 68 6 3 5 5 4
## 69 4 4 4 4 4
## 70 4 5 4 4 5
## 71 6 5 5 5 4
## 72 3 5 3 4 3
## 73 4 5 3 5 4
## 74 3 5 4 3 5
## 75 5 4 3 5 3
## 76 6 4 3 5 6
## 77 3 5 5 4 5
## 78 5 5 4 4 5
## 169 5 3 3 5 4
## 170 4 4 3 4 3
## 171 2 4 5 4 1
## 172 4 4 2 3 4
## 173 3 4 4 3 1
## 174 5 1 3 2 1
## 175 3 3 5 3 2
## 176 3 3 5 2 2
## 177 3 1 3 5 2
# Imputed values for RLEP.0: one row per originally missing entry,
# one column per imputation.
mNhomTong[["imp"]][["RLEP.0"]]
## 1 2 3 4 5
## 66 2420000 27.6 7540000 4520.0 9245
## 67 332000 168000.0 117000 135.0 332000
## 68 117000 87.0 117000 4040.0 211000
## 69 7540000 80000.0 2870 789.0 1900
## 70 54381 253000.0 326000 7650.0 332000
## 71 764302 99400.0 129 6970.0 3240
## 72 87600 117000.0 666600 44400.0 3870
## 73 35200 546301.0 6120 15700.0 45321
## 74 13250 46301.0 12700 503000.0 332000
## 75 15700 76543.0 567 153.0 186
## 76 4800000 3080.0 8170 253000.0 97200
## 77 54 523000.0 523000 54.0 764310
## 78 434000 2910000.0 3987 10000.0 153
## 169 4610000 9245.0 3240 101000.0 54
## 170 1200000 4610000.0 264000 70500.0 765
## 171 761 96800.0 54321 6970.0 242
## 172 656540 76500.0 24500 45321.0 226
## 173 3289 30600.0 35200 24.3 7240
## 174 67890 152.0 4780 13250.0 242
## 175 7650 1900.0 272000 45.1 7240
## 176 239 987.0 4800000 567.0 1580
## 177 3870 1580.0 264000 3080.0 7240
# Imputed values for RnaDna16s.0: one row per originally missing entry,
# one column per imputation.
mNhomTong[["imp"]][["RnaDna16s.0"]]
## 1 2 3 4 5
## 66 0.7489405 1.0042201 0.7664103 0.7664103 1.0129067
## 67 1.0000000 0.4222044 2.3403119 0.9463608 0.7413186
## 68 1.2371603 0.6879172 1.2149860 0.7761954 0.7236891
## 69 1.0298951 0.7546582 0.6780385 1.0896362 0.9031621
## 70 1.1924612 1.0001313 1.0895081 0.7138568 0.7047716
## 71 0.7422513 0.8331555 1.0941767 1.1924612 0.8428145
## 72 1.0001313 1.0341532 0.5390218 1.1924612 0.7546582
## 73 0.6070761 0.8331555 0.7656988 0.7761954 0.8428145
## 74 0.9662293 1.2877231 0.9031621 1.1785694 1.1095624
## 75 0.7791702 1.0895081 0.8038947 0.6914069 1.1049442
## 76 0.4630914 1.0466421 0.8038947 0.8825880 0.8825880
## 77 0.7219633 0.9276746 1.2149860 0.9031621 1.0945120
## 78 0.7761954 0.8513429 0.7047716 1.0165241 1.3446611
## 169 0.5036371 0.5036371 0.0000000 0.6267415 0.4169918
## 170 1.4327026 0.0000000 0.0000000 0.4169918 0.5602715
## 171 0.0000000 0.3958349 0.3387598 0.5308539 0.0000000
## 172 0.6526269 0.5653620 0.3997562 0.0000000 0.6885706
## 173 0.0000000 0.0000000 0.4169918 0.5602715 0.0000000
## 174 0.0000000 0.2856312 0.5602715 0.0000000 0.0000000
## 175 0.0000000 0.5602715 0.0000000 0.2219920 1.0308191
## 176 0.3636024 0.5602715 0.0000000 0.0000000 0.0000000
## 177 0.4683268 0.0000000 0.0000000 0.6267415 0.0000000
# Imputed values for RnaDna18k.0: one row per originally missing entry,
# one column per imputation.
mNhomTong[["imp"]][["RnaDna18k.0"]]
## 1 2 3 4 5
## 66 1.1464661 0.4470152 0.3864323 0.00000000 1.3225832
## 67 0.0000000 0.6960050 0.6043562 0.35105605 0.3822505
## 68 0.2905415 0.2513088 0.7762126 0.57764071 0.0000000
## 69 0.4244201 0.0000000 0.0000000 0.00000000 0.2471131
## 70 0.5776407 0.0000000 0.6960050 0.00000000 0.2149637
## 71 0.9584373 0.0000000 0.0000000 1.28235895 0.0000000
## 72 0.0000000 0.9065027 0.0000000 1.32258322 0.4838407
## 73 0.0000000 0.7285392 0.3489503 0.40156631 0.0000000
## 74 0.0000000 0.8720276 0.2317003 0.46583084 0.0000000
## 75 0.7698667 0.0000000 0.0000000 0.49082777 0.0000000
## 76 0.0000000 0.6635444 0.3334165 0.07817380 0.7698667
## 77 0.0000000 0.0000000 0.0000000 0.00000000 0.4848191
## 78 0.0000000 0.0000000 0.0000000 0.03108466 0.0000000
## 169 0.0000000 0.2513088 0.0000000 0.00000000 0.2591279
## 170 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 171 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 172 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 173 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 174 0.0000000 0.0000000 0.2591279 0.00000000 0.0000000
## 175 0.0000000 0.0000000 0.0000000 0.00000000 0.6563490
## 176 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
## 177 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000
Bước 4: Tạo ra dataset mới (imputed data)
# Extract each of the five completed (imputed) data sets from the mice
# result into its own object, numbered by imputation.
mNhomTong1 <- complete(mNhomTong, action = 1)
mNhomTong2 <- complete(mNhomTong, action = 2)
mNhomTong3 <- complete(mNhomTong, action = 3)
mNhomTong4 <- complete(mNhomTong, action = 4)
mNhomTong5 <- complete(mNhomTong, action = 5)
Bước 5: Phân tích bằng mô hình
# Baseline model on the original data: regress RnaDna16s.0 on time.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `NhomTong$...`
# terms; only the term label changes from `NhomTong$time` to `time`.
model0 <- lm(RnaDna16s.0 ~ time, data = NhomTong)
summary(model0)
##
## Call:
## lm(formula = NhomTong$RnaDna16s.0 ~ NhomTong$time, data = NhomTong)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8518 -0.2796 -0.0310 0.2213 1.4365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.903826 0.039049 23.146 < 2e-16 ***
## NhomTong$time -0.052022 0.005907 -8.807 6.17e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3929 on 200 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.2794, Adjusted R-squared: 0.2758
## F-statistic: 77.55 on 1 and 200 DF, p-value: 6.174e-16
# Same regression on completed data set 1.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `mNhomTong1$...`
# terms; only the term label changes from `mNhomTong1$time` to `time`.
model1 <- lm(RnaDna16s.0 ~ time, data = mNhomTong1)
summary(model1)
##
## Call:
## lm(formula = mNhomTong1$RnaDna16s.0 ~ mNhomTong1$time, data = mNhomTong1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85238 -0.29279 -0.03733 0.20754 1.43706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.903257 0.036577 24.694 <2e-16 ***
## mNhomTong1$time -0.050872 0.005438 -9.356 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3876 on 222 degrees of freedom
## Multiple R-squared: 0.2828, Adjusted R-squared: 0.2795
## F-statistic: 87.53 on 1 and 222 DF, p-value: < 2.2e-16
# Same regression on completed data set 2.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `mNhomTong2$...`
# terms; only the term label changes from `mNhomTong2$time` to `time`.
model2 <- lm(RnaDna16s.0 ~ time, data = mNhomTong2)
summary(model2)
##
## Call:
## lm(formula = mNhomTong2$RnaDna16s.0 ~ mNhomTong2$time, data = mNhomTong2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85654 -0.28420 -0.02339 0.21944 1.43174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.908576 0.035827 25.360 <2e-16 ***
## mNhomTong2$time -0.052031 0.005326 -9.769 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3797 on 222 degrees of freedom
## Multiple R-squared: 0.3007, Adjusted R-squared: 0.2975
## F-statistic: 95.44 on 1 and 222 DF, p-value: < 2.2e-16
# Same regression on completed data set 3.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `mNhomTong3$...`
# terms; only the term label changes from `mNhomTong3$time` to `time`.
model3 <- lm(RnaDna16s.0 ~ time, data = mNhomTong3)
summary(model3)
##
## Call:
## lm(formula = mNhomTong3$RnaDna16s.0 ~ mNhomTong3$time, data = mNhomTong3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86586 -0.26608 -0.05062 0.22482 1.47445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.920382 0.036991 24.881 <2e-16 ***
## mNhomTong3$time -0.054525 0.005499 -9.915 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.392 on 222 degrees of freedom
## Multiple R-squared: 0.3069, Adjusted R-squared: 0.3038
## F-statistic: 98.31 on 1 and 222 DF, p-value: < 2.2e-16
# Same regression on completed data set 4.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `mNhomTong4$...`
# terms; only the term label changes from `mNhomTong4$time` to `time`.
model4 <- lm(RnaDna16s.0 ~ time, data = mNhomTong4)
summary(model4)
##
## Call:
## lm(formula = mNhomTong4$RnaDna16s.0 ~ mNhomTong4$time, data = mNhomTong4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85900 -0.28555 -0.03384 0.23771 1.42918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.911131 0.035830 25.429 <2e-16 ***
## mNhomTong4$time -0.052131 0.005326 -9.787 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3797 on 222 degrees of freedom
## Multiple R-squared: 0.3014, Adjusted R-squared: 0.2983
## F-statistic: 95.79 on 1 and 222 DF, p-value: < 2.2e-16
# Same regression on completed data set 5.
# Bare column names are resolved through `data`, so predict(newdata = ) and
# subset = behave correctly. Estimates are the same as with `mNhomTong5$...`
# terms; only the term label changes from `mNhomTong5$time` to `time`.
model5 <- lm(RnaDna16s.0 ~ time, data = mNhomTong5)
summary(model5)
##
## Call:
## lm(formula = mNhomTong5$RnaDna16s.0 ~ mNhomTong5$time, data = mNhomTong5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85880 -0.28136 -0.03291 0.23547 1.42902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.911289 0.036168 25.196 <2e-16 ***
## mNhomTong5$time -0.052494 0.005377 -9.763 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3833 on 222 degrees of freedom
## Multiple R-squared: 0.3004, Adjusted R-squared: 0.2972
## F-statistic: 95.32 on 1 and 222 DF, p-value: < 2.2e-16
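Ta chọn mô hình model1 có intercept giống với model0 nhất để thực hiện bước tiếp theo. Lưu ý: theo đúng nguyên tắc của multiple imputation đã nêu ở phần đầu (trung bình các tham số và xét đến phương sai), kết quả của cả 5 mô hình nên được gộp lại theo quy tắc Rubin, ví dụ bằng `summary(pool(with(mNhomTong, lm(RnaDna16s.0 ~ time))))` trong gói mice, thay vì chỉ chọn một dataset đã impute.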
Xuất dữ liệu sang file csv để lưu trữ
# Save the first completed data set as DuLieu.csv (relative path).
# NOTE(review): write.csv writes row names as an extra first column by
# default — confirm that downstream readers expect it.
write.csv(mNhomTong1, file = "DuLieu.csv")