set.seed(1)
data.copy = loan.data
data.copy$debt_income[sample(1:nrow(data.copy), 10)] <- NA
data.copy$education[sample(1:nrow(data.copy), 10)] <- NA
levels(data.copy$education)
## [1] "Did not complete high school" "High school degree"
## [3] "Some college" "College degree"
data.copy$education=factor(data.copy$education)
levels(data.copy$education)
## [1] "Did not complete high school" "High school degree"
## [3] "Some college" "College degree"
MICE 包 — 检查缺失值分布summary(data.copy)
## age education year_emp
## Min. :20.00 Did not complete high school:44 Min. : 0.00
## 1st Qu.:28.00 High school degree :24 1st Qu.: 3.00
## Median :34.00 Some college :17 Median : 6.00
## Mean :34.38 College degree : 5 Mean : 8.18
## 3rd Qu.:41.00 NA's :10 3rd Qu.:13.25
## Max. :53.00 Max. :26.00
##
## income debt_income cred_debt other_debt
## Min. : 14.00 Min. : 0.400 Min. : 0.02507 Min. : 0.1009
## 1st Qu.: 22.75 1st Qu.: 5.425 1st Qu.: 0.38774 1st Qu.: 1.0565
## Median : 35.50 Median : 9.700 Median : 1.01752 Median : 2.0751
## Mean : 45.04 Mean :10.797 Mean : 1.92120 Mean : 2.9174
## 3rd Qu.: 63.00 3rd Qu.:14.250 3rd Qu.: 2.18088 3rd Qu.: 3.9708
## Max. :176.00 Max. :41.300 Max. :15.01668 Max. :14.7193
## NA's :10
## Loan Logistc PGR_1 Dis_1 Dis1_1
## No :64 Min. :0.0002818 No :83 No :70 Min. :0.000924
## Yes :20 1st Qu.:0.0278299 Yes:17 Yes:30 1st Qu.:0.446390
## NA's:16 Median :0.0953964 Median :0.710173
## Mean :0.2288153 Mean :0.637483
## 3rd Qu.:0.3650043 3rd Qu.:0.891036
## Max. :0.9998494 Max. :0.991606
##
## Discrim
## Min. :0.008394
## 1st Qu.:0.108964
## Median :0.289827
## Mean :0.362517
## 3rd Qu.:0.553610
## Max. :0.999076
##
pMiss <- function(x){sum(is.na(x))/length(x)*100}
apply(data.copy, 2, pMiss)
## age education year_emp income debt_income cred_debt
## 0 10 0 0 10 0
## other_debt Loan Logistc PGR_1 Dis_1 Dis1_1
## 0 16 0 0 0 0
## Discrim
## 0
apply(data.copy, 1, pMiss)
## [1] 0.000000 0.000000 0.000000 0.000000 0.000000 7.692308 0.000000
## [8] 7.692308 7.692308 0.000000 0.000000 0.000000 0.000000 0.000000
## [15] 0.000000 0.000000 0.000000 7.692308 0.000000 7.692308 7.692308
## [22] 0.000000 0.000000 0.000000 0.000000 0.000000 15.384615 0.000000
## [29] 7.692308 0.000000 0.000000 0.000000 0.000000 0.000000 15.384615
## [36] 0.000000 7.692308 15.384615 0.000000 7.692308 0.000000 0.000000
## [43] 0.000000 0.000000 0.000000 0.000000 0.000000 15.384615 0.000000
## [50] 0.000000 0.000000 7.692308 7.692308 0.000000 0.000000 0.000000
## [57] 15.384615 7.692308 0.000000 0.000000 0.000000 7.692308 0.000000
## [64] 0.000000 0.000000 7.692308 0.000000 15.384615 0.000000 0.000000
## [71] 7.692308 7.692308 0.000000 7.692308 0.000000 0.000000 7.692308
## [78] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [85] 0.000000 7.692308 0.000000 0.000000 7.692308 0.000000 0.000000
## [92] 0.000000 7.692308 0.000000 0.000000 0.000000 7.692308 7.692308
## [99] 7.692308 0.000000
md.pattern(data.copy)
## age year_emp income cred_debt other_debt Logistc PGR_1 Dis_1 Dis1_1
## 70 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1
## 8 1 1 1 1 1 1 1 1 1
## 10 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## Discrim education debt_income Loan
## 70 1 1 1 1 0
## 6 1 0 1 1 1
## 8 1 1 0 1 1
## 10 1 1 1 0 1
## 4 1 0 1 0 2
## 2 1 1 0 0 2
## 0 10 10 16 36
VIM 包 + box plot 箱线图 — 缺失值视觉化aggr_plot <- aggr(data.copy, col=c('blue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data.copy), cex.axis=.7, gap=3, ylab=c("Missing data histogram","Missing value pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Loan 0.16
## education 0.10
## debt_income 0.10
## age 0.00
## year_emp 0.00
## income 0.00
## cred_debt 0.00
## other_debt 0.00
## Logistc 0.00
## PGR_1 0.00
## Dis_1 0.00
## Dis1_1 0.00
## Discrim 0.00
marginplot(data.copy[,c(5,2)], main='Missing value box plot\n Constrain: 2 variables')
debt_income 的红 - 缺失 education 的分布debt_income 的蓝 - 其余不缺失 education 的分布na.action=na.omit)impute(data.copy$debt_income, mean) # replace with mean, indicated by star suffix
## 1 2 3 4 5 6 7
## 5.50000 3.60000 3.40000 0.40000 12.80000 10.79667* 4.50000
## 8 9 10 11 12 13 14
## 10.90000 15.80000 0.80000 6.70000 1.70000 4.00000 12.00000
## 15 16 17 18 19 20 21
## 2.60000 2.90000 7.80000 7.20000 5.00000 10.79667* 16.50000
## 22 23 24 25 26 27 28
## 18.50000 9.60000 6.10000 10.40000 6.40000 10.79667* 14.00000
## 29 30 31 32 33 34 35
## 6.20000 2.80000 13.00000 4.80000 1.70000 12.90000 1.20000
## 36 37 38 39 40 41 42
## 1.20000 10.79667* 11.40000 4.20000 9.80000 10.90000 4.40000
## 43 44 45 46 47 48 49
## 3.40000 6.50000 6.80000 11.80000 5.40000 17.60000 21.40000
## 50 51 52 53 54 55 56
## 0.60000 10.20000 11.80000 10.60000 6.50000 2.50000 9.00000
## 57 58 59 60 61 62 63
## 10.79667* 10.79667* 14.30000 9.70000 7.10000 10.79667* 2.30000
## 64 65 66 67 68 69 70
## 21.70000 18.10000 7.80000 9.00000 4.10000 14.60000 14.10000
## 71 72 73 74 75 76 77
## 17.80000 7.20000 13.20000 8.90000 9.70000 16.00000 11.00000
## 78 79 80 81 82 83 84
## 24.70000 12.80000 15.40000 8.40000 9.30000 7.70000 23.60000
## 85 86 87 88 89 90 91
## 13.30000 10.79667* 18.70000 10.60000 10.79667* 11.10000 17.10000
## 92 93 94 95 96 97 98
## 23.10000 23.80000 17.30000 9.30000 19.80000 10.79667* 27.70000
## 99 100
## 32.40000 41.30000
impute(data.copy$debt_income, 20) # replace with specific number
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5.5 3.6 3.4 0.4 12.8 20.0* 4.5 10.9 15.8 0.8 6.7 1.7
## 13 14 15 16 17 18 19 20 21 22 23 24
## 4.0 12.0 2.6 2.9 7.8 7.2 5.0 20.0* 16.5 18.5 9.6 6.1
## 25 26 27 28 29 30 31 32 33 34 35 36
## 10.4 6.4 20.0* 14.0 6.2 2.8 13.0 4.8 1.7 12.9 1.2 1.2
## 37 38 39 40 41 42 43 44 45 46 47 48
## 20.0* 11.4 4.2 9.8 10.9 4.4 3.4 6.5 6.8 11.8 5.4 17.6
## 49 50 51 52 53 54 55 56 57 58 59 60
## 21.4 0.6 10.2 11.8 10.6 6.5 2.5 9.0 20.0* 20.0* 14.3 9.7
## 61 62 63 64 65 66 67 68 69 70 71 72
## 7.1 20.0* 2.3 21.7 18.1 7.8 9.0 4.1 14.6 14.1 17.8 7.2
## 73 74 75 76 77 78 79 80 81 82 83 84
## 13.2 8.9 9.7 16.0 11.0 24.7 12.8 15.4 8.4 9.3 7.7 23.6
## 85 86 87 88 89 90 91 92 93 94 95 96
## 13.3 20.0* 18.7 10.6 20.0* 11.1 17.1 23.1 23.8 17.3 9.3 19.8
## 97 98 99 100
## 20.0* 27.7 32.4 41.3
# data.copy$debt_income[is.na(data.copy$debt_income)] <- mean(data.copy$debt_income, na.rm = T) # impute manually
meanactuals <- loan.data$debt_income[is.na(data.copy$debt_income)]
predicteds <- rep(mean(data.copy$debt_income, na.rm=T), length(actuals))
regr.eval(actuals, predicteds)
## mae mse rmse mape
## 3.5093333 30.5949444 5.5312697 0.4691676
kNN, rpart, and mice)kNNknnOutput <- knnImputation(data.copy)
anyNA(knnOutput)
## [1] FALSE
actuals <- loan.data$debt_income[is.na(data.copy$debt_income)]
predicteds <- knnOutput[is.na(data.copy$debt_income), "debt_income"]
regr.eval(actuals, predicteds)
## mae mse rmse mape
## 2.9023289 12.0671334 3.4737780 0.3217348
rpartlibrary(rpart)
class_mod <- rpart(education ~ ., data=data.copy[!is.na(data.copy$education), ], method="class", na.action=na.omit)
anova_mod <- rpart(debt_income ~ ., data=data.copy[!is.na(data.copy$debt_income), ], method="anova", na.action=na.omit)
education_pred <- predict(class_mod, data.copy[is.na(data.copy$education), ],type='class')
debt_income_pred <- predict(anova_mod, data.copy[is.na(data.copy$debt_income), ])
actuals <- loan.data$debt_income[is.na(data.copy$debt_income)] # debt_income accuracy
predicteds <- debt_income_pred
regr.eval(actuals, predicteds)
## mae mse rmse mape
## 3.9141176 27.3804277 5.2326311 0.5382739
actuals <- loan.data$education[is.na(data.copy$education)]
mean(actuals != education_pred) # misclass error for education accuracy
## [1] 0.3
micemiceMod <- mice(data.copy[, !names(data.copy) %in% "Loan"], method="rf") # based on random forests
##
## iter imp variable
## 1 1 education debt_income
## 1 2 education debt_income
## 1 3 education debt_income
## 1 4 education debt_income
## 1 5 education debt_income
## 2 1 education debt_income
## 2 2 education debt_income
## 2 3 education debt_income
## 2 4 education debt_income
## 2 5 education debt_income
## 3 1 education debt_income
## 3 2 education debt_income
## 3 3 education debt_income
## 3 4 education debt_income
## 3 5 education debt_income
## 4 1 education debt_income
## 4 2 education debt_income
## 4 3 education debt_income
## 4 4 education debt_income
## 4 5 education debt_income
## 5 1 education debt_income
## 5 2 education debt_income
## 5 3 education debt_income
## 5 4 education debt_income
## 5 5 education debt_income
miceOutput <- complete(miceMod) # generate completed data.
anyNA(miceOutput)
## [1] FALSE
actuals <- loan.data$debt_income[is.na(data.copy$debt_income)]
predicteds <- miceOutput[is.na(data.copy$debt_income), "debt_income"]
regr.eval(actuals, predicteds) # debt_income accuracy
## mae mse rmse mape
## 3.8900000 21.1290000 4.5966292 0.3902253
actuals <- loan.data$education[is.na(data.copy$education)]
predicteds <- miceOutput[is.na(data.copy$education), "education"]
mean(actuals != predicteds) # misclass error education accuracy
## [1] 0.7
miceMod <- mice(data.copy[, !names(data.copy) %in% "Loan"], method="pmm") # based on predictive mean matching
##
## iter imp variable
## 1 1 education debt_income
## 1 2 education debt_income
## 1 3 education debt_income
## 1 4 education debt_income
## 1 5 education debt_income
## 2 1 education debt_income
## 2 2 education debt_income
## 2 3 education debt_income
## 2 4 education debt_income
## 2 5 education debt_income
## 3 1 education debt_income
## 3 2 education debt_income
## 3 3 education debt_income
## 3 4 education debt_income
## 3 5 education debt_income
## 4 1 education debt_income
## 4 2 education debt_income
## 4 3 education debt_income
## 4 4 education debt_income
## 4 5 education debt_income
## 5 1 education debt_income
## 5 2 education debt_income
## 5 3 education debt_income
## 5 4 education debt_income
## 5 5 education debt_income
miceOutput <- complete(miceMod) # generate completed data.
anyNA(miceOutput)
## [1] FALSE
actuals <- loan.data$debt_income[is.na(data.copy$debt_income)]
predicteds <- miceOutput[is.na(data.copy$debt_income), "debt_income"]
regr.eval(actuals, predicteds) # debt_income accuracy
## mae mse rmse mape
## 1.6500000 3.4110000 1.8468893 0.1682569
actuals <- loan.data$education[is.na(data.copy$education)]
predicteds <- miceOutput[is.na(data.copy$education), "education"]
mean(actuals != predicteds) # misclass error education accuracy
## [1] 0.5
scatterplotxyplot(miceMod,debt_income ~ age + year_emp + income, pch=18,cex=0.8)
density plotdensityplot(miceMod)
stripplot plotstripplot(miceMod, pch = 20, cex = 1.2)
* 点状图
modelFit1 <- with(miceMod,lm(debt_income ~ age + year_emp + income))
summary(pool(modelFit1))
## est se t df Pr(>|t|)
## (Intercept) 11.15428223 3.40430135 3.2765261 93.63905 0.001474744
## age -0.01831564 0.11381798 -0.1609205 93.59853 0.872502852
## year_emp 0.19567065 0.16107071 1.2148121 93.42071 0.227499327
## income -0.02907585 0.03822274 -0.7606951 93.30922 0.448757228
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 4.3946224 17.91394205 NA 0.02470232 0.004091432
## age -0.2443165 0.20768526 0 0.02506311 0.004451015
## year_emp -0.1241649 0.51550618 0 0.02659286 0.005974350
## income -0.1049753 0.04682362 0 0.02751231 0.006888928
tempData2 <- mice(data.copy[, !names(data.copy) %in% "Loan"], method='pmm',m=50, maxit=10, seed=0)
modelFit2 <- with(tempData2,lm(debt_income ~ age + year_emp + income))
summary(pool(modelFit2))
## est se t df Pr(>|t|)
## (Intercept) 11.03464886 3.40295582 3.2426659 93.18060 0.001643887
## age -0.01343613 0.11366864 -0.1182044 93.32245 0.906159763
## year_emp 0.20684756 0.16123475 1.2828969 92.72312 0.202725745
## income -0.03228658 0.03813612 -0.8466142 93.27052 0.399377187
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 4.2772256 17.7920721 NA 0.02979934 0.009196355
## age -0.2391492 0.2122770 0 0.02833763 0.007734640
## year_emp -0.1133453 0.5270404 0 0.03446487 0.013860884
## income -0.1080145 0.0434413 0 0.02887364 0.008270667