Missing Value Analysis Using the mice package

Load all the necessary libraries.

library(mice)
library(VIM)

Load the dataset.

# Read the comma-separated Adult data set. A "?" entry (with or without
# surrounding blanks) is treated as missing, and text columns become factors.
# NOTE(review): read.csv() defaults to header = TRUE, so the first line of
# adult.txt is consumed as column names before they are overwritten below.
# Confirm the file really starts with a header row; if it does not, pass
# header = FALSE so the first observation is not silently dropped.
salary <- read.csv(
  "adult.txt",
  sep = ",",
  na.strings = c(" ? ", " ?", "?"),
  stringsAsFactors = TRUE
)

# Replace the column names with descriptive ones (15 columns expected).
names(salary) <- c(
  "age", "workclass", "fnlwgt", "education", "education_num", "marital",
  "occupation", "relationship",
  "race", "sex", "capital_gain", "capital_loss", "hours_per_week",
  "nat_country", "income"
)

Percentages of missing values in each column:

# Percentage (0-100) of the entries of `x` that are NA.
pMiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Iterate over the columns directly: apply() would first coerce the whole
# data frame to a matrix. vapply() also guarantees one numeric per column.
vapply(salary, pMiss, numeric(1))

##            age      workclass         fnlwgt      education  education_num 
##       0.000000       5.638821       0.000000       0.000000       0.000000 
##        marital     occupation   relationship           race            sex 
##       0.000000       5.660319       0.000000       0.000000       0.000000 
##   capital_gain   capital_loss hours_per_week    nat_country         income 
##       0.000000       0.000000       0.000000       1.790541       0.000000

Total number of missing values in each column:

# Number of NA entries in `x`.
Miss <- function(x) {
  na_flags <- is.na(x)
  sum(na_flags)
}
# Iterate over the columns directly: apply() would first coerce the whole
# data frame to a matrix. vapply() also guarantees one integer per column.
vapply(salary, Miss, integer(1))

##            age      workclass         fnlwgt      education  education_num 
##              0           1836              0              0              0 
##        marital     occupation   relationship           race            sex 
##              0           1843              0              0              0 
##   capital_gain   capital_loss hours_per_week    nat_country         income 
##              0              0              0            583              0

Percentage of incomplete cases (rows with at least one missing value):

# Share of rows with at least one NA, as a percentage.
# Uses base R's complete.cases() instead of mice's icn(): icn() is deprecated
# in recent mice releases (nic() is its replacement), so this form does not
# depend on the installed mice version.
sum(!complete.cases(salary)) / nrow(salary) * 100

## [1] 7.367936

Patterns of missing values across all the rows.

# Tabulate the distinct missing-data patterns found in the rows of `salary`.
md.pattern(x = salary)

##       age fnlwgt education education_num marital relationship race sex
## 30161   1      1         1             1       1            1    1   1
##     7   1      1         1             1       1            1    1   1
##   556   1      1         1             1       1            1    1   1
##  1809   1      1         1             1       1            1    1   1
##    27   1      1         1             1       1            1    1   1
##         0      0         0             0       0            0    0   0
##       capital_gain capital_loss hours_per_week income nat_country
## 30161            1            1              1      1           1
##     7            1            1              1      1           1
##   556            1            1              1      1           0
##  1809            1            1              1      1           1
##    27            1            1              1      1           0
##                  0            0              0      0         583
##       workclass occupation     
## 30161         1          1    0
##     7         1          0    1
##   556         1          1    1
##  1809         0          0    2
##    27         0          0    3
##            1836       1843 4262

Patterns of missing values across all the columns:

# Draw VIM's aggregation plot of the missing values in `salary` and keep the
# returned object in `aggr_plot`.
aggr_plot <- aggr(
  salary,
  col = c("orange", "red"),
  bars = FALSE,
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(salary),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05660319
##       workclass 0.05638821
##     nat_country 0.01790541
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education_num 0.00000000
##         marital 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital_gain 0.00000000
##    capital_loss 0.00000000
##  hours_per_week 0.00000000
##          income 0.00000000

Missing Value Analysis Using the mice package

Akash Ansari