INSTALL PACKAGES mice & VIM

install.packages(c("mice","VIM"))

LOAD PACKAGES FROM LIBRARY

library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

IMPORT DATA
DOWNLOAD DATA

# Load the vehicle data set and show its structure.
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable; kept here because later steps may rely on the working
# directory. Consider a project-relative path instead.
setwd("/Users/anand/RProjects/workingdirectory")
# stringsAsFactors = TRUE is required on R >= 4.0, where the default became
# FALSE: State must be a factor so that mice() treats it as a categorical
# variable (polyreg) instead of leaving a character column unimputed.
vdata <- read.csv("vehicleMiss.csv", stringsAsFactors = TRUE)
str(vdata)
## 'data.frame':    1624 obs. of  7 variables:
##  $ vehicle: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fm     : int  0 10 15 0 13 21 11 5 8 1 ...
##  $ Mileage: int  863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
##  $ lh     : num  1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
##  $ lc     : num  66.3 233 325.1 66.6 328.7 ...
##  $ mc     : num  697 120 175 0 175 ...
##  $ State  : Factor w/ 50 levels "AK","AL","AR",..: 25 5 48 37 4 9 18 10 47 38 ...

LET’S CHECK WHAT PERCENTAGE OF DATA IS MISSING IN EACH COLUMN

# Percentage (0-100) of entries in `x` that are NA.
p <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values in each column of vdata.
# vapply() iterates over the data frame's columns directly and guarantees one
# numeric value per column; apply() would first coerce the whole data frame
# to a matrix. The result is a numeric vector named by column.
vapply(vdata, p, numeric(1))
##   vehicle        fm   Mileage        lh        lc        mc     State 
## 0.0000000 0.0000000 0.8004926 0.3694581 0.4926108 0.0000000 0.9236453

LET’S CHECK PATTERN OF MISSING DATA

# Tabulate the missing-data patterns in vdata: each row is one pattern
# (1 = observed, 0 = missing), the leftmost number is how many records show
# that pattern, and the last column counts the missing variables in it.
md.pattern(vdata)

##      vehicle fm mc lh lc Mileage State   
## 1586       1  1  1  1  1       1     1  0
## 11         1  1  1  1  1       1     0  1
## 13         1  1  1  1  1       0     1  1
## 6          1  1  1  1  0       1     1  1
## 2          1  1  1  1  0       1     0  2
## 4          1  1  1  0  1       1     1  1
## 2          1  1  1  0  1       1     0  2
##            0  0  0  6  8      13    15 42

LET’S COMPARE VARIABLES PAIRWISE FOR MISSING DATA

# Pairwise observed/missing counts for every (row, column) variable pair:
#   $rr - both variables observed
#   $rm - row variable observed, column variable missing
#   $mr - row variable missing, column variable observed
#   $mm - both variables missing
md.pairs(vdata)
## $rr
##         vehicle   fm Mileage   lh   lc   mc State
## vehicle    1624 1624    1611 1618 1616 1624  1609
## fm         1624 1624    1611 1618 1616 1624  1609
## Mileage    1611 1611    1611 1605 1603 1611  1596
## lh         1618 1618    1605 1618 1610 1618  1605
## lc         1616 1616    1603 1610 1616 1616  1603
## mc         1624 1624    1611 1618 1616 1624  1609
## State      1609 1609    1596 1605 1603 1609  1609
## 
## $rm
##         vehicle fm Mileage lh lc mc State
## vehicle       0  0      13  6  8  0    15
## fm            0  0      13  6  8  0    15
## Mileage       0  0       0  6  8  0    15
## lh            0  0      13  0  8  0    13
## lc            0  0      13  6  0  0    13
## mc            0  0      13  6  8  0    15
## State         0  0      13  4  6  0     0
## 
## $mr
##         vehicle fm Mileage lh lc mc State
## vehicle       0  0       0  0  0  0     0
## fm            0  0       0  0  0  0     0
## Mileage      13 13       0 13 13 13    13
## lh            6  6       6  0  6  6     4
## lc            8  8       8  8  0  8     6
## mc            0  0       0  0  0  0     0
## State        15 15      15 13 13 15     0
## 
## $mm
##         vehicle fm Mileage lh lc mc State
## vehicle       0  0       0  0  0  0     0
## fm            0  0       0  0  0  0     0
## Mileage       0  0      13  0  0  0     0
## lh            0  0       0  6  0  0     2
## lc            0  0       0  0  8  0     2
## mc            0  0       0  0  0  0     0
## State         0  0       0  2  2  0    15

LET’S CHECK A MARGIN PLOT

# Scatter plot of Mileage against lc, with information about the missing
# values of each variable drawn in the plot margins (VIM).
marginplot(vdata[, c("Mileage", "lc")])

PERFORM IMPUTATION ON THE MISSING DATA; WE WILL GENERATE 3 IMPUTED DATA SETS

# Multiple imputation with mice: columns 2 to 7 are used (the first column,
# the vehicle identifier, is left out), m = 3 imputed data sets are
# generated, and the seed is fixed so the imputations are reproducible.
impute <- mice(
  vdata[, 2:7],
  m = 3,
  seed = 123
)
## 
##  iter imp variable
##   1   1  Mileage  lh  lc  State
##   1   2  Mileage  lh  lc  State
##   1   3  Mileage  lh  lc  State
##   2   1  Mileage  lh  lc  State
##   2   2  Mileage  lh  lc  State
##   2   3  Mileage  lh  lc  State
##   3   1  Mileage  lh  lc  State
##   3   2  Mileage  lh  lc  State
##   3   3  Mileage  lh  lc  State
##   4   1  Mileage  lh  lc  State
##   4   2  Mileage  lh  lc  State
##   4   3  Mileage  lh  lc  State
##   5   1  Mileage  lh  lc  State
##   5   2  Mileage  lh  lc  State
##   5   3  Mileage  lh  lc  State
# Summarise the imputation object: number of imputations, the imputation
# method chosen for each variable, and the predictor matrix.
print(impute)
## Class: mids
## Number of multiple imputations:  3 
## Imputation methods:
##        fm   Mileage        lh        lc        mc     State 
##        ""     "pmm"     "pmm"     "pmm"        "" "polyreg" 
## PredictorMatrix:
##         fm Mileage lh lc mc State
## fm       0       1  1  1  1     1
## Mileage  1       0  1  1  1     1
## lh       1       1  0  1  1     1
## lc       1       1  1  0  1     1
## mc       1       1  1  1  0     1
## State    1       1  1  1  1     0
# Imputed values of State: one row per record whose State was missing,
# one column per imputation (1 to 3).
impute$imp$State
##       1  2  3
## 68   SD MA GA
## 69   VT AZ NV
## 142  CT IA OH
## 143  MI OK CA
## 144  PA CT CA
## 145  FL MA NY
## 146  TX WI CA
## 147  CA NC CA
## 828  MO CA MT
## 829  MA VT OH
## 830  CA KS TX
## 1348 TN NE SD
## 1598 TX AK TN
## 1599 FL CA FL
## 1600 KY OH TX

LET’S COMPLETE THE DATA USING THE 1st IMPUTATION SET FROM impute

# Build a completed data set by filling the missing entries with the values
# from imputation number 1.
newVDATA <- complete(impute, action = 1)

DISTRIBUTION OF OBSERVED/IMPUTED VALUES

# Strip plots comparing observed and imputed values in the mids object.
stripplot(impute, pch = 20, cex = 1.2)

# Scatter plot of lc against lh, with one panel per value of .imp.
xyplot(impute, lc ~ lh | .imp, pch = 20, cex = 1.4)