INSTALL PACKAGES mice & VIM
install.packages(c("mice","VIM"))
LOAD PACKAGES FROM LIBRARY
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
IMPORT DATA
DOWNLOAD DATA
# Import the vehicle data set from the working directory.
# NOTE(review): the absolute path below is machine-specific; adjust it before
# running this script on another machine.
setwd("/Users/anand/RProjects/workingdirectory")
# stringsAsFactors = TRUE keeps `State` a factor on R >= 4.0.0, where the
# read.csv() default changed to FALSE. The str() output and the "polyreg"
# imputation method shown further down both rely on State being a factor.
vdata <- read.csv("vehicleMiss.csv", stringsAsFactors = TRUE)
# Show the column types and a preview of the values.
str(vdata)
## 'data.frame': 1624 obs. of 7 variables:
## $ vehicle: int 1 2 3 4 5 6 7 8 9 10 ...
## $ fm : int 0 10 15 0 13 21 11 5 8 1 ...
## $ Mileage: int 863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
## $ lh : num 1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
## $ lc : num 66.3 233 325.1 66.6 328.7 ...
## $ mc : num 697 120 175 0 175 ...
## $ State : Factor w/ 50 levels "AK","AL","AR",..: 25 5 48 37 4 9 18 10 47 38 ...
LET’S CHECK WHAT PERCENTAGE OF DATA IS MISSING IN EACH COLUMN
# Percentage (0-100) of entries in `x` that are NA.
p <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values per column. vapply() walks the data frame's
# columns directly, so each column keeps its own type (apply() would first
# coerce the whole mixed-type data frame to a matrix), and the result is
# checked to be exactly one number per column.
vapply(vdata, p, numeric(1))
## vehicle fm Mileage lh lc mc State
## 0.0000000 0.0000000 0.8004926 0.3694581 0.4926108 0.0000000 0.9236453
LET’S CHECK PATTERN OF MISSING DATA
# Tabulate the distinct missing-data patterns in vdata. In the table printed
# below, 1 marks an observed value and 0 a missing one; the left margin is the
# number of rows showing that pattern.
md.pattern(vdata)
## vehicle fm mc lh lc Mileage State
## 1586 1 1 1 1 1 1 1 0
## 11 1 1 1 1 1 1 0 1
## 13 1 1 1 1 1 0 1 1
## 6 1 1 1 1 0 1 1 1
## 2 1 1 1 1 0 1 0 2
## 4 1 1 1 0 1 1 1 1
## 2 1 1 1 0 1 1 0 2
## 0 0 0 6 8 13 15 42
LET’S COMPARE VARIABLES PAIRWISE FOR MISSING DATA
# Count observed/missing combinations for every pair of variables. The result
# has four matrices: rr (both observed), rm (row variable observed, column
# variable missing), mr (row missing, column observed) and mm (both missing).
md.pairs(vdata)
## $rr
## vehicle fm Mileage lh lc mc State
## vehicle 1624 1624 1611 1618 1616 1624 1609
## fm 1624 1624 1611 1618 1616 1624 1609
## Mileage 1611 1611 1611 1605 1603 1611 1596
## lh 1618 1618 1605 1618 1610 1618 1605
## lc 1616 1616 1603 1610 1616 1616 1603
## mc 1624 1624 1611 1618 1616 1624 1609
## State 1609 1609 1596 1605 1603 1609 1609
##
## $rm
## vehicle fm Mileage lh lc mc State
## vehicle 0 0 13 6 8 0 15
## fm 0 0 13 6 8 0 15
## Mileage 0 0 0 6 8 0 15
## lh 0 0 13 0 8 0 13
## lc 0 0 13 6 0 0 13
## mc 0 0 13 6 8 0 15
## State 0 0 13 4 6 0 0
##
## $mr
## vehicle fm Mileage lh lc mc State
## vehicle 0 0 0 0 0 0 0
## fm 0 0 0 0 0 0 0
## Mileage 13 13 0 13 13 13 13
## lh 6 6 6 0 6 6 4
## lc 8 8 8 8 0 8 6
## mc 0 0 0 0 0 0 0
## State 15 15 15 13 13 15 0
##
## $mm
## vehicle fm Mileage lh lc mc State
## vehicle 0 0 0 0 0 0 0
## fm 0 0 0 0 0 0 0
## Mileage 0 0 13 0 0 0 0
## lh 0 0 0 6 0 0 2
## lc 0 0 0 0 8 0 2
## mc 0 0 0 0 0 0 0
## State 0 0 0 2 2 0 15
LET’S CHECK A MARGIN PLOT
# VIM margin plot for the Mileage / lc pair of columns.
marginplot(vdata[, c("Mileage", "lc")])
PERFORMING IMPUTATION ON MISSING DATA; WE WILL GENERATE 3 IMPUTED DATA SETS
# Multiple imputation on columns 2-7 (everything except the first column,
# `vehicle`). m = 3 requests three imputed data sets; the fixed seed makes the
# run reproducible.
impute <- mice(vdata[, 2:7], m = 3, seed = 123)
##
## iter imp variable
## 1 1 Mileage lh lc State
## 1 2 Mileage lh lc State
## 1 3 Mileage lh lc State
## 2 1 Mileage lh lc State
## 2 2 Mileage lh lc State
## 2 3 Mileage lh lc State
## 3 1 Mileage lh lc State
## 3 2 Mileage lh lc State
## 3 3 Mileage lh lc State
## 4 1 Mileage lh lc State
## 4 2 Mileage lh lc State
## 4 3 Mileage lh lc State
## 5 1 Mileage lh lc State
## 5 2 Mileage lh lc State
## 5 3 Mileage lh lc State
# Summarise the imputation object: number of imputations, the method chosen
# per variable, and the predictor matrix.
print(impute)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## fm Mileage lh lc mc State
## "" "pmm" "pmm" "pmm" "" "polyreg"
## PredictorMatrix:
## fm Mileage lh lc mc State
## fm 0 1 1 1 1 1
## Mileage 1 0 1 1 1 1
## lh 1 1 0 1 1 1
## lc 1 1 1 0 1 1
## mc 1 1 1 1 0 1
## State 1 1 1 1 1 0
# Imputed values for State: one row per record whose State was missing, one
# column per imputation.
impute$imp[["State"]]
## 1 2 3
## 68 SD MA GA
## 69 VT AZ NV
## 142 CT IA OH
## 143 MI OK CA
## 144 PA CT CA
## 145 FL MA NY
## 146 TX WI CA
## 147 CA NC CA
## 828 MO CA MT
## 829 MA VT OH
## 830 CA KS TX
## 1348 TN NE SD
## 1598 TX AK TN
## 1599 FL CA FL
## 1600 KY OH TX
LET’S COMPLETE THE DATA USING 1st IMPUTATION SET FROM impute
# Fill the missing cells with the values from imputation number 1.
newVDATA <- complete(impute, action = 1)
DISTRIBUTION OF OBSERVED/IMPUTED VALUES
# Strip plot of the observed and imputed values held in the imputation object.
stripplot(impute, pch = 20, cex = 1.2)
# Scatter plot of lc against lh, conditioned on the imputation number (.imp).
xyplot(impute, lc ~ lh | .imp, pch = 20, cex = 1.4)