mice.knit

---
title: "Mice"
author: "William"
date: "2022-03-16"
output: html_document
---

Imputing the missing values.

Mice comes in handy when imputing categorical records

# Check the summary statistics of every column in data1
summary(data1)

##       age            bmi             hyp             chl       
##  Min.   :1.00   Min.   :20.40   Min.   :1.000   Min.   :113.0  
##  1st Qu.:1.00   1st Qu.:22.65   1st Qu.:1.000   1st Qu.:185.0  
##  Median :2.00   Median :26.75   Median :1.000   Median :187.0  
##  Mean   :1.76   Mean   :26.56   Mean   :1.235   Mean   :191.4  
##  3rd Qu.:2.00   3rd Qu.:28.93   3rd Qu.:1.000   3rd Qu.:212.0  
##  Max.   :3.00   Max.   :35.30   Max.   :2.000   Max.   :284.0  
##                 NA's   :9       NA's   :8       NA's   :10

Observation: the min, median, and mean are not meaningful for a binary variable such as hyp. Therefore we need to convert the variable to the right data type (a factor).

# Convert hyp to a factor so it is treated as a categorical variable
data1[["hyp"]] <- as.factor(data1[["hyp"]])
# Confirm the change
is.factor(data1[["hyp"]])

## [1] TRUE

# Check the summary again; hyp is now reported as counts per level
summary(data1)

##       age            bmi          hyp          chl       
##  Min.   :1.00   Min.   :20.40   1   :13   Min.   :113.0  
##  1st Qu.:1.00   1st Qu.:22.65   2   : 4   1st Qu.:185.0  
##  Median :2.00   Median :26.75   NA's: 8   Median :187.0  
##  Mean   :1.76   Mean   :26.56             Mean   :191.4  
##  3rd Qu.:2.00   3rd Qu.:28.93             3rd Qu.:212.0  
##  Max.   :3.00   Max.   :35.30             Max.   :284.0  
##                 NA's   :9                 NA's   :10

# bmi and chl are continuous variables, so fill their missing entries with
# the mean of the observed (non-missing) values
data1$bmi[is.na(data1$bmi)] <- mean(data1$bmi, na.rm = TRUE)
data1$chl[is.na(data1$chl)] <- mean(data1$chl, na.rm = TRUE)
# Preview the first rows to confirm the replacement
head(data1)

##   age     bmi  hyp   chl
## 1   1 26.5625 <NA> 191.4
## 2   2 22.7000    1 187.0
## 3   1 26.5625    1 187.0
## 4   3 26.5625 <NA> 191.4
## 5   1 20.4000    1 113.0
## 6   3 26.5625 <NA> 184.0

Multivariate Imputation by Chained Equations (MICE)

For binary or categorical variables we can impute using MICE; continuous variables can be imputed with mice too.

# Check the available imputation functions: open the help page for mice
?mice 
# or list the functions whose names start with "mice."
# NOTE(review): mice is not an S3 generic, so methods() emits a warning and
# also lists non-imputation functions such as mice.mids and mice.theme.
methods(mice)

## Warning in .S3methods(generic.function, class, envir): function 'mice' appears
## not to be S3 generic; found functions that look like S3 methods

##  [1] mice.impute.2l.bin              mice.impute.2l.lmer            
##  [3] mice.impute.2l.norm             mice.impute.2l.pan             
##  [5] mice.impute.2lonly.mean         mice.impute.2lonly.norm        
##  [7] mice.impute.2lonly.pmm          mice.impute.cart               
##  [9] mice.impute.jomoImpute          mice.impute.lasso.logreg       
## [11] mice.impute.lasso.norm          mice.impute.lasso.select.logreg
## [13] mice.impute.lasso.select.norm   mice.impute.lda                
## [15] mice.impute.logreg              mice.impute.logreg.boot        
## [17] mice.impute.mean                mice.impute.midastouch         
## [19] mice.impute.mnar.logreg         mice.impute.mnar.norm          
## [21] mice.impute.norm                mice.impute.norm.boot          
## [23] mice.impute.norm.nob            mice.impute.norm.predict       
## [25] mice.impute.panImpute           mice.impute.passive            
## [27] mice.impute.pmm                 mice.impute.polr               
## [29] mice.impute.polyreg             mice.impute.quadratic          
## [31] mice.impute.rf                  mice.impute.ri                 
## [33] mice.impute.sample              mice.mids                      
## [35] mice.theme                     
## see '?methods' for accessing help and source code

#use original dataset

# Start again from the built-in nhanes dataset
data2 <- nhanes

# Look at the first 6 rows
head(data2, n = 6L)

##   age  bmi hyp chl
## 1   1   NA  NA  NA
## 2   2 22.7   1 187
## 3   1   NA   1 187
## 4   3   NA  NA  NA
## 5   1 20.4   1 113
## 6   3   NA  NA 184

# Convert hyp to a factor so it is treated as a categorical variable
data2[["hyp"]] <- as.factor(data2[["hyp"]])
# Confirm the change
is.factor(data2[["hyp"]])

## [1] TRUE

#Mice imputation (uses chained imputation: a different statistical method can be applied to each specific column)

# Run the chained-equations imputation and store the resulting object
library(mice)
# `method` is matched to the columns by position (age, bmi, hyp, chl).
# Choose the method you consider appropriate for each column:
#   ""       - age has no missing values, so it is not imputed
#   "pmm"    - predictive mean matching for the numeric columns bmi and chl
#   "logreg" - logistic regression for the binary factor hyp
# m = 5 produces five imputed datasets; maxit = 20 runs twenty iterations.
# A fixed seed makes the imputed values reproducible between runs, so that
# selecting a particular imputation number afterwards stays meaningful.
my_imp <- mice(
  data2,
  m = 5,
  method = c("", "pmm", "logreg", "pmm"),
  maxit = 20,
  seed = 123
)

## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
##   6   1  bmi  hyp  chl
##   6   2  bmi  hyp  chl
##   6   3  bmi  hyp  chl
##   6   4  bmi  hyp  chl
##   6   5  bmi  hyp  chl
##   7   1  bmi  hyp  chl
##   7   2  bmi  hyp  chl
##   7   3  bmi  hyp  chl
##   7   4  bmi  hyp  chl
##   7   5  bmi  hyp  chl
##   8   1  bmi  hyp  chl
##   8   2  bmi  hyp  chl
##   8   3  bmi  hyp  chl
##   8   4  bmi  hyp  chl
##   8   5  bmi  hyp  chl
##   9   1  bmi  hyp  chl
##   9   2  bmi  hyp  chl
##   9   3  bmi  hyp  chl
##   9   4  bmi  hyp  chl
##   9   5  bmi  hyp  chl
##   10   1  bmi  hyp  chl
##   10   2  bmi  hyp  chl
##   10   3  bmi  hyp  chl
##   10   4  bmi  hyp  chl
##   10   5  bmi  hyp  chl
##   11   1  bmi  hyp  chl
##   11   2  bmi  hyp  chl
##   11   3  bmi  hyp  chl
##   11   4  bmi  hyp  chl
##   11   5  bmi  hyp  chl
##   12   1  bmi  hyp  chl
##   12   2  bmi  hyp  chl
##   12   3  bmi  hyp  chl
##   12   4  bmi  hyp  chl
##   12   5  bmi  hyp  chl
##   13   1  bmi  hyp  chl
##   13   2  bmi  hyp  chl
##   13   3  bmi  hyp  chl
##   13   4  bmi  hyp  chl
##   13   5  bmi  hyp  chl
##   14   1  bmi  hyp  chl
##   14   2  bmi  hyp  chl
##   14   3  bmi  hyp  chl
##   14   4  bmi  hyp  chl
##   14   5  bmi  hyp  chl
##   15   1  bmi  hyp  chl
##   15   2  bmi  hyp  chl
##   15   3  bmi  hyp  chl
##   15   4  bmi  hyp  chl
##   15   5  bmi  hyp  chl
##   16   1  bmi  hyp  chl
##   16   2  bmi  hyp  chl
##   16   3  bmi  hyp  chl
##   16   4  bmi  hyp  chl
##   16   5  bmi  hyp  chl
##   17   1  bmi  hyp  chl
##   17   2  bmi  hyp  chl
##   17   3  bmi  hyp  chl
##   17   4  bmi  hyp  chl
##   17   5  bmi  hyp  chl
##   18   1  bmi  hyp  chl
##   18   2  bmi  hyp  chl
##   18   3  bmi  hyp  chl
##   18   4  bmi  hyp  chl
##   18   5  bmi  hyp  chl
##   19   1  bmi  hyp  chl
##   19   2  bmi  hyp  chl
##   19   3  bmi  hyp  chl
##   19   4  bmi  hyp  chl
##   19   5  bmi  hyp  chl
##   20   1  bmi  hyp  chl
##   20   2  bmi  hyp  chl
##   20   3  bmi  hyp  chl
##   20   4  bmi  hyp  chl
##   20   5  bmi  hyp  chl

# Summary of the observed bmi values in data2 (missing values still present)
summary(data2[["bmi"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   20.40   22.65   26.75   26.56   28.93   35.30       9

# Imputed bmi values: one row per missing observation, one column per imputation
my_imp[["imp"]][["bmi"]]

##       1    2    3    4    5
## 1  30.1 30.1 22.0 27.2 30.1
## 3  35.3 27.2 27.4 30.1 29.6
## 4  22.5 21.7 22.5 20.4 21.7
## 6  28.7 27.4 21.7 25.5 22.7
## 10 30.1 22.0 22.7 30.1 25.5
## 11 35.3 28.7 21.7 22.0 27.2
## 12 28.7 24.9 22.7 27.4 22.7
## 16 29.6 35.3 35.3 26.3 22.5
## 21 27.2 30.1 28.7 22.0 20.4

Imputation number 5 gives better values, so we use it for the final dataset.

# Build the final dataset by filling the gaps with imputation number 5
clean_data2 <- complete(my_imp, action = 5)
clean_data2

##    age  bmi hyp chl
## 1    1 30.1   1 187
## 2    2 22.7   1 187
## 3    1 29.6   1 187
## 4    3 21.7   2 206
## 5    1 20.4   1 113
## 6    3 22.7   2 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 25.5   1 204
## 11   1 27.2   1 118
## 12   2 22.7   2 187
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 187
## 16   1 22.5   1 238
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 184
## 21   1 20.4   1 113
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 184
## 25   2 27.4   1 186