# 
# CRP 245 Part 2 Day 6
#
# Missing Data
#
## This file contains the R code used in  

## Van Buuren, S., Groothuis-Oudshoorn, K. (2011) 
## mice: Multivariate Imputation by Chained Equations in R. 
## Journal of Statistical Software, 45(3).
#
# https://www.jstatsoft.org/article/view/v045i03

###
###   2.4 Simple example
###

###################################################
### load mice package
###################################################
# install.packages('mice')
library("mice")
## Warning: package 'mice' was built under R version 4.0.4
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
###################################################
### Load nhanes sample data
###################################################
data(nhanes)
str(nhanes)
## 'data.frame':    25 obs. of  4 variables:
##  $ age: num  1 2 1 3 1 3 1 1 2 2 ...
##  $ bmi: num  NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ...
##  $ hyp: num  NA 1 1 NA 1 NA 1 1 1 NA ...
##  $ chl: num  NA 187 187 NA 113 184 118 187 238 NA ...
###################################################
### examine missing data patterns
###################################################
md.pattern(nhanes)

##    age hyp bmi chl   
## 13   1   1   1   1  0
## 3    1   1   1   0  1
## 1    1   1   0   1  1
## 1    1   0   0   1  2
## 7    1   0   0   0  3
##      0   8   9  10 27
# Missingness pattern can also be visualised in VIM package by
library(VIM)
## Warning: package 'VIM' was built under R version 4.0.4
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
nhanes_aggr = aggr(nhanes, col=mdc(1:2), numbers=TRUE, 
                   sortVars=TRUE, labels=names(nhanes), 
                   cex.axis=.7, gap=3, 
                   ylab=c("Proportion of missingness","Missingness Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable Count
##       chl  0.40
##       bmi  0.36
##       hyp  0.32
##       age  0.00
# blue box plots summarise the distribution of observed data given the other variable is observed, 
# and red box plots summarise the distribution of observed data given the other variable is missing.
# For example, the red box plot in the bottom margin shows that bmi is rather missing for lower cholesterol levels. 
# Note that, if data are MCAR, we expect the blue and red box plots to be identical.

# margin plots are also useful
# install.packages('VIM')
library("VIM")
marginplot(nhanes[,c("chl","bmi")], col=mdc(1:2), cex=1.2, cex.lab=1.2, cex.numbers=1.3, pch=19)

# blue box plots summarise the distribution of observed data given 
# the other variable is observed, and red box plots summarise the 
# distribution of observed data given the other variable is missing.
# For example, the red box plot in the bottom margin shows that bmi
#is rather missing for lower cholesterol levels. 
# Note that, if data are MCAR, we expect the blue
# and red box plots to be identical.

###################################################
### perform imputations
###################################################
imp <- mice(nhanes, seed=23109)
## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
###################################################
### inspect actual imputed values
###################################################
imp$imp$bmi
##       1    2    3    4    5
## 1  27.2 27.2 29.6 27.4 20.4
## 3  27.2 27.2 29.6 35.3 27.2
## 4  24.9 25.5 25.5 20.4 22.0
## 6  21.7 21.7 20.4 27.4 25.5
## 10 22.5 22.5 22.5 33.2 26.3
## 11 29.6 27.2 27.2 35.3 33.2
## 12 22.0 25.5 27.2 30.1 28.7
## 16 35.3 27.2 35.3 29.6 27.2
## 21 28.7 25.5 22.5 33.2 20.4
###################################################
### show completed data for imputation 1
### check for implausible values
###################################################
complete(imp)
##    age  bmi hyp chl
## 1    1 27.2   1 238
## 2    2 22.7   1 187
## 3    1 27.2   1 187
## 4    3 24.9   1 184
## 5    1 20.4   1 113
## 6    3 21.7   2 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 22.5   1 187
## 11   1 29.6   1 187
## 12   2 22.0   1 187
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 187
## 16   1 35.3   1 218
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 204
## 21   1 28.7   1 131
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 204
## 25   2 27.4   1 186
###################################################
### inspect distribution of original vs imputed data
# The figure shows the distributions of the four variables 
# as individual points. Blue points are observed, 
# the red points are imputed.
###################################################
stripplot(imp, pch=20, cex=1.2)

###################################################
# Imputations are plotted in red. The blue points are the 
# same across different panels, but the red point vary.
# The red points have more or less the same shape as blue 
# data, which indicates that they could have been plausible 
# measurements if they had not been missing.
###################################################
xyplot(imp, bmi~chl|.imp, pch=20, cex=1.4)

###################################################
### analyze imputed data
### mice() ----> with() ----> pool()
###################################################
fit <- with(imp, lm(chl~age+bmi))

###################################################
### summarize pooled results across imputations
###################################################
summary(pool(fit))
##          term   estimate std.error   statistic       df    p.value
## 1 (Intercept)  0.2189968 63.074241 0.003472047 14.80613 0.99727606
## 2         age 28.0471913 10.633709 2.637573645 11.58238 0.02223126
## 3         bmi  5.3831348  2.126757 2.531146954 11.55051 0.02702526
# original data with linear model fit for comparison
fit.original <- lm(chl~age+bmi,data=nhanes)
summary(fit.original)
## 
## Call:
## lm(formula = chl ~ age + bmi, data = nhanes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.187 -19.517  -0.310   6.915  60.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -80.194     58.772  -1.364 0.202327    
## age           53.069     11.293   4.699 0.000842 ***
## bmi            6.884      1.846   3.730 0.003913 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.67 on 10 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7318, Adjusted R-squared:  0.6781 
## F-statistic: 13.64 on 2 and 10 DF,  p-value: 0.001388
####end of code