## Load VIM package for Visualization and Imputation of Missing Values
library(VIM)
## Load sleep data in VIM
data(sleep, package = "VIM")
## Show complete cases
sleep[complete.cases(sleep),]
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
2 1.000 6.60 6.3 2.0 8.3 4.5 42.0 3 1 3
5 2547.000 4603.00 2.1 1.8 3.9 69.0 624.0 3 5 4
6 10.550 179.50 9.1 0.7 9.8 27.0 180.0 4 4 4
7 0.023 0.30 15.8 3.9 19.7 19.0 35.0 1 1 1
8 160.000 169.00 5.2 1.0 6.2 30.4 392.0 4 5 4
9 3.300 25.60 10.9 3.6 14.5 28.0 63.0 1 2 1
10 52.160 440.00 8.3 1.4 9.7 50.0 230.0 1 1 1
11 0.425 6.40 11.0 1.5 12.5 7.0 112.0 5 4 4
12 465.000 423.00 3.2 0.7 3.9 30.0 281.0 5 5 5
15 0.075 1.20 6.3 2.1 8.4 3.5 42.0 1 1 1
16 3.000 25.00 8.6 0.0 8.6 50.0 28.0 2 2 2
17 0.785 3.50 6.6 4.1 10.7 6.0 42.0 2 2 2
18 0.200 5.00 9.5 1.2 10.7 10.4 120.0 2 2 2
22 27.660 115.00 3.3 0.5 3.8 20.0 148.0 5 5 5
23 0.120 1.00 11.0 3.4 14.4 3.9 16.0 3 1 2
25 85.000 325.00 4.7 1.5 6.2 41.0 310.0 1 3 1
27 0.101 4.00 10.4 3.4 13.8 9.0 28.0 5 1 3
28 1.040 5.50 7.4 0.8 8.2 7.6 68.0 5 3 4
29 521.000 655.00 2.1 0.8 2.9 46.0 336.0 5 5 5
32 0.005 0.14 7.7 1.4 9.1 2.6 21.5 5 2 4
33 0.010 0.25 17.9 2.0 19.9 24.0 50.0 1 1 1
34 62.000 1320.00 6.1 1.9 8.0 100.0 267.0 1 1 1
37 0.023 0.40 11.9 1.3 13.2 3.2 19.0 4 1 3
38 0.048 0.33 10.8 2.0 12.8 2.0 30.0 4 1 3
39 1.700 6.30 13.8 5.6 19.4 5.0 12.0 2 1 1
40 3.500 10.80 14.3 3.1 17.4 6.5 120.0 2 1 1
42 0.480 15.50 15.2 1.8 17.0 12.0 140.0 2 2 2
43 10.000 115.00 10.0 0.9 10.9 20.2 170.0 4 4 4
44 1.620 11.40 11.9 1.8 13.7 13.0 17.0 2 1 2
45 192.000 180.00 6.5 1.9 8.4 27.0 115.0 4 4 4
46 2.500 12.10 7.5 0.9 8.4 18.0 31.0 5 5 5
48 0.280 1.90 10.6 2.6 13.2 4.7 21.0 3 1 3
49 4.235 50.40 7.4 2.4 9.8 9.8 52.0 1 1 1
50 6.800 179.00 8.4 1.2 9.6 29.0 164.0 2 3 2
51 0.750 12.30 5.7 0.9 6.6 7.0 225.0 2 2 2
52 3.600 21.00 4.9 0.5 5.4 6.0 225.0 3 2 3
54 55.500 175.00 3.2 0.6 3.8 20.0 151.0 5 5 5
57 0.900 2.60 11.0 2.3 13.3 4.5 60.0 2 1 2
58 2.000 12.30 4.9 0.5 5.4 7.5 200.0 3 1 3
59 0.104 2.50 13.2 2.6 15.8 2.3 46.0 3 2 2
60 4.190 58.00 9.7 0.6 10.3 24.0 210.0 4 3 4
61 3.500 3.90 12.8 6.6 19.4 3.0 14.0 2 1 1
## Show cases with missing values
sleep[!complete.cases(sleep),]
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3
3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1
4 0.920 5.7 NA NA 16.5 NA 25 5 2 3
13 0.550 2.4 7.6 2.7 10.3 NA NA 2 1 2
14 187.100 419.0 NA NA 3.1 40.0 365 5 5 5
19 1.410 17.5 4.8 1.3 6.1 34.0 NA 1 2 1
20 60.000 81.0 12.0 6.1 18.1 7.0 NA 1 1 1
21 529.000 680.0 NA 0.3 NA 28.0 400 5 5 5
24 207.000 406.0 NA NA 12.0 39.3 252 1 4 1
26 36.330 119.5 NA NA 13.0 16.2 63 1 1 1
30 100.000 157.0 NA NA 10.8 22.4 100 1 1 1
31 35.000 56.0 NA NA NA 16.3 33 3 5 4
35 0.122 3.0 8.2 2.4 10.6 NA 30 2 1 1
36 1.350 8.1 8.4 2.8 11.2 NA 45 3 1 3
41 250.000 490.0 NA 1.0 NA 23.6 440 5 5 5
47 4.288 39.2 NA NA 12.5 13.7 63 2 2 2
53 14.830 98.2 NA NA 2.6 17.0 150 5 5 5
55 1.400 12.5 NA NA 11.0 12.7 90 2 2 2
56 0.060 1.0 8.1 2.2 10.3 3.5 NA 3 1 2
62 4.050 17.0 NA NA NA 13.0 38 3 1 1
## Load mice package for Multivariate Imputation by Chained Equations (MICE)
library(mice)
md.pattern(sleep)
BodyWgt BrainWgt Pred Exp Danger Sleep Span Gest Dream NonD
42 1 1 1 1 1 1 1 1 1 1 0 # 42 obs. with 0 missing values
2 1 1 1 1 1 1 0 1 1 1 1 # 2 obs. with 1 missing value (Span)
3 1 1 1 1 1 1 1 0 1 1 1 # 3 obs. with 1 missing value (Gest)
9 1 1 1 1 1 1 1 1 0 0 2 # 2 obs. with 2 missing values (Dream, NonD)
2 1 1 1 1 1 0 1 1 1 0 2 # etc.
1 1 1 1 1 1 1 0 0 1 1 2
2 1 1 1 1 1 0 1 1 0 0 3
1 1 1 1 1 1 1 0 1 0 0 3
0 0 0 0 0 4 4 4 12 14 38
## in number
aggr(sleep, prop = F, numbers = T)
## in proportions
aggr(sleep, prop = T, numbers = T)
## Matrix plot. Red for missing values, Darker values are high values.
matrixplot(sleep, interactive = F, sortby = "Sleep")
## Margin plot. Red dots have at least one missing. No observation with two missing values here.
marginplot(sleep[,c("Gest","Dream")])
scattmatrixMiss(sleep, interactive = F, highlight = c("Sleep"))
## Create data frame indicating missingness by 1
x <- as.data.frame(abs(is.na(sleep)))
## Select columns with some (but not all) missing values
y <- x[,sapply(x, sd) > 0]
## Create a correlation matrix: Variables missing together have high correlation
cor(y)
NonD Dream Sleep Span Gest
NonD 1.0000 0.90711 0.48626 0.01520 -0.14183
Dream 0.9071 1.00000 0.20370 0.03752 -0.12865
Sleep 0.4863 0.20370 1.00000 -0.06897 -0.06897
Span 0.0152 0.03752 -0.06897 1.00000 0.19828
Gest -0.1418 -0.12865 -0.06897 0.19828 1.00000
## Rows are observed variables, columns are indicator variables for missingness
## high correlation means the row variables is strongly correlated with missingness of the column variable
cor(sleep, y, use = "pairwise.complete.obs")
NonD Dream Sleep Span Gest
BodyWgt 0.22683 0.22259 0.001685 -0.05832 -0.05397
BrainWgt 0.17946 0.16321 0.007859 -0.07921 -0.07333
NonD NA NA NA -0.04315 -0.04553
Dream -0.18895 NA -0.188952 0.11699 0.22775
Sleep -0.08023 -0.08023 NA 0.09638 0.03976
Span 0.08336 0.05981 0.005239 NA -0.06527
Gest 0.20239 0.05140 0.159702 -0.17495 NA
Pred 0.04758 -0.06834 0.202463 0.02314 -0.20102
Exp 0.24547 0.12741 0.260773 -0.19292 -0.19292
Danger 0.06528 -0.06725 0.208884 -0.06666 -0.20444
## Multivariate Imputation by Chained Equations (MICE)
system.time(mi.sleep <- mice(sleep, seed = 1234, printFlag = FALSE))
user system elapsed
0.628 0.019 0.646
## Check mi object
mice:::print.mids(mi.sleep)
Multiply imputed data set
Call:
mice(data = sleep, printFlag = FALSE, seed = 1234)
Number of multiple imputations: 5
Missing cells per column:
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
0 0 14 12 4 4 4 0 0 0
Imputation methods:
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
"" "" "pmm" "pmm" "pmm" "pmm" "pmm" "" "" ""
VisitSequence:
NonD Dream Sleep Span Gest
3 4 5 6 7
PredictorMatrix:
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
BodyWgt 0 0 0 0 0 0 0 0 0 0
BrainWgt 0 0 0 0 0 0 0 0 0 0
NonD 1 1 0 1 1 1 1 1 1 1
Dream 1 1 1 0 1 1 1 1 1 1
Sleep 1 1 1 1 0 1 1 1 1 1
Span 1 1 1 1 1 0 1 1 1 1
Gest 1 1 1 1 1 1 0 1 1 1
Pred 0 0 0 0 0 0 0 0 0 0
Exp 0 0 0 0 0 0 0 0 0 0
Danger 0 0 0 0 0 0 0 0 0 0
Random generator seed value: 1234
## Plot mi object
plot(mi.sleep)
## Check imputed values for Dream variable (5 columns for 5 datasets)
mi.sleep$imp$Dream
1 2 3 4 5
1 0.5 0.5 0.5 0.5 0.0
3 2.3 2.4 1.9 1.5 2.4
4 1.2 1.3 5.6 2.3 1.3
14 0.6 1.0 0.0 0.3 0.5
24 1.2 1.0 5.6 1.0 6.6
26 1.9 6.6 0.9 2.2 2.0
30 1.0 1.2 2.6 2.3 1.4
31 5.6 0.5 1.2 0.5 1.4
47 0.7 0.6 1.4 1.8 3.6
53 0.7 0.5 0.7 0.5 0.5
55 0.5 2.4 0.7 2.6 2.6
62 1.9 1.4 3.6 5.6 6.6
## Show 3rd imputed dataset
complete(mi.sleep, action = 3)
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.00 2.1 0.5 3.3 38.6 645.0 3 5 3
2 1.000 6.60 6.3 2.0 8.3 4.5 42.0 3 1 3
3 3.385 44.50 10.6 1.9 12.5 14.0 60.0 1 1 1
4 0.920 5.70 11.0 5.6 16.5 4.7 25.0 5 2 3
5 2547.000 4603.00 2.1 1.8 3.9 69.0 624.0 3 5 4
6 10.550 179.50 9.1 0.7 9.8 27.0 180.0 4 4 4
7 0.023 0.30 15.8 3.9 19.7 19.0 35.0 1 1 1
8 160.000 169.00 5.2 1.0 6.2 30.4 392.0 4 5 4
9 3.300 25.60 10.9 3.6 14.5 28.0 63.0 1 2 1
10 52.160 440.00 8.3 1.4 9.7 50.0 230.0 1 1 1
11 0.425 6.40 11.0 1.5 12.5 7.0 112.0 5 4 4
12 465.000 423.00 3.2 0.7 3.9 30.0 281.0 5 5 5
13 0.550 2.40 7.6 2.7 10.3 24.0 21.0 2 1 2
14 187.100 419.00 3.2 0.0 3.1 40.0 365.0 5 5 5
15 0.075 1.20 6.3 2.1 8.4 3.5 42.0 1 1 1
16 3.000 25.00 8.6 0.0 8.6 50.0 28.0 2 2 2
17 0.785 3.50 6.6 4.1 10.7 6.0 42.0 2 2 2
18 0.200 5.00 9.5 1.2 10.7 10.4 120.0 2 2 2
19 1.410 17.50 4.8 1.3 6.1 34.0 120.0 1 2 1
20 60.000 81.00 12.0 6.1 18.1 7.0 25.0 1 1 1
21 529.000 680.00 14.3 0.3 13.8 28.0 400.0 5 5 5
22 27.660 115.00 3.3 0.5 3.8 20.0 148.0 5 5 5
23 0.120 1.00 11.0 3.4 14.4 3.9 16.0 3 1 2
24 207.000 406.00 6.3 5.6 12.0 39.3 252.0 1 4 1
25 85.000 325.00 4.7 1.5 6.2 41.0 310.0 1 3 1
26 36.330 119.50 12.0 0.9 13.0 16.2 63.0 1 1 1
27 0.101 4.00 10.4 3.4 13.8 9.0 28.0 5 1 3
28 1.040 5.50 7.4 0.8 8.2 7.6 68.0 5 3 4
29 521.000 655.00 2.1 0.8 2.9 46.0 336.0 5 5 5
30 100.000 157.00 8.4 2.6 10.8 22.4 100.0 1 1 1
31 35.000 56.00 2.1 1.2 3.3 16.3 33.0 3 5 4
32 0.005 0.14 7.7 1.4 9.1 2.6 21.5 5 2 4
33 0.010 0.25 17.9 2.0 19.9 24.0 50.0 1 1 1
34 62.000 1320.00 6.1 1.9 8.0 100.0 267.0 1 1 1
35 0.122 3.00 8.2 2.4 10.6 28.0 30.0 2 1 1
36 1.350 8.10 8.4 2.8 11.2 9.0 45.0 3 1 3
37 0.023 0.40 11.9 1.3 13.2 3.2 19.0 4 1 3
38 0.048 0.33 10.8 2.0 12.8 2.0 30.0 4 1 3
39 1.700 6.30 13.8 5.6 19.4 5.0 12.0 2 1 1
40 3.500 10.80 14.3 3.1 17.4 6.5 120.0 2 1 1
41 250.000 490.00 7.4 1.0 8.4 23.6 440.0 5 5 5
42 0.480 15.50 15.2 1.8 17.0 12.0 140.0 2 2 2
43 10.000 115.00 10.0 0.9 10.9 20.2 170.0 4 4 4
44 1.620 11.40 11.9 1.8 13.7 13.0 17.0 2 1 2
45 192.000 180.00 6.5 1.9 8.4 27.0 115.0 4 4 4
46 2.500 12.10 7.5 0.9 8.4 18.0 31.0 5 5 5
47 4.288 39.20 11.0 1.4 12.5 13.7 63.0 2 2 2
48 0.280 1.90 10.6 2.6 13.2 4.7 21.0 3 1 3
49 4.235 50.40 7.4 2.4 9.8 9.8 52.0 1 1 1
50 6.800 179.00 8.4 1.2 9.6 29.0 164.0 2 3 2
51 0.750 12.30 5.7 0.9 6.6 7.0 225.0 2 2 2
52 3.600 21.00 4.9 0.5 5.4 6.0 225.0 3 2 3
53 14.830 98.20 2.1 0.7 2.6 17.0 150.0 5 5 5
54 55.500 175.00 3.2 0.6 3.8 20.0 151.0 5 5 5
55 1.400 12.50 10.4 0.7 11.0 12.7 90.0 2 2 2
56 0.060 1.00 8.1 2.2 10.3 3.5 19.0 3 1 2
57 0.900 2.60 11.0 2.3 13.3 4.5 60.0 2 1 2
58 2.000 12.30 4.9 0.5 5.4 7.5 200.0 3 1 3
59 0.104 2.50 13.2 2.6 15.8 2.3 46.0 3 2 2
60 4.190 58.00 9.7 0.6 10.3 24.0 210.0 4 3 4
61 3.500 3.90 12.8 6.6 19.4 3.0 14.0 2 1 1
62 4.050 17.00 15.2 3.6 19.4 13.0 38.0 3 1 1
## Fit a model for each imputed dataset
mi.fit.Dream.by.Span.Gest <- with(mi.sleep, lm(Dream ~ Span + Gest))
## Pool results from imputed datasets
mi.pool.res <- pool(mi.fit.Dream.by.Span.Gest)
## Show summary: nmis: number missing; fmi: fraction of missing information
summary(mi.pool.res)
est se t df Pr(>|t|) lo 95 hi 95 nmis fmi lambda
(Intercept) 2.588583 0.275515 9.3954 52.14 8.344e-13 2.035756 3.141411 NA 0.08701 0.05265
Span -0.002763 0.012953 -0.2133 52.89 8.319e-01 -0.028744 0.023219 4 0.08061 0.04648
Gest -0.004206 0.001575 -2.6707 55.62 9.907e-03 -0.007361 -0.001051 4 0.05369 0.02027
## Complete case analysis
cc.res <- with(sleep, lm(Dream ~ Span + Gest))
summary(cc.res)
Call:
lm(formula = Dream ~ Span + Gest)
Residuals:
Min 1Q Median 3Q Max
-2.313 -0.858 -0.218 0.408 4.184
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.478748 0.290662 8.53 1.3e-10 ***
Span -0.000887 0.012310 -0.07 0.943
Gest -0.004319 0.001756 -2.46 0.018 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.27 on 41 degrees of freedom
(18 observations deleted due to missingness)
Multiple R-squared: 0.195, Adjusted R-squared: 0.156
F-statistic: 4.98 on 2 and 41 DF, p-value: 0.0116
## Load Amelia
library(Amelia)
## Perform imputation
system.time(am.sleep <- amelia(x = sleep, parallel = "multicore", p2s = 0))
user system elapsed
0.264 0.002 0.266
## Summary of imputed dataset
summary(am.sleep)
Amelia output with 5 imputed datasets.
Return code: 1
Message: Normal EM convergence.
Chain Lengths:
--------------
Imputation 1: 232
Imputation 2: 928
Imputation 3: 10
Imputation 4: 221
Imputation 5: 158
Rows after Listwise Deletion: 42
Rows after Imputation: 62
Patterns of missingness in the data: 8
Fraction Missing for original variables:
-----------------------------------------
Fraction Missing
BodyWgt 0.00000
BrainWgt 0.00000
NonD 0.22581
Dream 0.19355
Sleep 0.06452
Span 0.06452
Gest 0.06452
Pred 0.00000
Exp 0.00000
Danger 0.00000
## Plot for imformation
plot(am.sleep)
## Load Zelig
library(Zelig)
## Fit linear model using zelig
res.zelig <- zelig(Dream ~ Span + Gest, data = am.sleep, model = "normal")
How to cite this model in Zelig:
Kosuke Imai, Gary King, and Olivia Lau. 2013.
"normal: Normal Regression for Continuous Dependent Variables"
in Kosuke Imai, Gary King, and Olivia Lau, "Zelig: Everyone's Statistical Software,"
http://gking.harvard.edu/zelig
## Show results
summary(res.zelig)
Model: normal
Number of multiply imputed data sets: 5
Combined results:
Call:
glm(formula = formula, weights = weights, family = gaussian,
model = F, data = data)
Coefficients:
Value Std. Error t-stat p-value
(Intercept) 2.338174 0.79656 2.9353 0.006881
Span -0.029824 0.04635 -0.6435 0.533045
Gest 0.003841 0.01226 0.3133 0.767200
For combined results from datasets i to j, use summary(x, subset = i:j).
For separate results, use print(summary(x), subset = i:j).