#read data with missing
dat<-dat2<-dat2.5<-dat3<-dat7<-dat8<-read.csv("C:/Users/User/Desktop/nyulearningmaterial/2020Spring/missing data/week2/hw2/datawithmissing.csv",header = T)
#load packages
library(mi)
## Warning: package 'mi' was built under R version 3.6.2
## Loading required package: Matrix
## Loading required package: stats4
## mi (Version 1.0, packaged: 2015-04-16 14:03:10 UTC; goodrich)
## mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
library(mice)
## Warning: package 'mice' was built under R version 3.6.3
##
## Attaching package: 'mice'
## The following objects are masked from 'package:mi':
##
## complete, pool
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
1.Listwise delection
lw<-dat[complete.cases(dat),]
2.Mean/mode imputation
#Mean imputation
muimp<-mice(dat2[,2:3], method = "mean", m = 1, maxit = 1)
##
## iter imp variable
## 1 1 ptratio lstat
inp<-mice::complete(muimp,1)
dat2$ptratio<-inp$ptratio
dat2$lstat<-inp$lstat
#Mode imputation
table(dat2$chas)
##
## 0 1
## 431 31
dat2$chas[is.na(dat2$chas)]<-0
3.Random imputation
raninp<-mice(dat[,2:5], method = "sample", m = 1, maxit = 1)
##
## iter imp variable
## 1 1 ptratio lstat chas
raninp<-mice::complete(raninp,1)
dat3$ptratio<-raninp$ptratio
dat3$lstat<-raninp$lstat
dat3$chas<-raninp$chas
4.LVCF Not appliable.
5.Hotdecking
hd<-hotdeck(dat)[,1:5]
6.Regression imputation
#regression imputation
reginp<-mice(dat8[,2:3], method = "norm.nob", m = 1, maxit = 1)
##
## iter imp variable
## 1 1 ptratio lstat
reginp.com<-mice::complete(reginp,1)
dat7$ptratio<-reginp.com[,1]
dat7$lstat<-reginp.com[,2]
#glm imputation
raninp<-mice(dat[,4:5], method = "logreg", m = 1, maxit = 1)
## Warning: Type mismatch for variable(s): chas
## Imputation method logreg is for categorical data.
##
## iter imp variable
## 1 1 chas
raninp<-mice::complete(raninp,1)
dat7$chas<-raninp$chas
7.Regression imputation with noise
#regression with noice
reginpno<-mice(dat8[,2:3], method = "norm.predict", m = 1, maxit = 1)
##
## iter imp variable
## 1 1 ptratio lstat
reginpno.com<-mice::complete(reginpno,1)
dat8$ptratio<-reginpno.com[,1]
dat8$lstat<-reginpno.com[,2]
#glm with noise
Rc = as.numeric(!is.na(dat$chas))
dat.ccc = dat[Rc == 1, ]
dat.droppedc = dat[Rc == 0, ]
modg = glm(chas~ medv, data = dat.ccc,family ="binomial")
p.imp = predict(modg, newdata = dat.droppedc)
ps = predict(modg, newdata=dat.droppedc, type="response")
#add noise
dat8$chas[Rc == 0]<- rbinom(sum(Rc==0), 1, ps)
8.Load data into the package. Obtain summary, histogram, image of data.
mdf<-missing_data.frame(dat)
#Get the summary and number of NAs
summary(mdf)
## X ptratio lstat medv
## Min. : 1.0 Min. :12.60 Min. : 1.920 Min. : 5.00
## 1st Qu.:127.2 1st Qu.:17.40 1st Qu.: 7.037 1st Qu.:17.02
## Median :253.5 Median :19.10 Median :11.430 Median :21.20
## Mean :253.5 Mean :18.51 Mean :12.762 Mean :22.53
## 3rd Qu.:379.8 3rd Qu.:20.20 3rd Qu.:17.065 3rd Qu.:25.00
## Max. :506.0 Max. :22.00 Max. :37.970 Max. :50.00
## NA's :43 NA's :40
## chas
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0671
## 3rd Qu.:0.0000
## Max. :1.0000
## NA's :44
#histograms
aggr_plot <- aggr(dat, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## chas 0.08695652
## ptratio 0.08498024
## lstat 0.07905138
## X 0.00000000
## medv 0.00000000
#visuailzed missing pattern
image(mdf, grayscale=TRUE)
9.Check data type and make changes if necessary
show(mdf)
## Object of class missing_data.frame with 506 observations on 5 variables
##
## There are 7 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type missing method model
## X continuous 0 <NA> <NA>
## ptratio continuous 43 ppd linear
## lstat continuous 40 ppd linear
## medv continuous 0 <NA> <NA>
## chas binary 44 ppd logit
##
## family link transformation
## X <NA> <NA> standardize
## ptratio gaussian identity standardize
## lstat gaussian identity standardize
## medv <NA> <NA> standardize
## chas binomial logit <NA>
10. Run mi and check convergence by traceplots how many chains
imputations <- mi(mdf)
converged <- mi2BUGS(imputations)
mean_ptratio = converged[, , 1]
ts.plot(mean_ptratio[,1], col=1)
lines(mean_ptratio[,2], col= 2)
lines(mean_ptratio[,3], col= 3)
lines(mean_ptratio[,4], col= 4)
mean_lstat = converged[, , 2]
ts.plot(mean_lstat[,1], col=1)
lines(mean_lstat[,2], col= 2)
lines(mean_lstat[,3], col= 3)
lines(mean_lstat[,4], col= 4)
#mean_chas = converged[, , 3]
#ts.plot(mean_chas[,1], col=1)
#lines(mean_chas[,2], col= 2)
#lines(mean_chas[,3], col= 3)
#lines(mean_chas[,4], col= 4)
11.Check r-hats
Rhats(imputations)
## mean_ptratio mean_lstat mean_chas sd_ptratio sd_lstat
## 1.0016568 0.9846800 0.9898966 0.9950929 1.0267554
## sd_chas
## 0.9896710
12.Increase number of imputations if necessary,
13.Plot diagnostics
mi::plot(imputations)
14.Change imputation models if necessary.
#change imputations model
mdf1 <- mi::change(mdf, y="lstat", what = "type", to = "positive-continuous")
show(mdf1)
## Object of class missing_data.frame with 506 observations on 5 variables
##
## There are 7 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type missing method model
## X continuous 0 <NA> <NA>
## ptratio continuous 43 ppd linear
## lstat positive-continuous 40 ppd linear
## medv continuous 0 <NA> <NA>
## chas binary 44 ppd logit
##
## family link transformation
## X <NA> <NA> standardize
## ptratio gaussian identity standardize
## lstat gaussian identity log
## medv <NA> <NA> standardize
## chas binomial logit <NA>
imputations1 <- mi(mdf1, n.iter=50, n.chains = 5)
plot(imputations1)
15. Run pooled analysis
mi::pool(medv~ptratio+lstat+chas, data = imputations1)
## bayesglm(formula = medv ~ ptratio + lstat + chas, data = imputations1)
## coef.est coef.se
## (Intercept) 52.23 2.48
## ptratio -1.06 0.14
## lstat -0.82 0.04
## chas1 4.39 1.12
## n = 502, k = 4
## residual deviance = 16571.9, null deviance = 42716.3 (difference = 26144.4)
## overdispersion parameter = 33.0
## residual sd is sqrt(overdispersion) = 5.75
display(mi::pool(medv~ptratio+lstat+chas, data = imputations1))
## bayesglm(formula = medv ~ ptratio + lstat + chas, data = imputations1)
## coef.est coef.se
## (Intercept) 52.23 2.48
## ptratio -1.06 0.14
## lstat -0.82 0.04
## chas1 4.39 1.12
## n = 502, k = 4
## residual deviance = 16571.9, null deviance = 42716.3 (difference = 26144.4)
## overdispersion parameter = 33.0
## residual sd is sqrt(overdispersion) = 5.75
16. Prepare a table with results from all imputation methods.
model.1<-lm(medv~ptratio+lstat+chas, data = lw)
model.2<-lm(medv~ptratio+lstat+chas, data = dat2)
model.3<-lm(medv~ptratio+lstat+chas, data = dat3)
model.4<-lm(medv~ptratio+lstat+chas, data = hd)
model.5<-lm(medv~ptratio+lstat+chas, data = dat7)
model.6<-lm(medv~ptratio+lstat+chas, data = dat8)
summary(model.1)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = lw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5129 -3.2212 -0.8537 1.7146 27.1686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.09079 2.50926 19.564 < 2e-16 ***
## ptratio -0.93322 0.14276 -6.537 1.99e-10 ***
## lstat -0.77736 0.04174 -18.625 < 2e-16 ***
## chas 3.86209 1.15782 3.336 0.000933 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.538 on 386 degrees of freedom
## Multiple R-squared: 0.6066, Adjusted R-squared: 0.6035
## F-statistic: 198.4 on 3 and 386 DF, p-value: < 2.2e-16
summary(model.2)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.244 -3.726 -1.012 1.789 26.993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.8645 2.5366 20.841 < 2e-16 ***
## ptratio -1.0970 0.1432 -7.662 9.52e-14 ***
## lstat -0.8077 0.0424 -19.050 < 2e-16 ***
## chas 4.5951 1.1514 3.991 7.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.152 on 502 degrees of freedom
## Multiple R-squared: 0.5552, Adjusted R-squared: 0.5525
## F-statistic: 208.9 on 3 and 502 DF, p-value: < 2.2e-16
summary(model.3)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.187 -3.607 -1.170 1.879 33.108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.08754 2.47007 20.683 < 2e-16 ***
## ptratio -1.02218 0.13779 -7.419 5.09e-13 ***
## lstat -0.78157 0.04134 -18.907 < 2e-16 ***
## chas 4.89790 1.16432 4.207 3.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.31 on 502 degrees of freedom
## Multiple R-squared: 0.5322, Adjusted R-squared: 0.5294
## F-statistic: 190.3 on 3 and 502 DF, p-value: < 2.2e-16
summary(model.4)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = hd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.932 -3.557 -1.241 1.930 30.402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.89134 2.50499 21.114 < 2e-16 ***
## ptratio -1.15731 0.14060 -8.231 1.61e-15 ***
## lstat -0.71947 0.04151 -17.333 < 2e-16 ***
## chas 4.07313 1.15941 3.513 0.000483 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.396 on 502 degrees of freedom
## Multiple R-squared: 0.5192, Adjusted R-squared: 0.5163
## F-statistic: 180.7 on 3 and 502 DF, p-value: < 2.2e-16
summary(model.5)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.590 -3.717 -1.125 2.018 34.498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.01808 2.51547 20.282 < 2e-16 ***
## ptratio -1.04943 0.14202 -7.389 6.20e-13 ***
## lstat -0.72985 0.04195 -17.398 < 2e-16 ***
## chas 5.14431 1.13448 4.534 7.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.393 on 502 degrees of freedom
## Multiple R-squared: 0.5197, Adjusted R-squared: 0.5168
## F-statistic: 181.1 on 3 and 502 DF, p-value: < 2.2e-16
summary(model.6)
##
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0005 -3.7343 -0.9545 1.9684 26.9752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.39161 2.49839 20.970 < 2e-16 ***
## ptratio -1.07069 0.14249 -7.514 2.65e-13 ***
## lstat -0.80682 0.04226 -19.092 < 2e-16 ***
## chas 4.42665 1.08317 4.087 5.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.048 on 502 degrees of freedom
## Multiple R-squared: 0.5702, Adjusted R-squared: 0.5676
## F-statistic: 222 on 3 and 502 DF, p-value: < 2.2e-16
17.Discuss and compare.
#read the origianl and complete data
comp<-read.csv("C:/Users/User/Desktop/nyulearningmaterial/2020Spring/missing data/week2/hw2/complete.csv",header = T)
library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.6.2
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
a<-ggplot(comp, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position=c(0.1,0.8),legend.background = element_rect(fill="transparent"),legend.key = element_rect(fill = "transparent"))+
ggtitle("Complete")
b<-ggplot(lw, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
c<-ggplot(hd, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
d<-ggplot(dat7, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
e<-ggplot(dat2, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")+
geom_hline(yintercept=mean(lw$lstat), linetype="dashed")+
geom_vline(xintercept=mean(lw$ptratio), linetype="dashed")
f<-a+geom_hline(yintercept=mean(lw$lstat), linetype="dashed")+
geom_vline(xintercept=mean(lw$ptratio), linetype="dashed")
g<-ggplot(dat3, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
h<-ggplot(dat7, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
i<-ggplot(dat8, aes(x=ptratio, y=lstat,color= medv) ) +
geom_point()+
scale_color_gradient2(midpoint=25, low="blue", mid="white",
high="red", space ="Lab" )+
theme(legend.position="none")+
ggtitle("Imputation")
plot_grid(a,b, labels = "AUTO")
plot_grid(a,c, labels = "AUTO")
plot_grid(a,d, labels = "AUTO")
plot_grid(f,e, labels = "AUTO")
plot_grid(a,g, labels = "AUTO")
plot_grid(a,h, labels = "AUTO")
plot_grid(a,i, labels = "AUTO")