Project

#read data with missing
dat<-dat2<-dat2.5<-dat3<-dat7<-dat8<-read.csv("C:/Users/User/Desktop/nyulearningmaterial/2020Spring/missing data/week2/hw2/datawithmissing.csv",header = T)

#load packages
library(mi)

## Warning: package 'mi' was built under R version 3.6.2

## Loading required package: Matrix

## Loading required package: stats4

## mi (Version 1.0, packaged: 2015-04-16 14:03:10 UTC; goodrich)

## mi  Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University

## This program comes with ABSOLUTELY NO WARRANTY.

## This is free software, and you are welcome to redistribute it

## under the General Public License version 2 or later.

## Execute RShowDoc('COPYING') for details.

library(mice)

## Warning: package 'mice' was built under R version 3.6.3

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:mi':
## 
##     complete, pool

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.3

library(VIM)

## Warning: package 'VIM' was built under R version 3.6.2

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

1.Listwise delection

lw<-dat[complete.cases(dat),]

2.Mean/mode imputation

#Mean imputation
muimp<-mice(dat2[,2:3], method = "mean", m = 1, maxit = 1)

## 
##  iter imp variable
##   1   1  ptratio  lstat

inp<-mice::complete(muimp,1)
dat2$ptratio<-inp$ptratio
dat2$lstat<-inp$lstat

#Mode imputation
table(dat2$chas)

## 
##   0   1 
## 431  31

dat2$chas[is.na(dat2$chas)]<-0

3.Random imputation

raninp<-mice(dat[,2:5], method = "sample", m = 1, maxit = 1)

## 
##  iter imp variable
##   1   1  ptratio  lstat  chas

raninp<-mice::complete(raninp,1)
dat3$ptratio<-raninp$ptratio
dat3$lstat<-raninp$lstat
dat3$chas<-raninp$chas

4.LVCF Not appliable.

5.Hotdecking

hd<-hotdeck(dat)[,1:5]

6.Regression imputation

#regression imputation
reginp<-mice(dat8[,2:3], method = "norm.nob", m = 1, maxit = 1)

## 
##  iter imp variable
##   1   1  ptratio  lstat

reginp.com<-mice::complete(reginp,1)
dat7$ptratio<-reginp.com[,1]
dat7$lstat<-reginp.com[,2]

#glm imputation
raninp<-mice(dat[,4:5], method = "logreg", m = 1, maxit = 1)

## Warning: Type mismatch for variable(s): chas
## Imputation method logreg is for categorical data.

## 
##  iter imp variable
##   1   1  chas

raninp<-mice::complete(raninp,1)
dat7$chas<-raninp$chas

7.Regression imputation with noise

#regression with noice
reginpno<-mice(dat8[,2:3], method = "norm.predict", m = 1, maxit = 1)

## 
##  iter imp variable
##   1   1  ptratio  lstat

reginpno.com<-mice::complete(reginpno,1)
dat8$ptratio<-reginpno.com[,1]
dat8$lstat<-reginpno.com[,2]

#glm with noise
Rc = as.numeric(!is.na(dat$chas))
dat.ccc = dat[Rc == 1, ]
dat.droppedc = dat[Rc == 0, ]
modg = glm(chas~ medv, data = dat.ccc,family ="binomial")
p.imp = predict(modg, newdata = dat.droppedc)

ps = predict(modg, newdata=dat.droppedc, type="response")

#add noise
dat8$chas[Rc == 0]<- rbinom(sum(Rc==0), 1, ps)

8.Load data into the package. Obtain summary, histogram, image of data.

mdf<-missing_data.frame(dat)

#Get the summary and number of NAs 
summary(mdf)

##        X            ptratio          lstat             medv      
##  Min.   :  1.0   Min.   :12.60   Min.   : 1.920   Min.   : 5.00  
##  1st Qu.:127.2   1st Qu.:17.40   1st Qu.: 7.037   1st Qu.:17.02  
##  Median :253.5   Median :19.10   Median :11.430   Median :21.20  
##  Mean   :253.5   Mean   :18.51   Mean   :12.762   Mean   :22.53  
##  3rd Qu.:379.8   3rd Qu.:20.20   3rd Qu.:17.065   3rd Qu.:25.00  
##  Max.   :506.0   Max.   :22.00   Max.   :37.970   Max.   :50.00  
##                  NA's   :43      NA's   :40                      
##       chas       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0671  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
##  NA's   :44

#histograms
aggr_plot <- aggr(dat, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable      Count
##      chas 0.08695652
##   ptratio 0.08498024
##     lstat 0.07905138
##         X 0.00000000
##      medv 0.00000000

#visuailzed missing pattern
image(mdf, grayscale=TRUE)

9.Check data type and make changes if necessary

show(mdf)

## Object of class missing_data.frame with 506 observations on 5 variables
## 
## There are 7 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##               type missing method  model
## X       continuous       0   <NA>   <NA>
## ptratio continuous      43    ppd linear
## lstat   continuous      40    ppd linear
## medv    continuous       0   <NA>   <NA>
## chas        binary      44    ppd  logit
## 
##           family     link transformation
## X           <NA>     <NA>    standardize
## ptratio gaussian identity    standardize
## lstat   gaussian identity    standardize
## medv        <NA>     <NA>    standardize
## chas    binomial    logit           <NA>

10. Run mi and check convergence by traceplots how many chains

imputations <- mi(mdf)
converged <- mi2BUGS(imputations)

mean_ptratio = converged[, , 1]
ts.plot(mean_ptratio[,1], col=1)
lines(mean_ptratio[,2], col= 2)
lines(mean_ptratio[,3], col= 3)
lines(mean_ptratio[,4], col= 4)

mean_lstat = converged[, , 2]
ts.plot(mean_lstat[,1], col=1)
lines(mean_lstat[,2], col= 2)
lines(mean_lstat[,3], col= 3)
lines(mean_lstat[,4], col= 4)

#mean_chas = converged[, , 3]
#ts.plot(mean_chas[,1], col=1)
#lines(mean_chas[,2], col= 2)
#lines(mean_chas[,3], col= 3)
#lines(mean_chas[,4], col= 4)

11.Check r-hats

Rhats(imputations)

## mean_ptratio   mean_lstat    mean_chas   sd_ptratio     sd_lstat 
##    1.0016568    0.9846800    0.9898966    0.9950929    1.0267554 
##      sd_chas 
##    0.9896710

12.Increase number of imputations if necessary,

13.Plot diagnostics

mi::plot(imputations)

14.Change imputation models if necessary.

#change  imputations model
mdf1 <- mi::change(mdf, y="lstat", what = "type", to = "positive-continuous")
show(mdf1)

## Object of class missing_data.frame with 506 observations on 5 variables
## 
## There are 7 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##                        type missing method  model
## X                continuous       0   <NA>   <NA>
## ptratio          continuous      43    ppd linear
## lstat   positive-continuous      40    ppd linear
## medv             continuous       0   <NA>   <NA>
## chas                 binary      44    ppd  logit
## 
##           family     link transformation
## X           <NA>     <NA>    standardize
## ptratio gaussian identity    standardize
## lstat   gaussian identity            log
## medv        <NA>     <NA>    standardize
## chas    binomial    logit           <NA>

imputations1 <- mi(mdf1, n.iter=50, n.chains = 5)
plot(imputations1)

15. Run pooled analysis

mi::pool(medv~ptratio+lstat+chas, data = imputations1)

## bayesglm(formula = medv ~ ptratio + lstat + chas, data = imputations1)
##             coef.est coef.se
## (Intercept) 52.23     2.48  
## ptratio     -1.06     0.14  
## lstat       -0.82     0.04  
## chas1        4.39     1.12  
## n = 502, k = 4
## residual deviance = 16571.9, null deviance = 42716.3 (difference = 26144.4)
## overdispersion parameter = 33.0
## residual sd is sqrt(overdispersion) = 5.75

display(mi::pool(medv~ptratio+lstat+chas, data = imputations1))

## bayesglm(formula = medv ~ ptratio + lstat + chas, data = imputations1)
##             coef.est coef.se
## (Intercept) 52.23     2.48  
## ptratio     -1.06     0.14  
## lstat       -0.82     0.04  
## chas1        4.39     1.12  
## n = 502, k = 4
## residual deviance = 16571.9, null deviance = 42716.3 (difference = 26144.4)
## overdispersion parameter = 33.0
## residual sd is sqrt(overdispersion) = 5.75

16. Prepare a table with results from all imputation methods.

model.1<-lm(medv~ptratio+lstat+chas, data = lw)
model.2<-lm(medv~ptratio+lstat+chas, data = dat2)
model.3<-lm(medv~ptratio+lstat+chas, data = dat3)
model.4<-lm(medv~ptratio+lstat+chas, data = hd)
model.5<-lm(medv~ptratio+lstat+chas, data = dat7)
model.6<-lm(medv~ptratio+lstat+chas, data = dat8)

summary(model.1)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = lw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5129  -3.2212  -0.8537   1.7146  27.1686 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.09079    2.50926  19.564  < 2e-16 ***
## ptratio     -0.93322    0.14276  -6.537 1.99e-10 ***
## lstat       -0.77736    0.04174 -18.625  < 2e-16 ***
## chas         3.86209    1.15782   3.336 0.000933 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.538 on 386 degrees of freedom
## Multiple R-squared:  0.6066, Adjusted R-squared:  0.6035 
## F-statistic: 198.4 on 3 and 386 DF,  p-value: < 2.2e-16

summary(model.2)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.244  -3.726  -1.012   1.789  26.993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.8645     2.5366  20.841  < 2e-16 ***
## ptratio      -1.0970     0.1432  -7.662 9.52e-14 ***
## lstat        -0.8077     0.0424 -19.050  < 2e-16 ***
## chas          4.5951     1.1514   3.991 7.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.152 on 502 degrees of freedom
## Multiple R-squared:  0.5552, Adjusted R-squared:  0.5525 
## F-statistic: 208.9 on 3 and 502 DF,  p-value: < 2.2e-16

summary(model.3)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.187  -3.607  -1.170   1.879  33.108 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.08754    2.47007  20.683  < 2e-16 ***
## ptratio     -1.02218    0.13779  -7.419 5.09e-13 ***
## lstat       -0.78157    0.04134 -18.907  < 2e-16 ***
## chas         4.89790    1.16432   4.207 3.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.31 on 502 degrees of freedom
## Multiple R-squared:  0.5322, Adjusted R-squared:  0.5294 
## F-statistic: 190.3 on 3 and 502 DF,  p-value: < 2.2e-16

summary(model.4)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = hd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.932  -3.557  -1.241   1.930  30.402 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.89134    2.50499  21.114  < 2e-16 ***
## ptratio     -1.15731    0.14060  -8.231 1.61e-15 ***
## lstat       -0.71947    0.04151 -17.333  < 2e-16 ***
## chas         4.07313    1.15941   3.513 0.000483 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.396 on 502 degrees of freedom
## Multiple R-squared:  0.5192, Adjusted R-squared:  0.5163 
## F-statistic: 180.7 on 3 and 502 DF,  p-value: < 2.2e-16

summary(model.5)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.590  -3.717  -1.125   2.018  34.498 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.01808    2.51547  20.282  < 2e-16 ***
## ptratio     -1.04943    0.14202  -7.389 6.20e-13 ***
## lstat       -0.72985    0.04195 -17.398  < 2e-16 ***
## chas         5.14431    1.13448   4.534 7.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.393 on 502 degrees of freedom
## Multiple R-squared:  0.5197, Adjusted R-squared:  0.5168 
## F-statistic: 181.1 on 3 and 502 DF,  p-value: < 2.2e-16

summary(model.6)

## 
## Call:
## lm(formula = medv ~ ptratio + lstat + chas, data = dat8)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.0005  -3.7343  -0.9545   1.9684  26.9752 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.39161    2.49839  20.970  < 2e-16 ***
## ptratio     -1.07069    0.14249  -7.514 2.65e-13 ***
## lstat       -0.80682    0.04226 -19.092  < 2e-16 ***
## chas         4.42665    1.08317   4.087 5.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.048 on 502 degrees of freedom
## Multiple R-squared:  0.5702, Adjusted R-squared:  0.5676 
## F-statistic:   222 on 3 and 502 DF,  p-value: < 2.2e-16

17.Discuss and compare.

#read the origianl and complete data
comp<-read.csv("C:/Users/User/Desktop/nyulearningmaterial/2020Spring/missing data/week2/hw2/complete.csv",header = T)

library(ggplot2)
library(ggExtra)

## Warning: package 'ggExtra' was built under R version 3.6.2

library(cowplot)

## 
## ********************************************************

## Note: As of version 1.0.0, cowplot does not change the

##   default ggplot2 theme anymore. To recover the previous

##   behavior, execute:
##   theme_set(theme_cowplot())

## ********************************************************

a<-ggplot(comp, aes(x=ptratio, y=lstat,color= medv) ) +
  geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position=c(0.1,0.8),legend.background = element_rect(fill="transparent"),legend.key = element_rect(fill = "transparent"))+
  ggtitle("Complete")

b<-ggplot(lw, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

c<-ggplot(hd, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

d<-ggplot(dat7, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

e<-ggplot(dat2, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")+
  geom_hline(yintercept=mean(lw$lstat), linetype="dashed")+
  geom_vline(xintercept=mean(lw$ptratio), linetype="dashed")

f<-a+geom_hline(yintercept=mean(lw$lstat), linetype="dashed")+
  geom_vline(xintercept=mean(lw$ptratio), linetype="dashed")

g<-ggplot(dat3, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

h<-ggplot(dat7, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

i<-ggplot(dat8, aes(x=ptratio, y=lstat,color= medv) ) +
   geom_point()+
  scale_color_gradient2(midpoint=25, low="blue", mid="white",
                     high="red", space ="Lab" )+
  theme(legend.position="none")+
  ggtitle("Imputation")

plot_grid(a,b, labels = "AUTO")

plot_grid(a,c, labels = "AUTO")

plot_grid(a,d, labels = "AUTO")

plot_grid(f,e, labels = "AUTO")

plot_grid(a,g, labels = "AUTO")

plot_grid(a,h, labels = "AUTO")

plot_grid(a,i, labels = "AUTO")

Project

Jeffrey (Yi-Hung) Wang

2020/3/12