STAT 388 DATA VIS HW 3

PROBLEM 2

PART A

The five imputed data sets are found below.

require(norm)

## Loading required package: norm

## Warning: package 'norm' was built under R version 3.3.2

hw3data <- read.csv("C:/Users/Kajal/Downloads/hw3data.csv")
impDatList<-list()

prelim.dat<-prelim.norm(as.matrix(hw3data))
theta.hat<-em.norm(prelim.dat)

## Iterations of EM: 
## 1...2...3...4...5...6...7...8...9...10...11...12...13...

rngseed(1234)
impDatList[[1]]<-imp.norm(prelim.dat,theta.hat,as.matrix(hw3data))

for (i in 2:5){
theta.hat.DA<-da.norm(prelim.dat,theta.hat, steps=100)
impDatList[[i]]<-imp.norm(prelim.dat,theta.hat.DA,as.matrix(hw3data))
}

impDatList<-lapply(impDatList,as.data.frame) 
impDatList[[1]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[2]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[3]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

PART B

From the summaries, we see the different estimates for each \(B_0\) and \(B_1\)

lm1<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[1]]))
summary(lm1)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[1]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.839  -34.419   -1.536   32.520  153.322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -102.7383    45.1271  -2.277   0.0232 *
## x1            -4.4084     3.5466  -1.243   0.2145  
## x2            -0.4712     1.2474  -0.378   0.7058  
## x3             1.1035     0.4437   2.487   0.0132 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.19 on 496 degrees of freedom
## Multiple R-squared:  0.01311,    Adjusted R-squared:  0.007141 
## F-statistic: 2.196 on 3 and 496 DF,  p-value: 0.08762

lm2<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[2]]))
summary(lm2)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[2]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.633  -33.471   -2.915   32.955  155.008 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -138.7960    43.3102  -3.205 0.001439 ** 
## x1            -6.5978     3.3218  -1.986 0.047558 *  
## x2            -0.5788     1.2349  -0.469 0.639495    
## x3             1.4774     0.4288   3.446 0.000618 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.91 on 496 degrees of freedom
## Multiple R-squared:  0.02375,    Adjusted R-squared:  0.01785 
## F-statistic: 4.022 on 3 and 496 DF,  p-value: 0.007611

lm3<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[3]]))
summary(lm3)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[3]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -128.631  -33.277   -1.175   32.629  148.650 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -172.3207    43.7894  -3.935 9.50e-05 ***
## x1            -7.5393     3.3498  -2.251   0.0248 *  
## x2            -0.9718     1.2214  -0.796   0.4266    
## x3             1.8495     0.4362   4.240 2.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.58 on 496 degrees of freedom
## Multiple R-squared:  0.03661,    Adjusted R-squared:  0.03078 
## F-statistic: 6.283 on 3 and 496 DF,  p-value: 0.000344

lm4<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[4]]))
summary(lm4)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[4]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -144.675  -33.364   -1.879   32.718  151.587 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -125.5204    44.2050  -2.840  0.00470 **
## x1            -4.6620     3.3386  -1.396  0.16322   
## x2            -0.5903     1.2417  -0.475  0.63468   
## x3             1.3465     0.4325   3.113  0.00196 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.99 on 496 degrees of freedom
## Multiple R-squared:  0.02094,    Adjusted R-squared:  0.01502 
## F-statistic: 3.536 on 3 and 496 DF,  p-value: 0.01471

lm5<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[5]]))
summary(lm5)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[5]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -151.316  -33.660   -1.323   33.661  150.781 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -150.4040    44.8171  -3.356 0.000852 ***
## x1            -7.3838     3.6111  -2.045 0.041406 *  
## x2            -0.6507     1.2376  -0.526 0.599312    
## x3             1.6003     0.4427   3.615 0.000332 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.84 on 496 degrees of freedom
## Multiple R-squared:  0.02651,    Adjusted R-squared:  0.02062 
## F-statistic: 4.502 on 3 and 496 DF,  p-value: 0.003961

PART C

Below are the combined estimates across the 5 data sets for each of the parameters, along with the variance. This is labeled below.

betaList<-seList<-list()
for (i in 1:5){
betaList[[i]]<-summary(lm(y~x1+x2+x3,data=as.data.frame(impDatList[[i]])))$coefficients[,1]
seList[[i]]<-summary(lm(y~x1+x2+x3,data=as.data.frame(impDatList[[i]])))$coefficients[,2]
}

M<-5
#Combined Estimates
apply(do.call(rbind,betaList),2,mean)

##  (Intercept)           x1           x2           x3 
## -137.9558847   -6.1182503   -0.6525596    1.4754427

#B
(B<-apply(do.call(rbind,betaList),2,var))

##  (Intercept)           x1           x2           x3 
## 682.88121687   2.22387571   0.03603207   0.07761653

#W
(W<-apply(do.call(rbind,seList)^2,2,mean))

##  (Intercept)           x1           x2           x3 
## 1958.4796341   11.8040462    1.5292409    0.1908197

T<-(1+1/M)*B + W
#Standard Error
sqrt(T)

## (Intercept)          x1          x2          x3 
##  52.7061391   3.8042998   1.2539854   0.5328785

PART D

Below is the approximate rate. However from mi.inference, we see that in Part E under finf, the rate is y= 0.32378281 x1= 0.19791282 x2= 0.02786445 x3= 0.36145372. The approximation below is close to the finf.

B/(W+B)

## (Intercept)          x1          x2          x3 
##  0.25853386  0.15853209  0.02301967  0.28914328

PART E

The 95% confidence are found below.

mi.inference(betaList,seList) #95% confidence interval

## $est
##  (Intercept)           x1           x2           x3 
## -137.9558847   -6.1182503   -0.6525596    1.4754427 
## 
## $std.err
## (Intercept)          x1          x2          x3 
##  52.7061391   3.8042998   1.2539854   0.5328785 
## 
## $df
## (Intercept)          x1          x2          x3 
##    45.96762   117.64553  5290.40516    37.17936 
## 
## $signif
## (Intercept)          x1          x2          x3 
## 0.011952507 0.110461721 0.602814599 0.008723737 
## 
## $lower
## (Intercept)          x1          x2          x3 
## -244.049857  -13.652035   -3.110888    0.395904 
## 
## $upper
## (Intercept)          x1          x2          x3 
##  -31.861912    1.415534    1.805769    2.554981 
## 
## $r
## (Intercept)          x1          x2          x3 
##  0.41841510  0.22607933  0.02827448  0.48810389 
## 
## $fminf
## (Intercept)          x1          x2          x3 
##  0.32378281  0.19791282  0.02786445  0.36145372

PROBLEM 3

PART A

The five imputed data sets are found below.

require(mice)

## Loading required package: mice

## Warning: package 'mice' was built under R version 3.3.2

set.seed(1234)
mids<-mice(hw3data,m=5,method="pmm")

## 
##  iter imp variable
##   1   1  x1  x3
##   1   2  x1  x3
##   1   3  x1  x3
##   1   4  x1  x3
##   1   5  x1  x3
##   2   1  x1  x3
##   2   2  x1  x3
##   2   3  x1  x3
##   2   4  x1  x3
##   2   5  x1  x3
##   3   1  x1  x3
##   3   2  x1  x3
##   3   3  x1  x3
##   3   4  x1  x3
##   3   5  x1  x3
##   4   1  x1  x3
##   4   2  x1  x3
##   4   3  x1  x3
##   4   4  x1  x3
##   4   5  x1  x3
##   5   1  x1  x3
##   5   2  x1  x3
##   5   3  x1  x3
##   5   4  x1  x3
##   5   5  x1  x3

impDatList<-list()
impDatList[[1]]<-complete(mids)
impDatList[[2]]<-complete(mids,2)
impDatList[[3]]<-complete(mids,3)
impDatList[[4]]<-complete(mids,4)
impDatList[[5]]<-complete(mids,5)

impDatList[[1]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[2]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[3]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[4]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

impDatList[[5]][2,]

##           y        x1       x2       x3
## 2 -64.55692 0.1341045 12.24235 97.85021

PART B

From the summaries, we see the different estimates for each \(B_0\) and \(B_1\)

lm1<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[1]]))
summary(lm1)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[1]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -153.830  -34.663   -1.855   33.052  152.926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -60.8919    44.4517  -1.370    0.171
## x1           -0.4186     3.5515  -0.118    0.906
## x2           -0.7892     1.2584  -0.627    0.531
## x3            0.7171     0.4399   1.630    0.104
## 
## Residual standard error: 51.29 on 496 degrees of freedom
## Multiple R-squared:  0.009366,   Adjusted R-squared:  0.003374 
## F-statistic: 1.563 on 3 and 496 DF,  p-value: 0.1975

lm2<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[2]]))
summary(lm2)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[2]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.037  -33.369   -2.148   32.979  151.989 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -135.539     42.582  -3.183 0.001549 ** 
## x1            -6.346      3.264  -1.944 0.052474 .  
## x2            -0.554      1.234  -0.449 0.653772    
## x3             1.439      0.420   3.426 0.000663 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.92 on 496 degrees of freedom
## Multiple R-squared:  0.02343,    Adjusted R-squared:  0.01753 
## F-statistic: 3.967 on 3 and 496 DF,  p-value: 0.008205

lm3<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[3]]))
summary(lm3)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[3]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.654  -34.477   -1.538   32.334  153.232 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -137.5152    44.8898  -3.063  0.00231 **
## x1            -6.9224     3.3312  -2.078  0.03822 * 
## x2            -0.2534     1.2323  -0.206  0.83716   
## x3             1.4304     0.4386   3.261  0.00119 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.98 on 496 degrees of freedom
## Multiple R-squared:  0.02105,    Adjusted R-squared:  0.01512 
## F-statistic: 3.554 on 3 and 496 DF,  p-value: 0.01436

lm4<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[4]]))
summary(lm4)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[4]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -134.830  -33.760   -1.527   33.418  152.108 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -148.4622    42.7552  -3.472 0.000561 ***
## x1            -7.7574     3.3348  -2.326 0.020411 *  
## x2            -0.3924     1.2370  -0.317 0.751182    
## x3             1.5551     0.4201   3.702 0.000238 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.83 on 496 degrees of freedom
## Multiple R-squared:  0.02691,    Adjusted R-squared:  0.02103 
## F-statistic: 4.573 on 3 and 496 DF,  p-value: 0.003592

lm5<-lm(y~x1+x2+x3,data=as.data.frame(impDatList[[5]]))
summary(lm5)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = as.data.frame(impDatList[[5]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.894  -34.093   -2.109   32.572  153.769 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -62.0893    42.6913  -1.454   0.1465  
## x1           -1.6562     3.3965  -0.488   0.6260  
## x2           -0.6458     1.2514  -0.516   0.6060  
## x3            0.7141     0.4313   1.656   0.0984 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.35 on 496 degrees of freedom
## Multiple R-squared:  0.006918,   Adjusted R-squared:  0.0009118 
## F-statistic: 1.152 on 3 and 496 DF,  p-value: 0.3277

PART C

Below are the combined estimates across the 5 data sets for each of the parameters, along with the variance. This is labeled below.

betaList<-seList<-list()
for (i in 1:5){
betaList[[i]]<-summary(lm(y~x1+x2+x3,data=as.data.frame(impDatList[[i]])))$coefficients[,1]
seList[[i]]<-summary(lm(y~x1+x2+x3,data=as.data.frame(impDatList[[i]])))$coefficients[,2]
}
M<-5
apply(do.call(rbind,betaList),2,mean)

##  (Intercept)           x1           x2           x3 
## -108.8994370   -4.6200156   -0.5269705    1.1711433

(B<-apply(do.call(rbind,betaList),2,var))

##  (Intercept)           x1           x2           x3 
## 1.897407e+03 1.113941e+01 4.413824e-02 1.753721e-01

(W<-apply(do.call(rbind,seList)^2,2,mean))

##  (Intercept)           x1           x2           x3 
## 1890.9719198   11.4046745    1.5443709    0.1849593

T<-(1+1/M)*B + W
sqrt(T)

## (Intercept)          x1          x2          x3 
##   64.558971    4.977144    1.263858    0.628813

PART D

Below is the approximate rate. However from mi.inference, we see that in Part E under finf, the rate is y=0.60161624 x1=0.59462756 x2=0.03368997 x3=0.58687248. The approximation below is close to the finf.

B/(W+B)

## (Intercept)          x1          x2          x3 
##  0.50084937  0.49411667  0.02778596  0.48669675

PART E

This is the 95% confidence interval with the lower and upper bound.

require(norm)
mi.inference(betaList,seList)

## $est
##  (Intercept)           x1           x2           x3 
## -108.8994370   -4.6200156   -0.5269705    1.1711433 
## 
## $std.err
## (Intercept)          x1          x2          x3 
##   64.558971    4.977144    1.263858    0.628813 
## 
## $df
## (Intercept)          x1          x2          x3 
##    13.40302    13.73707  3637.98053    14.12090 
## 
## $signif
## (Intercept)          x1          x2          x3 
##  0.11476097  0.36930350  0.67673671  0.08347384 
## 
## $lower
##  (Intercept)           x1           x2           x3 
## -247.9454970  -15.3141278   -3.0049109   -0.1764439 
## 
## $upper
## (Intercept)          x1          x2          x3 
##   30.146623    6.074097    1.950970    2.518731 
## 
## $r
## (Intercept)          x1          x2          x3 
##   1.2040839   1.1720885   0.0342961   1.1377993 
## 
## $fminf
## (Intercept)          x1          x2          x3 
##  0.60161624  0.59462756  0.03368997  0.58687248

Problem 5

PART A

Each \(\hat{\rho}\) value is found below. Set 1 0.3293771 Set 2 0.2971818 Set 3 0.2714318 Set 4 0.2936432 Set 5 0.3427167

rho1<-cor(impDatList[[1]][,3], impDatList[[1]][,4])
rho1

## [1] 0.3293771

rho2<-cor(impDatList[[2]][,3], impDatList[[2]][,4])
rho2

## [1] 0.2971818

rho3<-cor(impDatList[[3]][,3], impDatList[[3]][,4])
rho3

## [1] 0.2714318

rho4<-cor(impDatList[[4]][,3], impDatList[[4]][,4])
rho4

## [1] 0.2936432

rho5<-cor(impDatList[[5]][,3], impDatList[[5]][,4])
rho5

## [1] 0.3427167

PART B

The standard errors of the data sets are as follows: Set 1 0.03747854 Set 2 0.03745454 Set 3 0.03836562 Set 4 0.03659800 Set 5 0.03523512

se<-c()
rho_hat<-c()
for(m in 1:5){
  rho_hat[[m]]<-cor(impDatList[[m]][,3:4])[1,2]
  for(i in 1:1000){
  ind <-sample(1:500,500,replace=TRUE)
  boot<-impDatList[[m]][ind,]
  rho_hat[i]<-cor(boot[,3], boot[,4])
  }
se[m]<-sqrt(var(rho_hat))
}
se

## [1] 0.03747854 0.03745454 0.03836562 0.03659800 0.03523512

mean(rho_hat)

## [1] 0.3436373

var(rho_hat)

## [1] 0.001241514

sqrt(var(rho_hat))

## [1] 0.03523512

PART C

The combined estimate is 0.3436373 and the combined standard error is 0.05113532.

rho_hat<-lapply(rho_hat, as.matrix)
se<- lapply(se, as.matrix)
mi.inference(rho_hat, se)

## $est
## [1] 0.3436373
## 
## $std.err
## [1] 0.05113532
## 
## $df
## [1] 4422.607
## 
## $signif
## [1] 2.045808e-11
## 
## $lower
## [1] 0.2433865
## 
## $upper
## [1] 0.4438881
## 
## $r
## [1] 0.9057545
## 
## $fminf
## [1] 0.4755106

```

STAT 388 DATA VIS HW 3

Kajal Chokshi

February 23, 2017

PROBLEM 2

PART A

PART B

PART C

PART D

PART E

PROBLEM 3

PART A

PART B

PART C

PART D

PART E

Problem 5

PART A

PART B

PART C