Aknowledgment: This work was adapted from the Gelman and Hill 2006 Book.
Citation:
Gelman, A., & Hill, J. (2006). Data analysis using regression and multilevel/hierarchical models. Cambridge university press.
# load packages
library(ggplot2)
library(MASS)
library(Matrix)
library(lme4)
library(arm)
##
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is C:/Users/brand/Desktop/UCR/PhD CCN/PSYC213_Spring2023/Lab/Week 3
The folder pyth contains outcome y and inputs x1, x2 for 40 data points, with a further 20 points with the inputs but no observed outcome. Save the file to your working directory and read it into R using the read.table() function.
#setwd("C:/Users/brand/Desktop/UCR/PhD CCN/PSYC213_Spring2023/Lab/Week 3/exercise2.1.dat")
www<-"http://www.stat.columbia.edu/%7Egelman/arm/examples/pyth/exercise2.1.dat"
data<-read.table(www, header=TRUE)
data.trainer=data[0:40,]
data.tester=data[-1:-40,]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ds=data %>% head(40)
ds_tester=data%>% tail(20)
mdl_1<-lm(y~x1+x2, ds)
summary(mdl_1)
##
## Call:
## lm(formula = y ~ x1 + x2, data = ds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9585 -0.5865 -0.3356 0.3973 2.8548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.31513 0.38769 3.392 0.00166 **
## x1 0.51481 0.04590 11.216 1.84e-13 ***
## x2 0.80692 0.02434 33.148 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9 on 37 degrees of freedom
## Multiple R-squared: 0.9724, Adjusted R-squared: 0.9709
## F-statistic: 652.4 on 2 and 37 DF, p-value: < 2.2e-16
Comments:
mdl_1_sim<-sim(mdl_1)
coef(mdl_1_sim)[0:2,]
## (Intercept) x1 x2
## [1,] 0.763954 0.5915681 0.8244303
## [2,] 1.364676 0.5176414 0.8047210
par(mfrow=c(1,2))
plot
## function (x, y, ...)
## UseMethod("plot")
## <bytecode: 0x000002637593d350>
## <environment: namespace:base>
plot(ds$x1,ds$y,xlab="y", ylab="x1")
for(i in 1:25){
curve(cbind(1,x,mean(ds$x2)) %*% coef(mdl_1_sim)[i,],lwd=.5,col="lightblue",add=TRUE)
}
curve(cbind(1,x,mean(ds$x2))%*%coef(mdl_1),col="black",add=TRUE)
plot(ds$x1,ds$y,xlab="y",ylab="x2")
for(i in 1:25){
curve(cbind(1,mean(ds$x1),x)%*%coef(mdl_1_sim)[i,],lwd=.5,col="lightblue",add=TRUE)
}
curve(cbind(1,mean(ds$x1),x)%*% coef(mdl_1),col="black",add=TRUE)
y.hat<-fitted(mdl_1)
u<-resid(mdl_1)
sigma<-sigma.hat(mdl_1)
residual.plot(y.hat,u,sigma,lwd=1)
Comments:
mdl_test<-predict(mdl_1,ds_tester,interval="prediction",level=.95)
mdl_test
## fit lwr upr
## 41 14.812484 12.916966 16.708002
## 42 19.142865 17.241520 21.044211
## 43 5.916816 3.958626 7.875005
## 44 10.530475 8.636141 12.424809
## 45 19.012485 17.118597 20.906373
## 46 13.398863 11.551815 15.245911
## 47 4.829144 2.918323 6.739965
## 48 9.145767 7.228364 11.063170
## 49 5.892489 3.979060 7.805918
## 50 12.338639 10.426349 14.250929
## 51 18.908561 17.021818 20.795303
## 52 16.064649 14.212209 17.917088
## 53 8.963122 7.084081 10.842163
## 54 14.972786 13.094194 16.851379
## 55 5.859744 3.959679 7.759808
## 56 7.374900 5.480921 9.268879
## 57 4.535267 2.616996 6.453539
## 58 15.133280 13.282467 16.984094
## 59 9.100899 7.223395 10.978403
## 60 16.084900 14.196990 17.972810
Comments:
In this exercise you will simulate two variables that are statistically independent of each other to see what happens when we run a regression of one on the other.
First generate 1000 data points from a normal distribution with mean 0 and standard deviation 1 by typing var1 <- rnorm(1000,0,1) in R. Generate another variable in the same way (call it var2). Run a regression of one variable on the other. Is the slope coefficient statistically significant?
var1b<-rnorm(1000,0,1)
var2b<-rnorm(1000,0,1)
rmdl1<-lm(var1b~var2b)
summary(rmdl1)
##
## Call:
## lm(formula = var1b ~ var2b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4731 -0.6869 0.0201 0.6663 3.7300
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.007242 0.032035 0.226 0.821
## var2b -0.002174 0.031128 -0.070 0.944
##
## Residual standard error: 1.012 on 998 degrees of freedom
## Multiple R-squared: 4.888e-06, Adjusted R-squared: -0.0009971
## F-statistic: 0.004878 on 1 and 998 DF, p-value: 0.9443
Comment:
zscoreprac<-rep(NA,100)
for(k in 1:100){
var1c<-rnorm(1000,0,1)
var2c<-rnorm(1000,0,1)
fit<-lm(var1c~var2c)
zscoreprac[k]<-coef(fit)[2]/se.coef(fit)[2]
}
zscoreprac
## [1] 0.995031413 -0.548202289 -0.207659611 1.890697774 0.868313039
## [6] -0.196907549 0.340387815 0.012314531 -1.377669661 -1.430860700
## [11] -0.996138658 0.533373683 -0.670711461 -1.793375365 -0.475436628
## [16] 1.966168075 1.126373628 0.056584403 -0.992420469 0.305766329
## [21] -0.558003214 0.631002621 -1.939926082 0.478853537 -0.970135616
## [26] -0.003483509 0.510063756 0.188098724 -0.605488532 0.297736165
## [31] 0.643050496 -0.522942687 -0.748906302 1.412681994 -0.356086675
## [36] -0.244185347 -1.357124740 2.225279051 0.222697324 -0.271612798
## [41] -0.200649949 -1.492993482 -0.554814516 -0.272124562 0.766915014
## [46] 0.809354613 0.791399912 -1.697792966 1.472418430 1.170635482
## [51] -1.173972793 -0.874743914 -0.945368587 -0.977716649 0.445688678
## [56] 0.347307550 -1.070404351 -0.640275017 2.814748362 -1.530542438
## [61] 0.102462728 0.074294936 0.698034266 -0.316501370 0.501322102
## [66] 0.544547590 -0.044955166 -0.374650059 -0.487281331 -1.954666717
## [71] 0.180345319 0.863255030 -0.241587798 0.241039778 1.685312526
## [76] -1.093680208 0.410570486 2.075276312 -1.515880270 0.685581242
## [81] 1.059192132 -1.138060032 0.201791944 0.418682399 -0.582358361
## [86] 0.979451422 -2.882512752 1.627619955 -0.205530171 1.834505267
## [91] 2.444160631 -0.392350982 -0.137260784 0.995758489 0.134599797
## [96] 0.416969019 0.448096264 -0.317343207 0.333109040 0.977656872
Comments:
sum(zscoreprac>2)
## [1] 4
Comments:
The child.iq folder contains a subset of the children and mother data discussed earlier in the chapter. You have access to children’s test scores at age 3, mother’s education, and the mother’s age at the time she gave birth for a sample of 400 children. The data are a Stata file which you can read into R.
www2<-"http://www.stat.columbia.edu/%7Egelman/arm/examples/child.iq/child.iq.dta"
data<-read.table(www2,header=TRUE)
## Warning in read.table(www2, header = TRUE): line 1 appears to contain embedded
## nulls
## Warning in read.table(www2, header = TRUE): incomplete final line found by
## readTableHeader on
## 'http://www.stat.columbia.edu/%7Egelman/arm/examples/child.iq/child.iq.dta'
require("foreign")
## Loading required package: foreign
require("arm")
(Generalized) Linear models make some strong assumptions concerning the data structure:
Independance of each data points Correct distribution of the residuals Correct specification of the variance structure Linear relationship between the response and the linear predictor For simple linear models the last three points mean that the residuals should be normally distributed, the variance should be homogenous across the fitted values of the model and for each predictors separately, and the y’s should be linearly related to the predictors. In R checking these assumptions from a lm and glm object is fairly easy.
iq<-read.dta("http://www.stat.columbia.edu/%7Egelman/arm/examples/child.iq/child.iq.dta")
head(iq)
## ppvt educ_cat momage
## 1 120 2 21
## 2 89 1 17
## 3 78 2 19
## 4 42 1 20
## 5 115 4 26
## 6 97 1 20
iq
## ppvt educ_cat momage
## 1 120 2 21
## 2 89 1 17
## 3 78 2 19
## 4 42 1 20
## 5 115 4 26
## 6 97 1 20
## 7 94 1 20
## 8 68 2 24
## 9 103 3 19
## 10 94 3 24
## 11 117 2 24
## 12 64 4 29
## 13 72 2 19
## 14 104 2 23
## 15 78 3 26
## 16 62 1 22
## 17 73 1 21
## 18 79 3 21
## 19 101 3 22
## 20 88 2 25
## 21 102 2 24
## 22 107 2 26
## 23 56 1 21
## 24 102 2 22
## 25 88 2 26
## 26 92 2 23
## 27 85 1 20
## 28 102 2 29
## 29 90 2 26
## 30 97 3 28
## 31 92 2 19
## 32 98 3 23
## 33 85 2 21
## 34 92 2 28
## 35 100 2 26
## 36 86 2 19
## 37 85 2 21
## 38 56 3 20
## 39 102 3 24
## 40 106 3 23
## 41 78 1 23
## 42 58 2 24
## 43 84 2 27
## 44 61 3 28
## 45 102 3 23
## 46 56 4 24
## 47 101 3 27
## 48 46 3 20
## 49 42 2 21
## 50 108 2 22
## 51 64 3 23
## 52 97 2 24
## 53 86 2 24
## 54 82 2 19
## 55 100 1 23
## 56 104 2 23
## 57 91 2 22
## 58 102 2 23
## 59 91 3 23
## 60 101 1 17
## 61 40 1 20
## 62 86 2 19
## 63 83 2 24
## 64 73 3 22
## 65 80 2 23
## 66 98 2 21
## 67 52 2 26
## 68 92 3 26
## 69 107 2 29
## 70 94 2 22
## 71 79 3 26
## 72 75 1 19
## 73 108 2 22
## 74 77 4 23
## 75 104 2 26
## 76 104 3 26
## 77 72 2 19
## 78 109 2 24
## 79 92 2 19
## 80 69 2 25
## 81 103 4 28
## 82 109 4 27
## 83 43 1 18
## 84 61 3 21
## 85 89 2 22
## 86 75 1 23
## 87 116 2 22
## 88 104 4 29
## 89 99 2 22
## 90 75 1 23
## 91 102 2 21
## 92 106 2 24
## 93 136 1 19
## 94 67 2 22
## 95 88 2 24
## 96 82 3 27
## 97 67 1 28
## 98 87 3 21
## 99 76 2 21
## 100 72 2 21
## 101 112 3 23
## 102 112 3 24
## 103 90 2 26
## 104 95 1 26
## 105 52 2 25
## 106 70 2 20
## 107 106 1 22
## 108 97 1 22
## 109 54 1 22
## 110 98 2 25
## 111 94 4 24
## 112 109 2 22
## 113 66 2 23
## 114 114 2 24
## 115 97 2 23
## 116 91 2 23
## 117 94 2 26
## 118 87 1 24
## 119 110 3 25
## 120 38 2 22
## 121 68 2 22
## 122 58 2 23
## 123 104 2 22
## 124 84 2 25
## 125 100 2 20
## 126 82 1 22
## 127 87 3 28
## 128 113 4 27
## 129 71 1 21
## 130 50 1 24
## 131 76 1 22
## 132 59 1 21
## 133 95 1 25
## 134 120 2 26
## 135 58 1 21
## 136 73 3 23
## 137 84 3 23
## 138 111 1 18
## 139 79 1 24
## 140 93 1 20
## 141 49 2 20
## 142 101 3 20
## 143 64 2 18
## 144 90 2 22
## 145 86 1 23
## 146 95 2 20
## 147 83 2 23
## 148 84 2 20
## 149 84 2 25
## 150 41 1 19
## 151 43 1 25
## 152 42 1 26
## 153 96 3 23
## 154 83 2 26
## 155 100 1 19
## 156 101 3 25
## 157 49 1 23
## 158 86 2 23
## 159 94 2 21
## 160 56 2 23
## 161 90 2 22
## 162 80 2 24
## 163 111 3 23
## 164 119 2 19
## 165 84 2 19
## 166 67 1 20
## 167 87 1 19
## 168 73 1 25
## 169 52 2 20
## 170 77 3 22
## 171 108 2 23
## 172 104 4 21
## 173 88 1 20
## 174 103 1 27
## 175 46 1 25
## 176 74 2 19
## 177 58 2 21
## 178 76 3 23
## 179 89 4 26
## 180 45 2 22
## 181 98 2 25
## 182 104 2 23
## 183 106 3 26
## 184 113 2 20
## 185 87 2 23
## 186 80 1 19
## 187 87 2 23
## 188 76 1 24
## 189 83 1 17
## 190 69 2 24
## 191 122 3 21
## 192 68 2 19
## 193 52 2 26
## 194 92 2 28
## 195 96 1 19
## 196 102 2 24
## 197 89 1 20
## 198 57 1 22
## 199 50 2 20
## 200 83 3 23
## 201 97 4 25
## 202 126 2 19
## 203 83 2 25
## 204 90 2 25
## 205 116 4 24
## 206 44 2 25
## 207 50 1 18
## 208 68 2 23
## 209 62 2 21
## 210 67 2 26
## 211 104 2 23
## 212 110 3 26
## 213 102 2 25
## 214 69 2 20
## 215 98 1 18
## 216 79 2 22
## 217 103 1 21
## 218 104 1 18
## 219 81 2 25
## 220 144 4 25
## 221 42 1 22
## 222 77 2 24
## 223 104 2 19
## 224 95 2 24
## 225 100 2 20
## 226 50 1 23
## 227 104 3 19
## 228 119 2 26
## 229 114 1 20
## 230 136 4 23
## 231 112 2 19
## 232 87 3 24
## 233 86 2 21
## 234 52 4 27
## 235 110 3 23
## 236 90 2 21
## 237 95 2 19
## 238 41 1 25
## 239 88 2 21
## 240 105 2 20
## 241 76 2 23
## 242 78 1 21
## 243 64 3 21
## 244 113 3 24
## 245 102 2 21
## 246 65 2 22
## 247 78 2 22
## 248 122 2 22
## 249 61 2 25
## 250 109 1 22
## 251 60 2 27
## 252 109 2 27
## 253 99 4 28
## 254 61 2 25
## 255 107 2 21
## 256 86 3 21
## 257 106 2 22
## 258 100 2 23
## 259 97 2 22
## 260 93 4 28
## 261 93 1 25
## 262 60 2 22
## 263 58 2 23
## 264 94 2 23
## 265 50 2 24
## 266 111 4 28
## 267 85 2 27
## 268 81 2 17
## 269 89 2 20
## 270 90 2 24
## 271 102 2 20
## 272 100 1 18
## 273 97 2 19
## 274 105 2 21
## 275 94 2 22
## 276 56 2 22
## 277 94 2 23
## 278 72 2 21
## 279 96 4 23
## 280 99 2 20
## 281 69 2 27
## 282 98 2 21
## 283 74 3 20
## 284 94 3 23
## 285 114 2 24
## 286 60 1 22
## 287 98 2 22
## 288 105 4 25
## 289 100 2 19
## 290 94 3 21
## 291 119 2 23
## 292 83 2 23
## 293 63 3 23
## 294 76 1 26
## 295 107 3 29
## 296 90 2 21
## 297 94 4 28
## 298 64 3 24
## 299 94 2 23
## 300 84 2 21
## 301 96 3 21
## 302 47 2 18
## 303 65 2 24
## 304 64 1 18
## 305 85 3 21
## 306 113 2 24
## 307 78 1 21
## 308 56 3 19
## 309 81 3 26
## 310 87 2 19
## 311 110 2 20
## 312 108 2 29
## 313 121 2 22
## 314 98 2 25
## 315 91 2 24
## 316 86 3 25
## 317 86 2 26
## 318 63 2 20
## 319 42 3 27
## 320 70 2 25
## 321 101 2 23
## 322 102 3 24
## 323 92 2 27
## 324 83 2 20
## 325 65 4 27
## 326 104 1 20
## 327 99 4 27
## 328 104 3 29
## 329 110 2 25
## 330 74 3 22
## 331 102 2 24
## 332 104 2 23
## 333 94 1 21
## 334 54 1 23
## 335 106 1 25
## 336 41 1 18
## 337 87 2 21
## 338 92 1 24
## 339 46 3 22
## 340 110 3 27
## 341 89 2 19
## 342 109 2 24
## 343 42 2 25
## 344 87 2 21
## 345 90 2 23
## 346 105 4 23
## 347 88 2 21
## 348 87 3 22
## 349 73 2 20
## 350 102 1 24
## 351 110 3 22
## 352 99 2 22
## 353 68 3 22
## 354 95 3 26
## 355 20 1 23
## 356 117 2 24
## 357 80 1 22
## 358 65 1 21
## 359 89 3 21
## 360 65 2 22
## 361 113 2 19
## 362 121 2 19
## 363 94 2 26
## 364 92 3 26
## 365 116 3 24
## 366 94 2 24
## 367 123 2 25
## 368 106 2 26
## 369 95 4 24
## 370 76 2 20
## 371 99 2 26
## 372 58 3 23
## 373 82 1 20
## 374 96 2 27
## 375 95 2 20
## 376 79 3 25
## 377 99 1 19
## 378 113 2 26
## 379 76 2 19
## 380 100 2 26
## 381 69 3 20
## 382 83 1 28
## 383 100 1 20
## 384 98 2 22
## 385 106 1 25
## 386 93 3 22
## 387 92 3 21
## 388 115 2 27
## 389 112 2 23
## 390 77 3 22
## 391 95 1 21
## 392 109 2 25
## 393 100 2 24
## 394 93 4 27
## 395 107 2 25
## 396 87 3 21
## 397 69 2 20
## 398 80 1 25
## 399 98 1 18
## 400 81 2 22
Just to remind myself what I want to complete
Fit a regression of child test scores on mother’s age
m2<-lm(ppvt~momage,data=iq)
summary(m2)
##
## Call:
## lm(formula = ppvt ~ momage, data = iq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.109 -11.798 2.971 14.860 55.210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.7827 8.6880 7.802 5.42e-14 ***
## momage 0.8403 0.3786 2.219 0.027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.34 on 398 degrees of freedom
## Multiple R-squared: 0.01223, Adjusted R-squared: 0.009743
## F-statistic: 4.926 on 1 and 398 DF, p-value: 0.02702
par(mfrow=c(2,2))
plot(m2)
Comments:
What age would I suggest moms give birth?
plot(iq$momage,iq$ppvt,xlab="Mother Age",ylab="Child Test Score")
abline(m2)
summary(m2)
##
## Call:
## lm(formula = ppvt ~ momage, data = iq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.109 -11.798 2.971 14.860 55.210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.7827 8.6880 7.802 5.42e-14 ***
## momage 0.8403 0.3786 2.219 0.027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.34 on 398 degrees of freedom
## Multiple R-squared: 0.01223, Adjusted R-squared: 0.009743
## F-statistic: 4.926 on 1 and 398 DF, p-value: 0.02702
Comments:
m3<-lm(ppvt~momage+educ_cat, data=iq)
summary(m3)
##
## Call:
## lm(formula = ppvt ~ momage + educ_cat, data = iq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.763 -13.130 2.495 14.620 55.610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.1554 8.5706 8.069 8.51e-15 ***
## momage 0.3433 0.3981 0.862 0.389003
## educ_cat 4.7114 1.3165 3.579 0.000388 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.05 on 397 degrees of freedom
## Multiple R-squared: 0.04309, Adjusted R-squared: 0.03827
## F-statistic: 8.939 on 2 and 397 DF, p-value: 0.0001594
par(mfrow=c(2,2))
plot(m3)
#ffb3ba
#ffffba
#baffc9
#bae1ff
colors1=c('#ffb3ba','#ffffba','#baffc9','#bae1ff')
plot(iq$momage,iq$ppvt,xlab="Mother Age", ylab="Child Test Score", col=colors1,pch=20)
for(i in 1:4){
curve(cbind(1,x,i)%*%coef(m3),add=TRUE,col=colors1[i])
}
Comments:
iq$hs<-ifelse(iq$educ_cat>=2,1,0)
m4<-lm(ppvt~hs*momage,data=iq)
display(m4)
## lm(formula = ppvt ~ hs * momage, data = iq)
## coef.est coef.se
## (Intercept) 105.22 17.65
## hs -38.41 20.28
## momage -1.24 0.81
## hs:momage 2.21 0.92
## ---
## n = 400, k = 4
## residual sd = 19.85, R-Squared = 0.06
#ffb3ba
#ffffba
#baffc9
#bae1ff
colors2<-ifelse(iq$hs==1,"#ffffba","#ffb3ba")
plot(iq$momage,iq$ppvt,xlab="Mother Age", ylab="Child Test Score", col= colors2, pch=20)
curve(cbind(1,1,x,1*x)%*% coef(m4), add=TRUE,col="#ffffba")
curve(cbind(1,0,x,0*x)%*% coef(m4), add=TRUE,col="#ffb3ba")
# split data set into training and test sets
iq.trainer<-iq[1:200,]
iq.tester<-iq[201:dim(iq)[1],]
# fit linear model
m5<-lm(ppvt~momage+educ_cat,data=iq.trainer)
summary(m5)
##
## Call:
## lm(formula = ppvt ~ momage + educ_cat, data = iq.trainer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.358 -12.967 2.866 14.435 58.428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.6295 11.8202 5.383 2.07e-07 ***
## momage 0.4473 0.5516 0.811 0.41836
## educ_cat 5.4434 1.8228 2.986 0.00318 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.58 on 197 degrees of freedom
## Multiple R-squared: 0.06199, Adjusted R-squared: 0.05246
## F-statistic: 6.509 on 2 and 197 DF, p-value: 0.001831
# make predictions
iq.predictor<-predict(m5,iq.tester)
plot(iq.predictor,iq.tester$ppvt,xlab="Predicted",ylab="Observed")
abline(a=0,b=1)
The folder beauty contains data from Hamermesh and Parker (2005) on student evaluations of instructors’ beauty and teaching quality for several courses at the University of Texas. The teaching evaluations were conducted at the end of the semester, and the beauty judgments were made later, by six students who had not attended the classes and were not aware of the course evaluations.
beautiful<-read.csv("http://www.stat.columbia.edu/~gelman/arm/examples/beauty/ProfEvaltnsBeautyPublic.csv")
head(beautiful)
## tenured profnumber minority age beautyf2upper beautyflowerdiv beautyfupperdiv
## 1 0 1 1 36 6 5 7
## 2 1 2 0 59 2 4 4
## 3 1 3 0 51 5 5 2
## 4 1 4 0 40 4 2 5
## 5 0 5 0 31 9 7 9
## 6 1 6 0 62 5 6 6
## beautym2upper beautymlowerdiv beautymupperdiv btystdave btystdf2u
## 1 6 2 4 0.2015666 0.2893519
## 2 3 2 3 -0.8260813 -1.6193560
## 3 3 2 3 -0.6603327 -0.1878249
## 4 2 3 3 -0.7663125 -0.6650018
## 5 6 7 6 1.4214450 1.7208830
## 6 6 5 5 0.5002196 -0.1878249
## btystdfl btystdfu btystdm2u btystdml btystdmu class1 class2 class3
## 1 0.4580018 0.8758139 0.6817153 -0.9000649 -0.1954181 0 0 1
## 2 -0.0735065 -0.5770065 -1.1319040 -0.9000649 -0.6546507 0 0 0
## 3 0.4580018 -1.5455530 -1.1319040 -0.9000649 -0.6546507 0 0 0
## 4 -1.1365230 -0.0927330 -1.7364440 -0.3125226 -0.6546507 0 1 0
## 5 1.5210190 1.8443610 0.6817153 2.0376470 0.7230470 0 0 0
## 6 0.9895102 0.3915404 0.6817153 0.8625621 0.2638144 0 0 0
## class4 class5 class6 class7 class8 class9 class10 class11 class12 class13
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## class14 class15 class16 class17 class18 class19 class20 class21 class22
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## class23 class24 class25 class26 class27 class28 class29 class30
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## courseevaluation didevaluation female formal fulldept lower multipleclass
## 1 4.3 24 1 0 1 0 1
## 2 4.5 17 0 0 1 0 0
## 3 3.7 55 0 0 1 0 1
## 4 4.3 40 1 0 1 0 1
## 5 4.4 42 1 0 1 0 0
## 6 4.2 182 0 1 1 0 0
## nonenglish onecredit percentevaluating profevaluation students tenuretrack
## 1 0 0 55.81395 4.7 43 1
## 2 0 0 85.00000 4.6 20 1
## 3 0 0 100.00000 4.1 55 1
## 4 0 0 86.95652 4.5 46 1
## 5 0 0 87.50000 4.8 48 1
## 6 0 0 64.53901 4.4 282 1
## blkandwhite btystdvariance btystdavepos btystdaveneg
## 1 0 2.1298060 0.201567 0.000000
## 2 0 1.3860810 0.000000 -0.826081
## 3 0 2.5374350 0.000000 -0.660333
## 4 0 1.7605770 0.000000 -0.766312
## 5 0 1.6931000 1.421450 0.000000
## 6 0 0.9447419 0.500220 0.000000
mdl_beaut1 <- lm(btystdave ~ courseevaluation + female + age, data = beautiful)
summary(mdl_beaut1)
##
## Call:
## lm(formula = btystdave ~ courseevaluation + female + age, data = beautiful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80374 -0.54913 -0.08697 0.45625 1.87299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.167859 0.335388 -0.500 0.617
## courseevaluation 0.265700 0.063100 4.211 3.06e-05 ***
## female 0.124219 0.073802 1.683 0.093 .
## age -0.021403 0.003684 -5.809 1.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7404 on 459 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1187
## F-statistic: 21.73 on 3 and 459 DF, p-value: 3.543e-13
par(mfrow=c(2,2))
plot(mdl_beaut1)
Comments:Residuals versus leverage does not look like it has equal variance across the leverage scale
beta.hat1 <- coef(mdl_beaut1)
beta.sim1 <- sim(mdl_beaut1)@coef
par(mfrow=c(1,3))
# plot course evaluation against beauty score
plot(beautiful$courseevaluation, beautiful$btystdave, pch=20, xlab="Course Score", ylab="Beauty Score")
for (i in 1:10){
curve (cbind (1, x, mean(beautiful$female), mean(beautiful$age)) %*% beta.sim1[i,], lwd=.5, col="#ffb3ba", add=TRUE)
}
curve(cbind(1,x,mean(beautiful$female),mean(beautiful$age))%*%beta.hat1,col="black",add=TRUE)
# plot sex against beauty score
plot(beautiful$female,beautiful$btystdave,pch=20,xlab="Female",ylab="Beauty Score")
for(i in 1:10){
curve(cbind(1,mean(beautiful$courseevaluation),x,mean(beautiful$age))%*%beta.sim1[i,],lwd=.5,col="#ffb3ba",add=TRUE)
}
curve(cbind(1,mean(beautiful$courseevaluation),x,mean(beautiful$age))%*%beta.hat1,col="black",add=TRUE)
# plot age against beauty score
plot(beautiful$age,beautiful$btystdave,pch=20,xlab="Age",ylab="Beauty Score")
for(i in 1:10){
curve(cbind(1,mean(beautiful$courseevaluation),mean(beautiful$female),x)%*%beta.sim1[i,],lwd=.5,col="#ffb3ba",add=TRUE)
}
curve(cbind(1,mean(beautiful$courseevaluation),mean(beautiful$female),x)%*%beta.hat1,col="black",add=TRUE)
mdl_b2<-lm(btystdave~courseevaluation*female+age,data=beautiful)
summary(mdl_b2)
##
## Call:
## lm(formula = btystdave ~ courseevaluation * female + age, data = beautiful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80938 -0.54166 -0.09022 0.42178 1.82833
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.512488 0.387154 -1.324 0.1863
## courseevaluation 0.356584 0.081244 4.389 1.41e-05 ***
## female 1.020189 0.511609 1.994 0.0467 *
## age -0.021899 0.003686 -5.940 5.63e-09 ***
## courseevaluation:female -0.226481 0.127976 -1.770 0.0774 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7387 on 458 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1227
## F-statistic: 17.16 on 4 and 458 DF, p-value: 4.001e-13
par(mfrow=c(2,2))
plot(mdl_b2)
Comments:
We will now fit a model which, compared to m2, adds a few more input variables. As we will see, this model, although slighly more complicated, will be able to explain better the variance of our outcome variable.
mdl_b3<-lm(btystdave~courseevaluation*female+age+blkandwhite+formal,data=beautiful)
summary(mdl_b3)
##
## Call:
## lm(formula = btystdave ~ courseevaluation * female + age + blkandwhite +
## formal, data = beautiful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6987 -0.5367 -0.0485 0.3941 1.8714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.270223 0.370255 -0.730 0.46587
## courseevaluation 0.314151 0.077555 4.051 6.00e-05 ***
## female 1.356260 0.490874 2.763 0.00596 **
## age -0.025580 0.003569 -7.167 3.11e-12 ***
## blkandwhite 0.552726 0.092295 5.989 4.29e-09 ***
## formal 0.253208 0.090476 2.799 0.00535 **
## courseevaluation:female -0.330770 0.123321 -2.682 0.00758 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.703 on 456 degrees of freedom
## Multiple R-squared: 0.2158, Adjusted R-squared: 0.2054
## F-statistic: 20.91 on 6 and 456 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(mdl_b3)
Comments: