bailsofhay — Nov 25, 2013, 5:57 PM
data=read.table("http://www.stat.lsu.edu/exstweb/statlab/datasets/KNNLData/CH09PR10.txt")
names(data)=c("y","x1","x2","x3","x4")
##### Continuation Problem ###########
####Part a
### With four candidate predictors there can be 2^4 = 16 models
### (each of x1-x4 in or out, counting the intercept-only model):
modelnum=2^(5-1) ### 16
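## Added check (not in the original post): each of x1-x4 is either in or out of
## the model, so the count is 2^4 = 16, including the intercept-only model.
incl <- expand.grid(x1=c(FALSE,TRUE), x2=c(FALSE,TRUE),
                    x3=c(FALSE,TRUE), x4=c(FALSE,TRUE))
nrow(incl)   # 16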
#### Part b ################
# fit0=lm(y~1, data=data)  # intercept-only (null) model: its R^2 would equal
# zero, since with no predictors there is no explained variation
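## Added check (sketch): fitting the null model explicitly confirms R^2 = 0,
## because every fitted value is just the mean of y.
summary(lm(y ~ 1, data=data))$r.square   # 0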
fit1=lm(y~x1, data=data)
summary(fit1)$r.square
[1] 0.2646
fit2=lm(y~x2, data=data)
summary(fit2)$r.square
[1] 0.247
fit3=lm(y~x3, data=data)
summary(fit3)$r.square
[1] 0.8047
fit4=lm(y~x4, data=data)
summary(fit4)$r.square
[1] 0.7558
fit5=lm(y~x1+x2, data=data)
summary(fit5)$r.square
[1] 0.4642
fit6=lm(y~x1+x3, data=data)
summary(fit6)$r.square
[1] 0.933
fit7=lm(y~x1+x4, data=data)
summary(fit7)$r.square
[1] 0.8153
fit8=lm(y~x2+x3, data=data)
summary(fit8)$r.square
[1] 0.8061
fit9=lm(y~x2+x4, data=data)
summary(fit9)$r.square
[1] 0.7833
fit10=lm(y~x3+x4, data=data)
summary(fit10)$r.square
[1] 0.8773
fit11=lm(y~x1+x2+x3, data=data)
summary(fit11)$r.square
[1] 0.9341
fit12=lm(y~x1+x2+x4, data=data)
summary(fit12)$r.square
[1] 0.8454
fit13=lm(y~x1+x3+x4, data=data)
summary(fit13)$r.square
[1] 0.9615
fit14=lm(y~x2+x3+x4, data=data)
summary(fit14)$r.square
[1] 0.879
fit15=lm(y~x1+x2+x3+x4, data=data)
summary(fit15)$r.square
[1] 0.9629
# By R^2 the best model is Yi = b0 + b1*X1 + b2*X2 + b3*X3 + b4*X4 --> this is
# expected, since R^2 cannot decrease when more variables are added to the model.
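## Added sketch (not part of the original run): the fifteen non-null fits above
## can be collected in one pass; this only reproduces the R^2 values already listed.
forms <- c("y~x1","y~x2","y~x3","y~x4",
           "y~x1+x2","y~x1+x3","y~x1+x4","y~x2+x3","y~x2+x4","y~x3+x4",
           "y~x1+x2+x3","y~x1+x2+x4","y~x1+x3+x4","y~x2+x3+x4",
           "y~x1+x2+x3+x4")
r2 <- sapply(forms, function(f) summary(lm(as.formula(f), data=data))$r.square)
round(sort(r2, decreasing=TRUE), 4)   # full model comes out on top, as noted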
########### Part c ##############
## Cp
sigsqhat <- summary(fit3)$sigma^2
Cp = sum(fit3$residuals^2)/sigsqhat + 2*fit3$rank-nrow(data)
Cp
[1] 2
sigsqhat <- summary(fit12)$sigma^2
Cp = sum(fit12$residuals^2)/sigsqhat + 2*fit12$rank-nrow(data)
Cp
[1] 4
### Yi = b0 + b3*X3 is preferred since it has the smaller Cp value
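## Added note (sketch, not in the original run): Mallows' Cp is usually computed
## with the error variance estimated from the FULL model. Using each candidate
## model's own MSE, as above, gives SSE_p/MSE_p = n - p, so Cp collapses to
## exactly p (hence the 2 and 4 printed above). The full-model version would be:
sigsq_full <- summary(fit15)$sigma^2
Cp3  <- sum(fit3$residuals^2)/sigsq_full  + 2*fit3$rank  - nrow(data)
Cp12 <- sum(fit12$residuals^2)/sigsq_full + 2*fit12$rank - nrow(data)
c(Cp3, Cp12)   # values not reproduced here; they differ from the output above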
############ Part d ##############
library(MASS)
stepAIC(fit3)
Start: AIC=110.5
y ~ x3
         Df Sum of Sq  RSS AIC
<none>                1768 110
- x3      1      7286 9054 149
Call:
lm(formula = y ~ x3, data = data)
Coefficients:
(Intercept) x3
-106.13 1.97
# 110.47
stepAIC(fit12)
Start: AIC=108.6
y ~ x1 + x2 + x4
         Df Sum of Sq  RSS AIC
<none>                1400 109
- x2      1       272 1673 111
- x1      1       562 1962 115
- x4      1      3451 4851 138
Call:
lm(formula = y ~ x1 + x2 + x4, data = data)
Coefficients:
(Intercept) x1 x2 x4
-78.541 0.252 0.212 1.288
# 108.64
### Yi = b0 + b1*X1 + b2*X2 + b4*X4 is preferred since it has the smaller AIC (108.64 < 110.47)
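## Added sketch: the starting AIC values that stepAIC() reports can also be read
## off directly with extractAIC(); it differs from AIC() by an additive constant,
## so only the differences between models matter.
extractAIC(fit3)    # edf and AIC for y ~ x3          (110.47 per the output above)
extractAIC(fit12)   # edf and AIC for y ~ x1+x2+x4    (108.64 per the output above)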
############ Part e ############
stepAIC(fit3,k=log(25))
Start: AIC=112.9
y ~ x3
         Df Sum of Sq  RSS AIC
<none>                1768 113
- x3      1      7286 9054 150
Call:
lm(formula = y ~ x3, data = data)
Coefficients:
(Intercept) x3
-106.13 1.97
# 112.91
stepAIC(fit12,k=log(25))
Start: AIC=113.5
y ~ x1 + x2 + x4
         Df Sum of Sq  RSS AIC
<none>                1400 114
- x2      1       272 1673 115
- x1      1       562 1962 119
- x4      1      3451 4851 141
Call:
lm(formula = y ~ x1 + x2 + x4, data = data)
Coefficients:
(Intercept) x1 x2 x4
-78.541 0.252 0.212 1.288
# 113.51
### Yi = b0 + b3*X3 is preferred since it has the smaller SBC value (112.91 < 113.51)
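## Added sketch: the same comparison with the SBC/BIC penalty, k = log(n); this
## is what stepAIC(..., k = log(25)) is reporting above.
n <- nrow(data)                  # 25
extractAIC(fit3,  k = log(n))    # 112.91 per the output above
extractAIC(fit12, k = log(n))    # 113.51 per the output above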
############ Part f #############
# The results in parts d and e do not agree, which makes sense since AIC is less
# conservative than SBC: the penalty is 2*p for AIC versus p*log(n) for SBC.
# The SBC penalizes model complexity more heavily, so it favors the smaller model.
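## Added arithmetic check: with n = 25 the per-parameter SBC penalty log(25) is
## about 3.22, which exceeds the AIC penalty of 2, so the gap widens with every
## extra parameter.
log(25)            # 3.22
2 * c(2, 4)        # AIC penalties for p = 2 (y~x3) and p = 4 (y~x1+x2+x4)
log(25) * c(2, 4)  # SBC penalties: 6.44 and 12.88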