Problem 9.10 Cont

bailsofhay — Nov 25, 2013, 5:57 PM

data=read.table("http://www.stat.lsu.edu/exstweb/statlab/datasets/KNNLData/CH09PR10.txt")
names(data)=c("y","x1","x2","x3","x4")
##### Continuation  Problem ###########

####Part a
### There can be ___ Models:

modelnum=2^(5-1) ### 16

#### Part b  ################

# fit0=lm(y~ 0, data=data) R^2 will euqal zero since there is nothing to compare


fit1=lm(y~x1, data=data)
summary(fit1)$r.square
[1] 0.2646

fit2=lm(y~x2, data=data)
summary(fit2)$r.square
[1] 0.247

fit3=lm(y~x3, data=data)
summary(fit3)$r.square
[1] 0.8047

fit4=lm(y~x4, data=data)
summary(fit4)$r.square
[1] 0.7558

fit5=lm(y~x1+x2, data=data)
summary(fit5)$r.square
[1] 0.4642

fit6=lm(y~x1+x3, data=data)
summary(fit6)$r.square
[1] 0.933

fit7=lm(y~x1+x4, data=data)
summary(fit7)$r.square
[1] 0.8153

fit8=lm(y~x2+x3, data=data)
summary(fit8)$r.square
[1] 0.8061

fit9=lm(y~x2+x4, data=data)
summary(fit9)$r.square
[1] 0.7833

fit10=lm(y~x3+x4, data=data)
summary(fit10)$r.square
[1] 0.8773

fit11=lm(y~x1+x2+x3, data=data)
summary(fit11)$r.square
[1] 0.9341

fit12=lm(y~x1+x2+x4, data=data)
summary(fit12)$r.square
[1] 0.8454

fit13=lm(y~x1+x3+x4, data=data)
summary(fit13)$r.square
[1] 0.9615

fit14=lm(y~x2+x3+x4, data=data)
summary(fit14)$r.square
[1] 0.879

fit15=lm(y~x1+x2+x3+x4, data=data)
summary(fit15)$r.square
[1] 0.9629
# The best model is Yi+bo+b1+b2+b3+b4 --> this is expected since are R^2 cannot
# lower when more variable are added into the model. 

########### Part c ##############

## Cp
sigsqhat <- summary(fit3)$sigma^2
Cp = sum(fit3$residuals^2)/sigsqhat + 2*fit3$rank-nrow(data)
Cp
[1] 2

sigsqhat <- summary(fit12)$sigma^2
Cp = sum(fit12$residuals^2)/sigsqhat + 2*fit12$rank-nrow(data)
Cp
[1] 4

### Yi=B0+b3X3 is preferred since it has the smaller Cp value

############ Part d ##############

library(MASS)
stepAIC(fit3)
Start:  AIC=110.5
y ~ x3

       Df Sum of Sq  RSS AIC
<none>              1768 110
- x3    1      7286 9054 149

Call:
lm(formula = y ~ x3, data = data)

Coefficients:
(Intercept)           x3  
    -106.13         1.97  
# 110.47

stepAIC(fit12)
Start:  AIC=108.6
y ~ x1 + x2 + x4

       Df Sum of Sq  RSS AIC
<none>              1400 109
- x2    1       272 1673 111
- x1    1       562 1962 115
- x4    1      3451 4851 138

Call:
lm(formula = y ~ x1 + x2 + x4, data = data)

Coefficients:
(Intercept)           x1           x2           x4  
    -78.541        0.252        0.212        1.288  
# 108.64

### Yi= x1+x2+x4 is preferred since it has the smaller value

############ Part e ############

stepAIC(fit3,k=log(25))
Start:  AIC=112.9
y ~ x3

       Df Sum of Sq  RSS AIC
<none>              1768 113
- x3    1      7286 9054 150

Call:
lm(formula = y ~ x3, data = data)

Coefficients:
(Intercept)           x3  
    -106.13         1.97  
# 112.91
stepAIC(fit12,k=log(25))
Start:  AIC=113.5
y ~ x1 + x2 + x4

       Df Sum of Sq  RSS AIC
<none>              1400 114
- x2    1       272 1673 115
- x1    1       562 1962 119
- x4    1      3451 4851 141

Call:
lm(formula = y ~ x1 + x2 + x4, data = data)

Coefficients:
(Intercept)           x1           x2           x4  
    -78.541        0.252        0.212        1.288  
# 113.51

### Yi=bo+b3x3 is prefferred since it has the smaller value

############ Part f #############
# The results in d and e do not agree which makes since since AIC is less 
# conservative than SBC due to 2*p vs p(log(n)). The SBC formula will affect the 
# chosen model more for complexity.