Chapter 07 (page 297): 6, 10

6 (a)

library(ISLR)
library(boot)
attach(Wage)

MSE via cross-validation

set.seed(42)

# 10-fold CV estimate of the test MSE for polynomial degrees 1 through 10
MSE = vector(mode = "list", length = 10)
for (i in 1:10) {
  fit = glm(wage ~ poly(age, i), data = Wage)
  MSE[i] = cv.glm(Wage, fit, K = 10)$delta[1]
}
MSE
## [[1]]
## [1] 1676.334
## 
## [[2]]
## [1] 1601.952
## 
## [[3]]
## [1] 1597.313
## 
## [[4]]
## [1] 1594.688
## 
## [[5]]
## [1] 1595.061
## 
## [[6]]
## [1] 1592.922
## 
## [[7]]
## [1] 1594.542
## 
## [[8]]
## [1] 1596.803
## 
## [[9]]
## [1] 1592.672
## 
## [[10]]
## [1] 1596.282
plot(x = 1:10, y = unlist(MSE), type = "b", xlab = "Degree", ylab = "CV MSE")

ANOVA

fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
fit.7=lm(wage~poly(age,7),data=Wage)
fit.8=lm(wage~poly(age,8),data=Wage)

anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6484 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8926  0.001676 ** 
## 4   2995 4771604  1      6070   3.8113  0.051002 .  
## 5   2994 4770322  1      1283   0.8053  0.369590    
## 6   2993 4766389  1      3932   2.4690  0.116221    
## 7   2992 4763834  1      2555   1.6044  0.205381    
## 8   2991 4763707  1       127   0.0795  0.777952    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
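
Because poly() generates orthogonal polynomials, the squared t-statistics from the largest model reproduce the F-statistics in this table; a quick cross-check using fit.8 from above:

# for each polynomial term, the squared t-statistic equals the corresponding F-statistic above
coef(summary(fit.8))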

Plot of the degree-4 fit

fit = lm(wage ~ poly(age, 4), data = Wage)

agelims = range(age)
age.grid = seq(from = agelims[1], to = agelims[2])

preds = predict(fit, newdata = list(age = age.grid), se = TRUE)

plot(age, wage, xlim = agelims, cex = .5, col = "blue")
title("Polynomial of degree 4")
lines(age.grid, preds$fit, lwd = 2, col = "black")
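
Since predict() was called with se = TRUE, the pointwise standard errors in preds$se.fit can also be drawn as approximate 95% confidence bands. A short optional addition to the plot above:

# dotted lines at fit +/- 2 standard errors
se.bands = cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
matlines(age.grid, se.bands, lwd = 1, lty = 3, col = "black")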

From the cross-validation MSE we see that degree 9 has the lowest error, but its advantage over degrees 6 and 4 is small, so a lower degree such as 6 is preferable. The ANOVA p-values likewise indicate that a degree-3 or degree-4 polynomial is well suited, which motivates the degree-4 fit plotted above.
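
The minimizing degree can also be read off programmatically rather than from the plot; a minimal sketch, assuming the MSE list computed above is still in the workspace:

cv.errors = unlist(MSE)
which.min(cv.errors)   # degree with the smallest CV error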

(b)

Cross-validation

# 10-fold CV error for step functions of age with 2 to 20 intervals;
# element 1 of the list is left unused (NULL) since a single interval is just an intercept
error = vector(mode = "list", length = 20)

for (i in 2:20) {
  Wage$age.cut = cut(Wage$age, i)
  fit2 = glm(wage ~ age.cut, data = Wage)
  error[i] = cv.glm(Wage, fit2, K = 10)$delta[1]
}
error
## [[1]]
## NULL
## 
## [[2]]
## [1] 1733.666
## 
## [[3]]
## [1] 1683.238
## 
## [[4]]
## [1] 1634.605
## 
## [[5]]
## [1] 1630.162
## 
## [[6]]
## [1] 1625.805
## 
## [[7]]
## [1] 1612.094
## 
## [[8]]
## [1] 1601.004
## 
## [[9]]
## [1] 1611.51
## 
## [[10]]
## [1] 1606.099
## 
## [[11]]
## [1] 1600.554
## 
## [[12]]
## [1] 1603.106
## 
## [[13]]
## [1] 1605.808
## 
## [[14]]
## [1] 1608.233
## 
## [[15]]
## [1] 1608.797
## 
## [[16]]
## [1] 1597.154
## 
## [[17]]
## [1] 1600.866
## 
## [[18]]
## [1] 1605.194
## 
## [[19]]
## [1] 1603.806
## 
## [[20]]
## [1] 1605.96
plot(x = 2:20, y = unlist(error[-1]), type = "b", xlab = "Number of intervals", ylab = "CV MSE")
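
As in part (a), the number of intervals with the lowest CV error can be checked directly; a minimal sketch, assuming the error list above is still available:

cv.errors.cut = unlist(error[-1])
which.min(cv.errors.cut) + 1   # +1 because the single-interval entry was skipped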

Plot of the step-function fit

fit2 = lm(wage ~ cut(age, 11), data = Wage)

preds = predict(fit2, newdata = list(age = age.grid), se = TRUE)

plot(age, wage, xlim = agelims, cex = .5, col = "blue")
title("Step function with 11 intervals")
lines(age.grid, preds$fit, lwd = 2, col = "black")

10 (a)

library(leaps)
set.seed(42)

# 80/20 train-test split of the College data
train = sample(nrow(College), 0.8 * nrow(College))

college.train = College[train, ]
college.test = College[-train, ]

# forward stepwise selection with Outstate as the response
college.fwd = regsubsets(Outstate ~ ., data = college.train, nvmax = 17, method = "forward")

plot(summary(college.fwd)$cp, xlab = "# of Variables", ylab = "Cp", type = "l")

plot(summary(college.fwd)$bic, xlab = "# of Variables", ylab = "BIC", type = "l")
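
The model size at which Cp and BIC bottom out can also be extracted directly instead of being read off the plots; a minimal sketch using college.fwd from above:

fwd.summary = summary(college.fwd)
which.min(fwd.summary$cp)    # number of variables minimizing Cp
which.min(fwd.summary$bic)   # number of variables minimizing BIC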

Coefficients

coef(college.fwd, id = 5)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3273.3044374  2972.8394338     1.1316869    44.8780875    64.4774711 
##        Expend 
##     0.1988544

Mean of Room.Board (training set)

mean(college.train$Room.Board)
## [1] 4349.982

Mean of Room.Board (full data)

mean(College$Room.Board)
## [1] 4357.526

Standard deviation of Room.Board (training set)

sd(college.train$Room.Board)
## [1] 1075.48
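
These summaries help put the Room.Board coefficient from the 5-variable model in context; a minimal sketch of the implied effect of a one-standard-deviation increase in Room.Board, using the objects above:

# coefficient times the training-set SD of Room.Board gives the associated change in Outstate
coef(college.fwd, id = 5)["Room.Board"] * sd(college.train$Room.Board)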

(b)

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
# GAM: smoothing splines on the quantitative predictors selected above; Private enters linearly
college.gam = gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend),
                  data = college.train)
plot(college.gam, se = TRUE)

(c)

Test-set predictions

preds = predict(college.gam, newdata = college.test)
error = mean((college.test$Outstate - preds)^2)   # test-set MSE
error
## [1] 4623577
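
For reference, the GAM's test MSE can be compared against an ordinary least-squares fit on the same five predictors; a minimal sketch (the linear fit college.lm is introduced here only for this comparison and is not part of the original solution):

college.lm = lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend,
                data = college.train)
lm.preds = predict(college.lm, newdata = college.test)
mean((college.test$Outstate - lm.preds)^2)   # linear-model test MSE for comparison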

(d)

summary(college.gam)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend), data = college.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -5480.39 -1164.28    34.94  1258.54  5157.38 
## 
## (Dispersion Parameter for gaussian family taken to be 3450251)
## 
##     Null Deviance: 9963544806 on 620 degrees of freedom
## Residual Deviance: 2080501867 on 603.0001 degrees of freedom
## AIC: 11130.56 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2772847304 2772847304  803.67 < 2.2e-16 ***
## s(Room.Board)    1 2034012966 2034012966  589.53 < 2.2e-16 ***
## s(PhD)           1  708409900  708409900  205.32 < 2.2e-16 ***
## s(perc.alumni)   1  403098246  403098246  116.83 < 2.2e-16 ***
## s(Expend)        1  736933460  736933460  213.59 < 2.2e-16 ***
## Residuals      603 2080501867    3450251                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F  Pr(F)    
## (Intercept)                             
## Private                                 
## s(Room.Board)        3  1.774 0.1510    
## s(PhD)               3  0.906 0.4377    
## s(perc.alumni)       3  1.748 0.1561    
## s(Expend)            3 32.132 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA for parametric effects, every predictor is highly significant. The ANOVA for nonparametric effects, however, shows that only Expend has a p-value below the 0.05 significance level, so there is strong evidence of a non-linear relationship between Expend and the response "Outstate", while Room.Board, PhD, and perc.alumni show no significant non-linear effects and appear adequately modeled as linear terms.
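
One way to check this conclusion is to refit a GAM in which only Expend keeps its smooth term and compare the two fits; a minimal sketch (the reduced model college.gam2 is an illustration, not part of the original solution):

# reduced GAM: only Expend retains a smoothing spline
college.gam2 = gam(Outstate ~ Private + Room.Board + PhD + perc.alumni + s(Expend),
                   data = college.train)
anova(college.gam2, college.gam, test = "F")   # does smoothing the other terms improve the fit?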