Question 6

part a

library(ISLR2)
library(boot)

set.seed(1)
cv.error <- rep(0,5)
for (i in 1:5) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
cv.error 
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977
glm.fit1 <- lm(wage ~ poly(age,1), data = Wage)
glm.fit2 <- lm(wage ~ poly(age,2), data = Wage)
glm.fit3 <- lm(wage ~ poly(age,3), data = Wage)
glm.fit4 <- lm(wage ~ poly(age,4), data = Wage)
glm.fit5 <- lm(wage ~ poly(age,5), data = Wage)
anova(glm.fit1, glm.fit2, glm.fit3, glm.fit4, glm.fit5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
attach(Wage)
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(glm.fit3, newdata = list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)

plot(age, wage, xlim=agelims, cex=.5, col = "darkgrey")
title("Degree-3 Polynomial", outer=T)
lines(age.grid, preds$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

The third degree would be chosen. As 3, 4, and 5 degrees have the lowest CV error but to be conservative 3 would be chosen as its simpler and may be easier to interpret compared to a 4 or 5 degree polynomial model. This is supported further from the ANOVA showing its much more statistically significant that the previously mentioned models.

Question 6

part b

cv.error <- rep(0,9)

for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.error[i-1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

cv.error
## [1] 1734.999 1682.289 1636.768 1632.149 1624.180 1611.908 1601.480 1611.079
## [9] 1604.125
step.fit <- glm(wage~cut(age, 8), data=Wage)
agelims <- range(Wage$age)
age.grid <- seq(from=agelims[1], to=agelims[2])
step.pred <- predict(step.fit, data.frame(age=age.grid))
plot(wage~age, data=Wage, col="darkgrey")
lines(age.grid, step.pred, col="blue", lwd=2)

Question 10

part a

set.seed(1)
library(leaps)
train <- sample(1:nrow(College), nrow(College)/2)
test <- (-train)
y.test <- College$Outstate[test]

reg.forward <- regsubsets(Outstate ~ ., data = College, nvmax = 19, method = "forward", subset = train)
summary(reg.forward)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, nvmax = 19, 
##     method = "forward", subset = train)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    "*"       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    "*"       " "       " "        
## 10  ( 1 ) "*"        " "  " "    " "    "*"       " "       " "        
## 11  ( 1 ) "*"        " "  "*"    " "    "*"       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       " "        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       " "        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 10  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 11  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 12  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 13  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 14  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 15  ( 1 ) "*"         "*"        "*"   "*"      " " "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      " " "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         " "    " "      
## 2  ( 1 )  "*"         " "    " "      
## 3  ( 1 )  "*"         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    "*"      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
par(mfrow = c(1,3))

plot(summary(reg.forward)$cp)
x = which.min(summary(reg.forward)$cp)
y = summary(reg.forward)$cp[x]
points(x,y, col='red', cex = 2, pch = 20)
x
## [1] 14
coef(reg.forward, 9)
##   (Intercept)    PrivateYes     Top10perc    Room.Board      Personal 
## -1953.5313547  2527.8835558    18.5938114     1.0726124    -0.4021046 
##      Terminal     S.F.Ratio   perc.alumni        Expend     Grad.Rate 
##    33.5768041   -75.8939438    49.5421868     0.1474168    26.5570861

part b c and d

library(gam)
attach(College)
gam.fit <- glm(Outstate ~ Private + Top10perc + Room.Board + Personal + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = College, subset = train)
summary(gam.fit)
## 
## Call:
## glm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal + 
##     Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, 
##     data = College, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6803.9  -1429.0    -90.1   1320.4   9741.8  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.954e+03  1.144e+03  -1.707 0.088633 .  
## PrivateYes   2.528e+03  3.145e+02   8.037 1.19e-14 ***
## Top10perc    1.859e+01  9.260e+00   2.008 0.045353 *  
## Room.Board   1.073e+00  1.221e-01   8.788  < 2e-16 ***
## Personal    -4.021e-01  1.621e-01  -2.481 0.013539 *  
## Terminal     3.358e+01  9.372e+00   3.583 0.000384 ***
## S.F.Ratio   -7.589e+01  3.893e+01  -1.949 0.051982 .  
## perc.alumni  4.954e+01  1.121e+01   4.418 1.30e-05 ***
## Expend       1.474e-01  2.957e-02   4.986 9.40e-07 ***
## Grad.Rate    2.656e+01  8.102e+00   3.278 0.001143 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 4376026)
## 
##     Null deviance: 6989966760  on 387  degrees of freedom
## Residual deviance: 1654137740  on 378  degrees of freedom
## AIC: 7046.1
## 
## Number of Fisher Scoring iterations: 2
summary(gam.fit)
## 
## Call:
## glm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal + 
##     Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, 
##     data = College, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6803.9  -1429.0    -90.1   1320.4   9741.8  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.954e+03  1.144e+03  -1.707 0.088633 .  
## PrivateYes   2.528e+03  3.145e+02   8.037 1.19e-14 ***
## Top10perc    1.859e+01  9.260e+00   2.008 0.045353 *  
## Room.Board   1.073e+00  1.221e-01   8.788  < 2e-16 ***
## Personal    -4.021e-01  1.621e-01  -2.481 0.013539 *  
## Terminal     3.358e+01  9.372e+00   3.583 0.000384 ***
## S.F.Ratio   -7.589e+01  3.893e+01  -1.949 0.051982 .  
## perc.alumni  4.954e+01  1.121e+01   4.418 1.30e-05 ***
## Expend       1.474e-01  2.957e-02   4.986 9.40e-07 ***
## Grad.Rate    2.656e+01  8.102e+00   3.278 0.001143 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 4376026)
## 
##     Null deviance: 6989966760  on 387  degrees of freedom
## Residual deviance: 1654137740  on 378  degrees of freedom
## AIC: 7046.1
## 
## Number of Fisher Scoring iterations: 2
par(mfrow = c(1,3))
plot(gam.fit, se = TRUE, col = "blue")

gam.pred <- predict(gam.fit, College)[test]
mean((y.test - gam.pred)^2)
## [1] 3838132
plot(Outstate ~ Enroll)
plot(Outstate ~ Top10perc)

plot(Outstate ~ Grad.Rate)

The error we obtain is 3838132. From our plots we can see that the distribution is relatively normal. But the residuals in the Residuals vs. Fitted shows that there is a type of quadtraic relationship and lineraity is violated. This can be seen as well in the Scale-Location plot where there is non-linearity seen in the red line and the spread of the magnitudes is concentrated between 5000 and 15000.