##6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.##

#(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.#

library(ISLR)
library(tidyverse)
library(modelr)
library(splines)
library(corrplot)
library(caret)
library(boot)
library(leaps)
library(MASS)
library(gam)
attach(Wage)
range(age)
## [1] 18 80
names(Wage)
##  [1] "year"       "age"        "maritl"     "race"       "education" 
##  [6] "region"     "jobclass"   "health"     "health_ins" "logwage"   
## [11] "wage"
data("Wage")
set.seed(8)
cv.error <- rep(NA,10)
for (i in 1:10) {
  poly.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, poly.fit, K = 10)$delta[1]
}

plot(x=1:10, y=cv.error, xlab='Degree Polynomial',
                         ylab='CV MSE', 
                         type='b', 
                         main='CV MSE vs Degree of Polynomial. Wage Vs Age')

points( x=which.min(cv.error), y=min(cv.error), col='red', pch=16,cex=3)
axis(1, at = seq(1,10, by=1))

It appears that a polynomial with degree 9 has the lowest CV MSE.
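
As a quick check, the chosen degree can be read off directly (this just indexes the cv.error vector computed above):

which.min(cv.error)
## [1] 9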

agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
fit=lm(wage~poly(age,9,raw=T),data=Wage)
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.7,col="lightblue")
title("Degree-9 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=3,col="pink")
matlines(age.grid,se.bands,lwd=2,col="darkgray")
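
For a visual sense of how this compares with the ANOVA result below, the degree-3 fit can be overlaid on the same plot (a quick sketch; the lines() call assumes the plot above is still the active device):

fit3=lm(wage~poly(age,3),data=Wage)
preds3=predict(fit3,newdata=list(age=age.grid))
lines(age.grid,preds3,lwd=3,col="blue")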

fit.1=lm(wage~education+poly(age,1),data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
fit.4=lm(wage~education+poly(age,4),data=Wage)
fit.5=lm(wage~education+poly(age,5),data=Wage)
fit.6=lm(wage~education+poly(age,6),data=Wage)
fit.7=lm(wage~education+poly(age,7),data=Wage)
fit.8=lm(wage~education+poly(age,8),data=Wage)
fit.9=lm(wage~education+poly(age,9),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9)
## Analysis of Variance Table
## 
## Model 1: wage ~ education + poly(age, 1)
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
## Model 4: wage ~ education + poly(age, 4)
## Model 5: wage ~ education + poly(age, 5)
## Model 6: wage ~ education + poly(age, 6)
## Model 7: wage ~ education + poly(age, 7)
## Model 8: wage ~ education + poly(age, 8)
## Model 9: wage ~ education + poly(age, 9)
##   Res.Df     RSS Df Sum of Sq        F  Pr(>F)    
## 1   2994 3867992                                  
## 2   2993 3725395  1    142597 114.8380 < 2e-16 ***
## 3   2992 3719809  1      5587   4.4991 0.03399 *  
## 4   2991 3719777  1        32   0.0256 0.87299    
## 5   2990 3716972  1      2805   2.2588 0.13296    
## 6   2989 3715061  1      1911   1.5390 0.21487    
## 7   2988 3711220  1      3841   3.0937 0.07870 .  
## 8   2987 3711219  1         1   0.0005 0.98164    
## 9   2986 3707787  1      3432   2.7640 0.09651 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA picks the degree-3 polynomial (terms beyond the cubic are not significant at the 5% level), while cross-validation picked degree 9, so the two approaches give quite different answers.
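
Note that these nested models also include education as a covariate. A minimal sketch of the same comparison using age-only polynomials, which is what the exercise literally asks for, would be:

fits <- lapply(1:9, function(d) lm(wage ~ poly(age, d), data = Wage))
do.call(anova, fits)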

#(b) Fit a step function to predict wage using age, and perform cross validation to choose the optimal number of cuts. Make a plot of the fit obtained.#

set.seed(8)
cv.error <- rep(NA, 10)
for (i in 2:10) {
  Wage$cut.age <- cut(Wage$age, i)
  cut.fit <- glm(wage ~ cut.age, data = Wage)
  cv.error[i] <- cv.glm(Wage, cut.fit, K = 10)$delta[1]
}

plot(x=2:10, y=cv.error[-1], xlab="No. of Cuts", 
                            ylab='CV MSE', 
                            main='Determine No. Cuts vs CV MSE',type='b')
points( x=which.min(cv.error),  y=cv.error[which.min(cv.error)], col='orange', pch=18,cex=3)
axis(1, at = seq(2,10, by=1))
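
The selected number of cuts can also be read off directly (index i of cv.error corresponds to i cuts, so which.min() returns the chosen value):

which.min(cv.error)
## [1] 10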

As an aside (following the chapter lab), we can also fit a degree-10 logistic polynomial to the indicator wage > 250:

fit=glm(I(wage>250)~poly(age,10),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)

pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age),I((wage>250)/5),cex=.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2,col="red")
matlines(age.grid,se.bands,lwd=1,col="#7fcdbb",lty=3)

fit=lm(wage~cut(age,10),data=Wage)
coef(summary(fit))
##                          Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)              73.34442   3.024122 24.253129 8.966890e-119
## cut(age, 10)(24.2,30.4]  24.35898   3.709111  6.567337  6.017962e-11
## cut(age, 10)(30.4,36.6]  35.44142   3.569564  9.928782  7.023380e-23
## cut(age, 10)(36.6,42.8]  44.76684   3.478237 12.870553  6.191956e-37
## cut(age, 10)(42.8,49]    46.70694   3.412621 13.686529  2.106651e-41
## cut(age, 10)(49,55.2]    43.14541   3.574130 12.071585  8.532713e-33
## cut(age, 10)(55.2,61.4]  46.38852   3.882374 11.948493  3.529874e-32
## cut(age, 10)(61.4,67.6]  43.21103   5.080990  8.504452  2.839593e-17
## cut(age, 10)(67.6,73.8]  23.27101   7.795645  2.985129  2.857559e-03
## cut(age, 10)(73.8,80.1]  21.21671  11.500230  1.844895  6.515172e-02
table(cut(age,10))
## 
## (17.9,24.2] (24.2,30.4] (30.4,36.6] (36.6,42.8]   (42.8,49]   (49,55.2] 
##         175         347         445         542         640         441 
## (55.2,61.4] (61.4,67.6] (67.6,73.8] (73.8,80.1] 
##         270          96          31          13

Cross-validation chooses 10 cuts as the optimal number.
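
The exercise also asks for a plot of the fit obtained; a minimal sketch (this relies on age.grid spanning the same 18-80 range as age, so cut() reproduces the same ten bins at prediction time):

cut.fit=glm(wage~cut(age,10),data=Wage)
cut.preds=predict(cut.fit,newdata=list(age=age.grid))
plot(age,wage,xlim=agelims,cex=.7,col="lightblue")
title("10-Cut Step Function Fit")
lines(age.grid,cut.preds,lwd=3,col="red")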

##10. This question relates to the College data set.##

#(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.#

detach(Wage)
attach(College)
dim(College)
## [1] 777  18
names(College)
##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"
set.seed(4)
train <- sample(nrow(College), size = 0.7*nrow(College))
TRAIN <- College[train,]
TEST <- College[-train,]
regfit.fwd <- regsubsets(Outstate ~ ., data = TRAIN, nvmax = 18, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = TRAIN, nvmax = 18, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  "*"    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  "*"    "*"    " "       " "       " "        
## 10  ( 1 ) "*"        "*"  "*"    "*"    " "       " "       " "        
## 11  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 10  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "      
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "      
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    "*"      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
set.seed(4)
regfit.best <- regsubsets(Outstate ~ ., data = College[train,], nvmax = 17)
test.mat <- model.matrix(Outstate ~ ., data = College[-train,])
val.errors <- rep(NA, 17)
for (i in 1:17) {
  coefi <- coef(regfit.best, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((College$Outstate[-train] - pred)^2)}
val.errors
##  [1] 8814713 7016592 5179712 4650402 4370643 3993738 3937748 4151396 3945736
## [10] 3772875 3714789 3680013 3654424 3646198 3665541 3667384 3666294
min(val.errors)
## [1] 3646198
which.min(val.errors)
## [1] 14
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
regfit.best <- regsubsets(Outstate ~ ., data = College, nvmax = 17)
coef(regfit.best, 14)
##   (Intercept)    PrivateYes          Apps        Accept        Enroll 
## -1.817040e+03  2.256946e+03 -2.999022e-01  8.023519e-01 -5.372545e-01 
##     Top10perc   F.Undergrad    Room.Board      Personal           PhD 
##  2.365529e+01 -9.569936e-02  8.741819e-01 -2.478418e-01  1.269506e+01 
##      Terminal     S.F.Ratio   perc.alumni        Expend     Grad.Rate 
##  2.297296e+01 -4.700560e+01  4.195006e+01  2.003912e-01  2.383197e+01

The validation-set MSE is minimized by the model with 14 variables, so I select the 14-variable model; the coefficients above come from refitting it on the full data set.
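
For completeness, the predict.regsubsets() helper defined above can compute the same validation errors more compactly; a sketch applied to the forward-selection fit (results may differ slightly from the exhaustive search used for val.errors):

val.errors.fwd <- sapply(1:17, function(i)
  mean((TEST$Outstate - predict.regsubsets(regfit.fwd, TEST, id = i))^2))
which.min(val.errors.fwd)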

#(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.#

gam.fit <- gam(Outstate ~ Private + s(Apps,4) + s(Accept,4) + s(Enroll,4) +
               s(Top10perc,4) + s(Expend,4) + s(Grad.Rate,4) + s(F.Undergrad,4) +
               s(Room.Board,4) + s(Personal,4) + s(PhD,4) + s(Terminal,4) +
               s(S.F.Ratio,4) + s(perc.alumni,4), data = College, subset = train)

par(mfrow = c(2, 3))
plot(gam.fit, se = TRUE, col = "green")

Private schools charge higher out-of-state tuition. Apps, Enroll, F.Undergrad, and Personal are negatively related to Outstate; Accept, Expend, Grad.Rate, Room.Board, Terminal, S.F.Ratio, and perc.alumni are positively related; PhD and Top10perc show almost no effect.

#(c) Evaluate the model obtained on the test set, and explain the results obtained.#

pred <- predict(gam.fit, TEST)
error.rate <- mean((TEST$Outstate - pred)^2)

cat('The GAM model MSE is: ', error.rate)
## The GAM model MSE is:  3637684
r2 <- 1 - error.rate/mean((TEST$Outstate - mean(TEST$Outstate))^2)
cat('The R2 of the GAM model is: ', r2)
## The R2 of the GAM model is:  0.7800168

The R2 value means the GAM explains roughly 78% of the variance in out-of-state tuition on the test set, using the 14 selected predictors.
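
As a point of comparison (not part of the exercise), a plain linear model on the same 14 predictors can be scored on the same test set, to gauge how much the smooth terms contribute:

lm.fit <- lm(Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Expend +
             Grad.Rate + F.Undergrad + Room.Board + Personal + PhD + Terminal +
             S.F.Ratio + perc.alumni, data = College, subset = train)
mean((TEST$Outstate - predict(lm.fit, TEST))^2)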

#(d) For which variables, if any, is there evidence of a non-linear relationship with the response?#

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Apps, 4) + s(Accept, 4) + 
##     s(Enroll, 4) + s(Top10perc, 4) + s(Expend, 4) + s(Grad.Rate, 
##     4) + s(F.Undergrad, 4) + s(Room.Board, 4) + s(Personal, 4) + 
##     s(PhD, 4) + s(Terminal, 4) + s(S.F.Ratio, 4) + s(perc.alumni, 
##     4), data = College, subset = train)
## Deviance Residuals:
##       Min        1Q    Median        3Q       Max 
## -6234.202 -1015.739     8.407  1102.396  6455.232 
## 
## (Dispersion Parameter for gaussian family taken to be 3197483)
## 
##     Null Deviance: 8687534605 on 542 degrees of freedom
## Residual Deviance: 1563573009 on 489.0012 degrees of freedom
## AIC: 9727.075 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 2306077770 2306077770 721.2165 < 2.2e-16 ***
## s(Apps, 4)          1 1029107801 1029107801 321.8493 < 2.2e-16 ***
## s(Accept, 4)        1  107542360  107542360  33.6334 1.196e-08 ***
## s(Enroll, 4)        1  213208684  213208684  66.6802 2.739e-15 ***
## s(Top10perc, 4)     1  637340889  637340889 199.3258 < 2.2e-16 ***
## s(Expend, 4)        1 1049900129 1049900129 328.3520 < 2.2e-16 ***
## s(Grad.Rate, 4)     1  224152498  224152498  70.1028 5.974e-16 ***
## s(F.Undergrad, 4)   1     102593     102593   0.0321   0.85791    
## s(Room.Board, 4)    1  150920615  150920615  47.1998 1.957e-11 ***
## s(Personal, 4)      1   14325362   14325362   4.4802   0.03479 *  
## s(PhD, 4)           1   10521975   10521975   3.2907   0.07029 .  
## s(Terminal, 4)      1    4220249    4220249   1.3199   0.25118    
## s(S.F.Ratio, 4)     1   11287521   11287521   3.5301   0.06086 .  
## s(perc.alumni, 4)   1   77510613   77510613  24.2411 1.164e-06 ***
## Residuals         489 1563573009    3197483                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Apps, 4)              3  2.0923  0.100349    
## s(Accept, 4)            3 11.7818 1.831e-07 ***
## s(Enroll, 4)            3  4.4807  0.004079 ** 
## s(Top10perc, 4)         3  0.4229  0.736678    
## s(Expend, 4)            3 26.7252 4.441e-16 ***
## s(Grad.Rate, 4)         3  1.8080  0.144775    
## s(F.Undergrad, 4)       3  0.9602  0.411276    
## s(Room.Board, 4)        3  1.0445  0.372495    
## s(Personal, 4)          3  2.6775  0.046523 *  
## s(PhD, 4)               3  2.8716  0.035938 *  
## s(Terminal, 4)          3  1.6823  0.169893    
## s(S.F.Ratio, 4)         3  2.7255  0.043653 *  
## s(perc.alumni, 4)       3  1.3905  0.244905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Accept, Enroll, Expend, Personal, PhD, and S.F.Ratio show evidence of a non-linear relationship with the response: their p-values in the "Anova for Nonparametric Effects" table are below 0.05, with Expend by far the strongest.
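
A possible follow-up (a sketch, assuming the remaining terms can be treated as linear) is to keep smooths only for the six variables flagged above and compare the nested models with an F-test:

gam.fit2 <- gam(Outstate ~ Private + Apps + s(Accept,4) + s(Enroll,4) + Top10perc +
                s(Expend,4) + Grad.Rate + F.Undergrad + Room.Board + s(Personal,4) +
                s(PhD,4) + Terminal + s(S.F.Ratio,4) + perc.alumni,
                data = College, subset = train)
anova(gam.fit2, gam.fit, test = "F")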