##6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.##
#(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.#
library(ISLR)
library(tidyverse)
library(modelr)
library(splines)
library(corrplot)
library(caret)
library(boot)
library(leaps)
library(MASS)
library(gam)
data("Wage")
attach(Wage)
range(age)
## [1] 18 80
names(Wage)
## [1] "year" "age" "maritl" "race" "education"
## [6] "region" "jobclass" "health" "health_ins" "logwage"
## [11] "wage"
set.seed(8)
cv.error <- rep(NA, 10)
for (i in 1:10) {
  poly.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, poly.fit, K = 10)$delta[1]
}
plot(x = 1:10, y = cv.error, xlab = "Degree of Polynomial",
     ylab = "CV MSE", type = "b",
     main = "CV MSE vs Degree of Polynomial, Wage vs Age")
points(x = which.min(cv.error), y = min(cv.error), col = "red", pch = 16, cex = 3)
axis(1, at = seq(1, 10, by = 1))
It appears that a polynomial of degree 9 has the lowest CV MSE.
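Before committing to degree 9, it is worth checking numerically how much better it is than the simpler fits; a minimal sketch using the cv.error vector computed above:
# Sketch: how far each degree's CV MSE sits above the minimum;
# small gaps past degree 3-4 would suggest a simpler fit is competitive
round(cv.error - min(cv.error), 1)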
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- lm(wage ~ poly(age, 9, raw = TRUE), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 1), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 2, 0))
plot(age, wage, xlim = agelims, cex = .7, col = "lightblue")
title("Degree-9 Polynomial", outer = TRUE)
lines(age.grid, preds$fit, lwd = 3, col = "pink")
matlines(age.grid, se.bands, lwd = 2, col = "darkgray")
fit.1 <- lm(wage ~ education + poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ education + poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ education + poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ education + poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ education + poly(age, 5), data = Wage)
fit.6 <- lm(wage ~ education + poly(age, 6), data = Wage)
fit.7 <- lm(wage ~ education + poly(age, 7), data = Wage)
fit.8 <- lm(wage ~ education + poly(age, 8), data = Wage)
fit.9 <- lm(wage ~ education + poly(age, 9), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9)
## Analysis of Variance Table
##
## Model 1: wage ~ education + poly(age, 1)
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
## Model 4: wage ~ education + poly(age, 4)
## Model 5: wage ~ education + poly(age, 5)
## Model 6: wage ~ education + poly(age, 6)
## Model 7: wage ~ education + poly(age, 7)
## Model 8: wage ~ education + poly(age, 8)
## Model 9: wage ~ education + poly(age, 9)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2994 3867992
## 2 2993 3725395 1 142597 114.8380 < 2e-16 ***
## 3 2992 3719809 1 5587 4.4991 0.03399 *
## 4 2991 3719777 1 32 0.0256 0.87299
## 5 2990 3716972 1 2805 2.2588 0.13296
## 6 2989 3715061 1 1911 1.5390 0.21487
## 7 2988 3711220 1 3841 3.0937 0.07870 .
## 8 2987 3711219 1 1 0.0005 0.98164
## 9 2986 3707787 1 3432 2.7640 0.09651 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA hypothesis tests favor the degree-3 polynomial: the quadratic and cubic terms are significant at the 5% level, but no higher-order term is. Cross-validation instead picked degree 9, so the two approaches disagree noticeably.
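Given the disagreement, a visual comparison is informative. A minimal sketch (reusing age.grid, agelims, and the degree-9 preds from above) that overlays the ANOVA-favored degree-3 fit:
# Sketch: compare the degree-3 and degree-9 fits visually
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
preds3 <- predict(fit3, newdata = list(age = age.grid))
plot(age, wage, xlim = agelims, cex = .7, col = "lightblue")
lines(age.grid, preds$fit, lwd = 2, col = "pink")  # degree 9
lines(age.grid, preds3, lwd = 2, col = "blue")     # degree 3
legend("topright", legend = c("Degree 9", "Degree 3"),
       col = c("pink", "blue"), lwd = 2)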
#(b) Fit a step function to predict wage using age, and perform cross-validation in order to choose the optimal number of cuts. Make a plot of the fit obtained.#
set.seed(8)
cv.error <- rep(NA, 10)
for (i in 2:10) {
  Wage$cut.age <- cut(Wage$age, i)
  cut.fit <- glm(wage ~ cut.age, data = Wage)
  cv.error[i] <- cv.glm(Wage, cut.fit, K = 10)$delta[1]
}
plot(x = 2:10, y = cv.error[-1], xlab = "No. of Cuts",
     ylab = "CV MSE", type = "b",
     main = "CV MSE vs No. of Cuts")
points(x = which.min(cv.error), y = cv.error[which.min(cv.error)],
       col = "orange", pch = 18, cex = 3)
axis(1, at = seq(2, 10, by = 1))
# Logistic polynomial fit for P(wage > 250) as a function of age
fit <- glm(I(wage > 250) ~ poly(age, 10), data = Wage, family = binomial)
preds <- predict(fit, newdata = list(age = age.grid), se = TRUE)
pfit <- exp(preds$fit) / (1 + exp(preds$fit))
se.bands.logit <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
se.bands <- exp(se.bands.logit) / (1 + exp(se.bands.logit))
plot(age, I(wage > 250), xlim = agelims, type = "n", ylim = c(0, .2))
points(jitter(age), I((wage > 250) / 5), cex = .5, pch = "|", col = "darkgrey")
lines(age.grid, pfit, lwd = 2, col = "red")
matlines(age.grid, se.bands, lwd = 1, col = "#7fcdbb", lty = 3)
fit <- lm(wage ~ cut(age, 10), data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.34442 3.024122 24.253129 8.966890e-119
## cut(age, 10)(24.2,30.4] 24.35898 3.709111 6.567337 6.017962e-11
## cut(age, 10)(30.4,36.6] 35.44142 3.569564 9.928782 7.023380e-23
## cut(age, 10)(36.6,42.8] 44.76684 3.478237 12.870553 6.191956e-37
## cut(age, 10)(42.8,49] 46.70694 3.412621 13.686529 2.106651e-41
## cut(age, 10)(49,55.2] 43.14541 3.574130 12.071585 8.532713e-33
## cut(age, 10)(55.2,61.4] 46.38852 3.882374 11.948493 3.529874e-32
## cut(age, 10)(61.4,67.6] 43.21103 5.080990 8.504452 2.839593e-17
## cut(age, 10)(67.6,73.8] 23.27101 7.795645 2.985129 2.857559e-03
## cut(age, 10)(73.8,80.1] 21.21671 11.500230 1.844895 6.515172e-02
table(cut(age,10))
##
## (17.9,24.2] (24.2,30.4] (30.4,36.6] (36.6,42.8] (42.8,49] (49,55.2]
## 175 347 445 542 640 441
## (55.2,61.4] (61.4,67.6] (67.6,73.8] (73.8,80.1]
## 270 96 31 13
Cross-validation chose 10 as the optimal number of cuts.
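The step-function fit itself can be plotted as well; a minimal sketch, reusing age.grid and agelims from part (a). It relies on age.grid spanning the same range as age, so the breaks cut() computes on the new data match the training breaks:
# Sketch: plot the 10-cut step-function fit with +/- 2 SE bands
step.fit <- lm(wage ~ cut(age, 10), data = Wage)
step.preds <- predict(step.fit, newdata = list(age = age.grid), se = TRUE)
plot(age, wage, xlim = agelims, cex = .7, col = "lightblue",
     main = "Step Function with 10 Cuts")
lines(age.grid, step.preds$fit, lwd = 2, col = "red")
matlines(age.grid,
         cbind(step.preds$fit + 2 * step.preds$se.fit,
               step.preds$fit - 2 * step.preds$se.fit),
         lwd = 1, lty = 3, col = "darkgray")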
##10. This question relates to the College data set.##
#(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.#
detach(Wage)
attach(College)
dim(College)
## [1] 777 18
names(College)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
set.seed(4)
train <- sample(nrow(College), size = 0.7 * nrow(College))
TRAIN <- College[train, ]
TEST <- College[-train, ]
regfit.fwd <- regsubsets(Outstate ~ ., data = TRAIN, nvmax = 18, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = TRAIN, nvmax = 18, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" "*" "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" " "
## 9 ( 1 ) " " "*" " " "*" " " "*" " "
## 10 ( 1 ) " " "*" " " "*" " " "*" " "
## 11 ( 1 ) " " "*" " " "*" " " "*" " "
## 12 ( 1 ) " " "*" " " "*" " " "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
set.seed(4)
regfit.best <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17)
test.mat <- model.matrix(Outstate ~ ., data = College[-train, ])
val.errors <- rep(NA, 17)
for (i in 1:17) {
  coefi <- coef(regfit.best, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((College$Outstate[-train] - pred)^2)
}
val.errors
## [1] 8814713 7016592 5179712 4650402 4370643 3993738 3937748 4151396 3945736
## [10] 3772875 3714789 3680013 3654424 3646198 3665541 3667384 3666294
min(val.errors)
## [1] 3646198
which.min(val.errors)
## [1] 14
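A quick plot of the validation errors (a sketch using the val.errors vector above) makes the minimum at 14 variables easy to see:
# Sketch: validation MSE against model size, minimum highlighted
plot(val.errors, type = "b", xlab = "Number of Variables", ylab = "Validation MSE")
points(which.min(val.errors), min(val.errors), col = "red", pch = 16, cex = 2)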
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
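As a check (a sketch), the helper should reproduce the 14-variable validation error computed manually above:
# Sketch: should match val.errors[14], i.e. 3646198
pred14 <- predict.regsubsets(regfit.best, College[-train, ], id = 14)
mean((College$Outstate[-train] - pred14)^2)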
regfit.best <- regsubsets(Outstate ~ ., data = College, nvmax = 17)
coef(regfit.best, 14)
## (Intercept) PrivateYes Apps Accept Enroll
## -1.817040e+03 2.256946e+03 -2.999022e-01 8.023519e-01 -5.372545e-01
## Top10perc F.Undergrad Room.Board Personal PhD
## 2.365529e+01 -9.569936e-02 8.741819e-01 -2.478418e-01 1.269506e+01
## Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 2.297296e+01 -4.700560e+01 4.195006e+01 2.003912e-01 2.383197e+01
The validation-set error is minimized by the model containing 14 variables, so I select that model.
#(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.#
gam.fit <- gam(Outstate ~ Private + s(Apps, 4) + s(Accept, 4) + s(Enroll, 4) +
                 s(Top10perc, 4) + s(Expend, 4) + s(Grad.Rate, 4) +
                 s(F.Undergrad, 4) + s(Room.Board, 4) + s(Personal, 4) +
                 s(PhD, 4) + s(Terminal, 4) + s(S.F.Ratio, 4) +
                 s(perc.alumni, 4), data = College, subset = train)
par(mfrow = c(2, 3))
plot(gam.fit, se = TRUE, col = "green")
Private schools charge higher out-of-state tuition. Apps, Enroll, F.Undergrad, and Personal are negatively related to Outstate; Accept, Expend, Grad.Rate, Room.Board, Terminal, S.F.Ratio, and perc.alumni are positively related; PhD and Top10perc show almost no effect.
#(c) Evaluate the model obtained on the test set, and explain the results obtained.#
pred <- predict(gam.fit, TEST)
error.rate <- mean((TEST$Outstate - pred)^2)
cat('The GAM model MSE is: ', error.rate)
## The GAM model MSE is: 3637684
r2 <- 1 - error.rate / mean((TEST$Outstate - mean(TEST$Outstate))^2)
cat('The R2 of the GAM model is: ', r2)
## The R2 of the GAM model is: 0.7800168
An R2 of 0.78 means the GAM explains roughly 78% of the variance in out-of-state tuition on the test set using these 14 predictors.
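For context, a minimal sketch (not part of the original analysis) fits an ordinary linear model on the same 14 predictors; its test MSE gives a baseline against which the GAM's 3637684 can be judged:
# Sketch: OLS baseline on the same predictors, for comparison with the GAM
lm.fit <- lm(Outstate ~ Private + Apps + Accept + Enroll + Top10perc +
               Expend + Grad.Rate + F.Undergrad + Room.Board + Personal +
               PhD + Terminal + S.F.Ratio + perc.alumni,
             data = College, subset = train)
mean((TEST$Outstate - predict(lm.fit, TEST))^2)  # linear-model test MSE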
#(d) For which variables, if any, is there evidence of a non-linear relationship with the response?#
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Apps, 4) + s(Accept, 4) +
## s(Enroll, 4) + s(Top10perc, 4) + s(Expend, 4) + s(Grad.Rate,
## 4) + s(F.Undergrad, 4) + s(Room.Board, 4) + s(Personal, 4) +
## s(PhD, 4) + s(Terminal, 4) + s(S.F.Ratio, 4) + s(perc.alumni,
## 4), data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6234.202 -1015.739 8.407 1102.396 6455.232
##
## (Dispersion Parameter for gaussian family taken to be 3197483)
##
## Null Deviance: 8687534605 on 542 degrees of freedom
## Residual Deviance: 1563573009 on 489.0012 degrees of freedom
## AIC: 9727.075
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2306077770 2306077770 721.2165 < 2.2e-16 ***
## s(Apps, 4) 1 1029107801 1029107801 321.8493 < 2.2e-16 ***
## s(Accept, 4) 1 107542360 107542360 33.6334 1.196e-08 ***
## s(Enroll, 4) 1 213208684 213208684 66.6802 2.739e-15 ***
## s(Top10perc, 4) 1 637340889 637340889 199.3258 < 2.2e-16 ***
## s(Expend, 4) 1 1049900129 1049900129 328.3520 < 2.2e-16 ***
## s(Grad.Rate, 4) 1 224152498 224152498 70.1028 5.974e-16 ***
## s(F.Undergrad, 4) 1 102593 102593 0.0321 0.85791
## s(Room.Board, 4) 1 150920615 150920615 47.1998 1.957e-11 ***
## s(Personal, 4) 1 14325362 14325362 4.4802 0.03479 *
## s(PhD, 4) 1 10521975 10521975 3.2907 0.07029 .
## s(Terminal, 4) 1 4220249 4220249 1.3199 0.25118
## s(S.F.Ratio, 4) 1 11287521 11287521 3.5301 0.06086 .
## s(perc.alumni, 4) 1 77510613 77510613 24.2411 1.164e-06 ***
## Residuals 489 1563573009 3197483
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Apps, 4) 3 2.0923 0.100349
## s(Accept, 4) 3 11.7818 1.831e-07 ***
## s(Enroll, 4) 3 4.4807 0.004079 **
## s(Top10perc, 4) 3 0.4229 0.736678
## s(Expend, 4) 3 26.7252 4.441e-16 ***
## s(Grad.Rate, 4) 3 1.8080 0.144775
## s(F.Undergrad, 4) 3 0.9602 0.411276
## s(Room.Board, 4) 3 1.0445 0.372495
## s(Personal, 4) 3 2.6775 0.046523 *
## s(PhD, 4) 3 2.8716 0.035938 *
## s(Terminal, 4) 3 1.6823 0.169893
## s(S.F.Ratio, 4) 3 2.7255 0.043653 *
## s(perc.alumni, 4) 3 1.3905 0.244905
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Accept, Enroll, Expend, Personal, PhD, and S.F.Ratio show evidence of a non-linear relationship with the response: their p-values in the "Anova for Nonparametric Effects" table are below 0.05, with Accept and Expend by far the strongest.
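A natural follow-up (a sketch, not part of the original solution) is to keep smooth terms only for those six variables, enter the rest linearly, and check whether the test MSE holds up:
# Sketch: smooths only where the nonparametric ANOVA flagged non-linearity
gam.fit2 <- gam(Outstate ~ Private + Apps + s(Accept, 4) + s(Enroll, 4) +
                  Top10perc + s(Expend, 4) + Grad.Rate + F.Undergrad +
                  Room.Board + s(Personal, 4) + s(PhD, 4) + Terminal +
                  s(S.F.Ratio, 4) + perc.alumni,
                data = College, subset = train)
mean((TEST$Outstate - predict(gam.fit2, TEST))^2)  # compare with 3637684 above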