library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Loading required package: lattice
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
##6. In this exercise, you will further analyze the Wage data set considered ##throughout this chapter. #(a) Perform polynomial regression to predict wage using age. Use #cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to #the results of hypothesis testing using ANOVA? Make a plot of #the resulting polynomial fit to the data.
Look at the Wage dataset
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
create the polynomial regression model with k-Fold Cross-Validation (k=10)
?poly
## starting httpd help server ... done
set.seed(22)
cv.error.10 <- rep(0,10) ##from 5.3.3 in the book
for (i in 1:10) {
poly.fit <- glm(wage ~ poly(age,i), data = Wage)
cv.error.10[i] <- cv.glm(Wage, poly.fit, K=10)$delta[1]
}
cv.error.10
## [1] 1676.802 1600.861 1596.619 1593.603 1594.985 1595.317 1593.254 1594.866
## [9] 1595.676 1593.965
range(cv.error.10)
## [1] 1593.254 1676.802
which.min(cv.error.10)
## [1] 7
##interesting that 7 is the lowest degree, book and RLab only went up to 4. However, based on the coef summary below 7 degrees is not significant P-value > 0.05
Look at the model: P-values indicate that poly 3 is the model of best fit (significant)
coef(summary(poly.fit)) ##each column is linear combination of age, age squared, age cubed and age to the 10th
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7283319 153.36910451 0.000000e+00
## poly(age, i)1 447.06785 39.8923800 11.20684835 1.390296e-28
## poly(age, i)2 -478.31581 39.8923800 -11.99015466 2.187330e-32
## poly(age, i)3 125.52169 39.8923800 3.14650783 1.668583e-03
## poly(age, i)4 -77.91118 39.8923800 -1.95303416 5.090874e-02
## poly(age, i)5 -35.81289 39.8923800 -0.89773759 3.693978e-01
## poly(age, i)6 62.70772 39.8923800 1.57192216 1.160744e-01
## poly(age, i)7 50.54979 39.8923800 1.26715403 2.051989e-01
## poly(age, i)8 -11.25473 39.8923800 -0.28212736 7.778654e-01
## poly(age, i)9 -83.69180 39.8923800 -2.09793947 3.599425e-02
## poly(age, i)10 1.62405 39.8923800 0.04071077 9.675292e-01
Anova: null hypothesis simpler model is better than the more complex model For example: basic linear model is better than squared model If the model is significant (p-value less than 0.05) reject null hypothesis and accept the alternate hypothesis more complex model is better
Anova indicates poly 3 is the best fit, poly 4 p-value is slightly > 0.05. I used glm function above, but to get Anova results I gad to switch to lm function. GLM would not produce F and P-value statistics with the Anova function.
lm.fit <- lm(wage ~ age, data = Wage)
poly.fit2 <-lm(wage ~ poly(age, 2), data = Wage)
poly.fit3 <-lm(wage ~ poly(age, 3), data = Wage)
poly.fit4 <-lm(wage ~ poly(age, 4), data = Wage)
anova(lm.fit,poly.fit2, poly.fit3, poly.fit4)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6025 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8894 0.001679 **
## 4 2995 4771604 1 6070 3.8101 0.051039 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Plotting: (code from class R lab) Plotting the poly degree 3 model
agelims=range(Wage$age)
age.grid=seq(from=agelims[1], to=agelims[2])
preds=predict(poly.fit3,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(Wage$age,Wage$wage,xlim=agelims,cex=.5,col='darkgrey')
title('Polynomial')
lines(age.grid,preds$fit,lwd=2,col='darkblue')
matlines(age.grid,se.bands,lwd = 1, col='lightblue',lty = 3)
##6. #(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of #the fit obtained
table(cut(Wage$age,4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
Code used from ch 5 of book, however the iteration (i) for CV wouldn’t work in the cut function (error: invalid number of intervals) From stackoverflow a solution was posted that created a tmp variable that stored the cut intervals and then feed the tmp variable to the glm function. It worked, but would rather have cv work for the step function like it works for the splines (CV = TRUE)
set.seed(22)
cv.error.step <- rep(0,10)
for (i in 2:11) {
Wage$tmp <- cut(Wage$age,i)
step.fit = glm(wage~tmp,data = Wage)
cv.error.step[i] <- cv.glm(Wage, step.fit, K=10)$delta[1]
}
cv.error.step
## [1] 0.000 1733.637 1682.881 1637.804 1630.894 1623.387 1610.311 1600.769
## [9] 1609.561 1609.255 1598.349
It appears 10 cuts is optimal, slightly better than 7 cuts
plot(1:10, cv.error.step[-1], xlab = "Cuts", ylab = "MSE", type = "l")
d.min <- which.min(cv.error.step[-1])
points(d.min, cv.error.step[d.min+1], col="red", cex=2, pch =20) #x is 1:10, y is 0-11, 0 has to be taken out
Plotting: Plotting the step.fit model with 10 cuts
step.fit10 <- lm(wage ~ cut(age, 10), data = Wage)
agelims=range(Wage$age)
age.grid=seq(from=agelims[1], to=agelims[2])
preds=predict(step.fit10,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(Wage$age,Wage$wage,xlim=agelims,cex=.5,col='darkgrey')
title('Step Function')
lines(age.grid,preds$fit,lwd=2,col='darkblue')
matlines(age.grid,se.bands,lwd = 1, col='lightblue',lty = 3)
Look at the dataset Response: Outstate
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
summary(College$Outstate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2340 7320 9990 10441 12925 21700
set.seed(22)
train=sample(c(TRUE,FALSE), nrow(College),rep=TRUE)
test=(!train)
college.train = College[train,]
college.test = College[!train,]
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
out.fwd <- regsubsets(Outstate~., data = college.train, nvmax =18, method ="forward")
out.summary <- summary(out.fwd)
#out.summary$rsq
Plot RSS for forward stepwise (the smaller the RSS the better the model) Plot AdjR^2 for forward stepwise (the higher the adjusted R^2 the better)
par(mfrow = c(2,2))
plot(out.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
minrss <-which.min(out.summary$rss) ##17 variables
points(minrss, out.summary$rss[minrss], col='red', cex = 2, pch = 20)
plot(out.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
maxr2 <-which.max(out.summary$adjr2) ##10 variables
points(maxr2, out.summary$adjr2[maxr2], col='red', cex = 2, pch = 20)
Plot CP for forward stepwise (the smaller the CP the better the model) Plot BIC for forward stepwise (the smaller the BIC the better)
par(mfrow = c(2,2))
plot(out.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
mincp <-which.min(out.summary$cp) ##10 variables
points(mincp, out.summary$cp[mincp], col='red', cex = 2, pch = 20)
plot(out.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
minbic <-which.min(out.summary$bic) ##8 variables
points(minbic, out.summary$bic[minbic], col='red', cex = 2, pch = 20)
Use built-in plot command for regsubsets() for the model with least variables (BIC criteria)
plot(out.fwd, scale = 'bic') #8 variables selected based on the BIC plot below (top row with black cell is selected variable)
Coefficients of the BIC model using forward selection
coef(out.fwd,8)
## (Intercept) PrivateYes Accept F.Undergrad Room.Board
## -3758.7610386 3098.5057659 0.4809318 -0.2031804 0.7867957
## Terminal perc.alumni Expend Grad.Rate
## 43.2484308 67.4052790 0.1976709 22.4520955
library(gam)
## Warning: package 'gam' was built under R version 4.1.2
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.2
## Loaded gam 1.20
gam.out <- gam(Outstate ~ Private + s(Accept) + s(F.Undergrad) + s(Room.Board) + s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = college.train)
Plot the gam model Plot results: Private is a categorical variable (binary yes/no): out of state is higher for private = yes Appearance of linearity: accept (positive), undergrad (negative), room.board (positive), perc.alumni (positive) Appearance of non-linearity: terminal, expend, grad.rate
par(mfrow=c(1,3))
plot.Gam(gam.out, se = TRUE, col = 'red')
preds <- predict(gam.out, newdata = college.test)
mean(preds - college.test$Outstate)^2
## [1] 6918.377
comparing the MSE to a lm model with stepwise forward variables
out.lm <- lm(Outstate ~ Private + Accept + F.Undergrad + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate, data = college.train)
preds.lm <- predict(out.lm, newdata = college.test)
mean(preds.lm - college.test$Outstate)^2
## [1] 14552.76
GAM Model MSE: 6918 LM model MSE: 14552 GAM stepwise forward model MSE is far less 6918 compared to lm model MSE 14552 using the stepwise forward variables (same variables). GAM with splines on the variable produced better model results.