library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Loading required package: lattice
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma

##6. In this exercise, you will further analyze the Wage data set considered ##throughout this chapter. #(a) Perform polynomial regression to predict wage using age. Use #cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to #the results of hypothesis testing using ANOVA? Make a plot of #the resulting polynomial fit to the data.

Look at the Wage dataset

str(Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

create the polynomial regression model with k-Fold Cross-Validation (k=10)

?poly
## starting httpd help server ... done
set.seed(22)
cv.error.10 <- rep(0,10) ##from 5.3.3 in the book
for (i in 1:10) {
  poly.fit <- glm(wage ~ poly(age,i), data = Wage)
  cv.error.10[i] <- cv.glm(Wage, poly.fit, K=10)$delta[1]
                }
cv.error.10 
##  [1] 1676.802 1600.861 1596.619 1593.603 1594.985 1595.317 1593.254 1594.866
##  [9] 1595.676 1593.965
range(cv.error.10) 
## [1] 1593.254 1676.802
which.min(cv.error.10)  
## [1] 7
##interesting that 7 is the lowest degree, book and RLab only went up to 4.  However, based on the coef summary below 7 degrees is not significant P-value > 0.05

Look at the model: P-values indicate that poly 3 is the model of best fit (significant)

coef(summary(poly.fit)) ##each column is linear combination of age, age squared, age cubed and age to the 10th
##                  Estimate Std. Error      t value     Pr(>|t|)
## (Intercept)     111.70361  0.7283319 153.36910451 0.000000e+00
## poly(age, i)1   447.06785 39.8923800  11.20684835 1.390296e-28
## poly(age, i)2  -478.31581 39.8923800 -11.99015466 2.187330e-32
## poly(age, i)3   125.52169 39.8923800   3.14650783 1.668583e-03
## poly(age, i)4   -77.91118 39.8923800  -1.95303416 5.090874e-02
## poly(age, i)5   -35.81289 39.8923800  -0.89773759 3.693978e-01
## poly(age, i)6    62.70772 39.8923800   1.57192216 1.160744e-01
## poly(age, i)7    50.54979 39.8923800   1.26715403 2.051989e-01
## poly(age, i)8   -11.25473 39.8923800  -0.28212736 7.778654e-01
## poly(age, i)9   -83.69180 39.8923800  -2.09793947 3.599425e-02
## poly(age, i)10    1.62405 39.8923800   0.04071077 9.675292e-01

Anova: null hypothesis simpler model is better than the more complex model For example: basic linear model is better than squared model If the model is significant (p-value less than 0.05) reject null hypothesis and accept the alternate hypothesis more complex model is better

Anova indicates poly 3 is the best fit, poly 4 p-value is slightly > 0.05. I used glm function above, but to get Anova results I gad to switch to lm function. GLM would not produce F and P-value statistics with the Anova function.

lm.fit <- lm(wage ~ age, data = Wage)
poly.fit2 <-lm(wage ~ poly(age, 2), data = Wage)
poly.fit3 <-lm(wage ~ poly(age, 3), data = Wage)
poly.fit4 <-lm(wage ~ poly(age, 4), data = Wage)

anova(lm.fit,poly.fit2, poly.fit3, poly.fit4)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6025 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8894  0.001679 ** 
## 4   2995 4771604  1      6070   3.8101  0.051039 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plotting: (code from class R lab) Plotting the poly degree 3 model

agelims=range(Wage$age)
age.grid=seq(from=agelims[1], to=agelims[2])
preds=predict(poly.fit3,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(Wage$age,Wage$wage,xlim=agelims,cex=.5,col='darkgrey')
title('Polynomial')
lines(age.grid,preds$fit,lwd=2,col='darkblue')
matlines(age.grid,se.bands,lwd = 1, col='lightblue',lty = 3)

##6. #(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of #the fit obtained

table(cut(Wage$age,4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72

Code used from ch 5 of book, however the iteration (i) for CV wouldn’t work in the cut function (error: invalid number of intervals) From stackoverflow a solution was posted that created a tmp variable that stored the cut intervals and then feed the tmp variable to the glm function. It worked, but would rather have cv work for the step function like it works for the splines (CV = TRUE)

set.seed(22)
cv.error.step <- rep(0,10)
for (i in 2:11) {
Wage$tmp <- cut(Wage$age,i)
step.fit = glm(wage~tmp,data = Wage)
cv.error.step[i] <- cv.glm(Wage, step.fit, K=10)$delta[1]
}
cv.error.step
##  [1]    0.000 1733.637 1682.881 1637.804 1630.894 1623.387 1610.311 1600.769
##  [9] 1609.561 1609.255 1598.349

It appears 10 cuts is optimal, slightly better than 7 cuts

plot(1:10, cv.error.step[-1], xlab = "Cuts", ylab = "MSE", type = "l")
d.min <- which.min(cv.error.step[-1])
points(d.min, cv.error.step[d.min+1], col="red", cex=2, pch =20) #x is 1:10, y is 0-11, 0 has to be taken out

Plotting: Plotting the step.fit model with 10 cuts

step.fit10 <- lm(wage ~ cut(age, 10), data = Wage)
agelims=range(Wage$age)
age.grid=seq(from=agelims[1], to=agelims[2])
preds=predict(step.fit10,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(Wage$age,Wage$wage,xlim=agelims,cex=.5,col='darkgrey')
title('Step Function')
lines(age.grid,preds$fit,lwd=2,col='darkblue')
matlines(age.grid,se.bands,lwd = 1, col='lightblue',lty = 3)

  1. This question relates to the College data set.
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
  2. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
  3. Evaluate the model obtained on the test set, and explain the results obtained.
  4. For which variables, if any, is there evidence of a non-linear relationship with the response?

Look at the dataset Response: Outstate

str(College)
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...
summary(College$Outstate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2340    7320    9990   10441   12925   21700
  1. Split the data set into a training set and a test set.
set.seed(22)
train=sample(c(TRUE,FALSE), nrow(College),rep=TRUE)
test=(!train)
college.train = College[train,]
college.test = College[!train,]
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
out.fwd <- regsubsets(Outstate~., data = college.train, nvmax =18, method ="forward")
out.summary <- summary(out.fwd)
#out.summary$rsq

Plot RSS for forward stepwise (the smaller the RSS the better the model) Plot AdjR^2 for forward stepwise (the higher the adjusted R^2 the better)

par(mfrow = c(2,2))
plot(out.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
minrss <-which.min(out.summary$rss)  ##17 variables
points(minrss, out.summary$rss[minrss], col='red', cex = 2, pch = 20)

plot(out.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
maxr2 <-which.max(out.summary$adjr2)  ##10 variables
points(maxr2, out.summary$adjr2[maxr2], col='red', cex = 2, pch = 20)

Plot CP for forward stepwise (the smaller the CP the better the model) Plot BIC for forward stepwise (the smaller the BIC the better)

par(mfrow = c(2,2))
plot(out.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
mincp <-which.min(out.summary$cp)  ##10 variables
points(mincp, out.summary$cp[mincp], col='red', cex = 2, pch = 20)

plot(out.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
minbic <-which.min(out.summary$bic)  ##8 variables
points(minbic, out.summary$bic[minbic], col='red', cex = 2, pch = 20)

Use built-in plot command for regsubsets() for the model with least variables (BIC criteria)

plot(out.fwd, scale = 'bic')  #8 variables selected based on the BIC plot below (top row with black cell is selected variable)

Coefficients of the BIC model using forward selection

coef(out.fwd,8)
##   (Intercept)    PrivateYes        Accept   F.Undergrad    Room.Board 
## -3758.7610386  3098.5057659     0.4809318    -0.2031804     0.7867957 
##      Terminal   perc.alumni        Expend     Grad.Rate 
##    43.2484308    67.4052790     0.1976709    22.4520955
  1. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Warning: package 'gam' was built under R version 4.1.2
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.2
## Loaded gam 1.20
gam.out <- gam(Outstate ~ Private + s(Accept) + s(F.Undergrad) + s(Room.Board) + s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = college.train)

Plot the gam model Plot results: Private is a categorical variable (binary yes/no): out of state is higher for private = yes Appearance of linearity: accept (positive), undergrad (negative), room.board (positive), perc.alumni (positive) Appearance of non-linearity: terminal, expend, grad.rate

par(mfrow=c(1,3))
plot.Gam(gam.out, se = TRUE, col = 'red')

  1. Evaluate the model obtained on the test set, and explain the results obtained.
preds <- predict(gam.out, newdata = college.test)
mean(preds - college.test$Outstate)^2
## [1] 6918.377

comparing the MSE to a lm model with stepwise forward variables

out.lm <- lm(Outstate ~ Private + Accept + F.Undergrad + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate, data = college.train)
preds.lm <- predict(out.lm, newdata = college.test)
mean(preds.lm - college.test$Outstate)^2
## [1] 14552.76

GAM Model MSE: 6918 LM model MSE: 14552 GAM stepwise forward model MSE is far less 6918 compared to lm model MSE 14552 using the stepwise forward variables (same variables). GAM with splines on the variable produced better model results.

  1. For which variables, if any, is there evidence of a non-linear relationship with the response? Appearance of linearity: accept (positive), undergrad (negative), room.board (positive), perc.alumni (positive) Appearance of non-linearity: terminal, expend, grad.rate