library(ISLR)
library(boot)
attach(Wage)
str(Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

6A.

Ten-fold cross-validation over polynomial degrees 1 through 10 selects d = 9 as the optimal degree; the computation of the CV errors (delta) is sketched below.
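
The delta vector used in the plot is never defined in the listing. A minimal sketch of the computation it presumably came from, assuming delta holds the 10-fold CV estimate of test MSE for each polynomial degree:

set.seed(1)
delta <- rep(NA, 10)
for (i in 1:10) {
  fit <- glm(wage ~ poly(age, i), data = Wage)      # polynomial of degree i
  delta[i] <- cv.glm(Wage, fit, K = 10)$delta[1]    # 10-fold CV test MSE
}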

plot(1:10, delta, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(delta)
points(d.min, delta[d.min], col = "red", cex = 2, pch = 20)

plot(wage ~ age, data = Wage, col = "darkgrey")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly(age, d.min), data = Wage)  # refit at the CV-selected degree
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)

6B.

Cross-validation shows that the test MSE is minimized when age is cut into 8 intervals.

set.seed(1)
cvs <- rep(NA, 10)  # index 1 stays NA: a step function needs at least 2 intervals
for (i in 2:10) {
    Wage$age.cut <- cut(Wage$age, i)
    fit <- glm(wage ~ age.cut, data = Wage)
    cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}

plot(2:10, cvs[-1], xlab = "Cuts", ylab = "Test MSE", type = "l")
d.min <- which.min(cvs)
points(d.min, cvs[d.min], col = "red", cex = 2, pch = 20)

plot(wage ~ age, data = Wage, col = "grey")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)

10A.

Forward stepwise selection with 10-fold cross-validation and the one-standard-error rule selects a six-predictor model: Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate.

attach(College)

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
library(leaps)
str(College)
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...
set.seed(1)
train <- createDataPartition(Outstate, p = 0.8, list = FALSE, times = 1)  # stratified 80/20 split
College.train <- College[train, ]
College.test <- College[-train, ]
# 10-fold CV; oneSE selects the simplest model within one SE of the best RMSE
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 1, selectionFunction = "oneSE")

set.seed(1)

forward.model <- train(Outstate ~ ., data = College.train, method = "leapForward",
                       maximize = FALSE, trControl = ctrl,
                       tuneGrid = data.frame(nvmax = 1:17))

forward.model
## Linear Regression with Forward Selection 
## 
## 623 samples
##  17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 560, 560, 562, 560, 561, 561, ... 
## Resampling results across tuning parameters:
## 
##   nvmax  RMSE      Rsquared   MAE     
##    1     3169.576  0.4229785  2473.480
##    2     2673.444  0.5769138  2027.409
##    3     2315.210  0.6760545  1813.115
##    4     2119.253  0.7250021  1666.489
##    5     2051.198  0.7413953  1615.754
##    6     2011.108  0.7507054  1583.030
##    7     2032.862  0.7457135  1601.087
##    8     2054.518  0.7404374  1611.457
##    9     2054.324  0.7402399  1614.336
##   10     2033.989  0.7455431  1608.504
##   11     2036.540  0.7447230  1608.870
##   12     2033.428  0.7453446  1598.234
##   13     2022.145  0.7492188  1592.980
##   14     1999.300  0.7544045  1572.820
##   15     1996.559  0.7547738  1575.071
##   16     1994.824  0.7550069  1575.145
##   17     1995.677  0.7548594  1575.779
## 
## RMSE was used to select the optimal model using  the one SE rule.
## The final value used for the model was nvmax = 6.
coef(forward.model$finalModel, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3748.7268770  2885.1884620     0.9448039    41.0276564    51.9906135 
##        Expend     Grad.Rate 
##     0.1997015    28.4932592
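
To see how the one-standard-error rule arrives at nvmax = 6, it can be approximated by hand from the resampling table. A rough sketch, not caret's exact oneSE() code; it assumes the RMSESD column holds the fold-to-fold standard deviation across the 10 resamples:

res <- forward.model$results
best <- which.min(res$RMSE)                                # nvmax = 16 has the lowest RMSE
threshold <- res$RMSE[best] + res$RMSESD[best] / sqrt(10)  # one SE above the best RMSE
min(res$nvmax[res$RMSE <= threshold])                      # smallest model within one SE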

10B.

Out-of-state tuition appears to be higher at private colleges. It also rises with room-and-board costs, the percentage of faculty holding PhDs, the percentage of alumni who donate, instructional expenditure per student, and the graduation rate; all six fitted coefficients above are positive. The private/public gap is illustrated below.
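
A minimal visual check of the private/public difference (this plot is not part of the original listing):

boxplot(Outstate ~ Private, data = College,
        xlab = "Private", ylab = "Out-of-state tuition",
        main = "Out-of-state tuition by college type")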

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.1
# GAM with smoothing splines (gam's s(), default df = 4) on each quantitative predictor
gam.model <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                 s(Expend) + s(Grad.Rate), data = College.train)

par(mfrow = c(2,3))
plot(gam.model, se = T, col = 'blue')

10C.

The GAM explains roughly 79% of the variance in out-of-state tuition on the held-out test set (test R-squared of about 0.79).

gam.pred <- predict(gam.model, College.test)
gam.res <- mean((College.test$Outstate - gam.pred)^2)   # test MSE
gam.tot <- mean((College.test$Outstate - mean(College.test$Outstate))^2)  # variance of the response
test.r2 <- 1 - gam.res/gam.tot  # test R-squared
test.r2
## [1] 0.7885879

10D.

In the ANOVA for nonparametric effects, only Expend has a highly significant p-value (< 2e-16), so there is strong evidence of a non-linear relationship between Expend and Outstate. Grad.Rate is marginal (p ≈ 0.068), and the remaining smooth terms show no evidence of non-linearity. The GAM plots above show the same strongly non-linear shape for Expend; a direct F-test of that term is sketched after the summary output.

summary(gam.model)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate), data = College.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7238.47 -1185.55    85.31  1280.35  4604.05 
## 
## (Dispersion Parameter for gaussian family taken to be 3411970)
## 
##     Null Deviance: 9778743994 on 622 degrees of freedom
## Residual Deviance: 2050592947 on 600.9997 degrees of freedom
## AIC: 11163.26 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2634772462 2634772462  772.21 < 2.2e-16 ***
## s(Room.Board)    1 1955517765 1955517765  573.13 < 2.2e-16 ***
## s(PhD)           1  756514727  756514727  221.72 < 2.2e-16 ***
## s(perc.alumni)   1  386996490  386996490  113.42 < 2.2e-16 ***
## s(Expend)        1  681419771  681419771  199.71 < 2.2e-16 ***
## s(Grad.Rate)     1  103860089  103860089   30.44 5.123e-08 ***
## Residuals      601 2050592947    3411970                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F   Pr(F)    
## (Intercept)                               
## Private                                   
## s(Room.Board)        3  1.9582 0.11910    
## s(PhD)               3  1.3924 0.24407    
## s(perc.alumni)       3  1.1768 0.31779    
## s(Expend)            3 27.3661 < 2e-16 ***
## s(Grad.Rate)         3  2.3871 0.06803 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
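
As a direct check on the non-linearity of Expend, one can refit the GAM with a linear Expend term and compare the two models with an F-test. A minimal sketch (gam.lin is a new name, not part of the original analysis):

# Same model as gam.model, but with Expend entering linearly
gam.lin <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
               Expend + s(Grad.Rate), data = College.train)
anova(gam.lin, gam.model, test = "F")  # significant F => the smooth term for Expend is needed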