library(ISLR)
library(boot)
attach(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
## 6A.
Cross-validation selects d = 9 as the degree that minimizes the estimated test MSE, although the curve is nearly flat beyond d = 3, so the cubic fit plotted below captures most of the relationship.
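The loop that produces the vector of CV errors delta is omitted above; a minimal sketch using cv.glm() from boot, assuming 10-fold CV and the same seed used elsewhere in this document:
set.seed(1)
delta <- rep(NA, 10)
# fit polynomials of degree 1..10 and record the 10-fold CV estimate of test MSE
for (i in 1:10) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  delta[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}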
plot(1:10, delta, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(delta)
points(d.min, delta[d.min], col = "red", cex = 2, pch = 20)
plot(wage ~ age, data = Wage, col = "darkgrey")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
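For comparison (not in the original), the degree-9 fit selected by CV can be overlaid on the same plot:
fit9 <- lm(wage ~ poly(age, 9), data = Wage)
preds9 <- predict(fit9, newdata = list(age = age.grid))
lines(age.grid, preds9, col = "blue", lwd = 2, lty = 2)  # dashed line: CV-selected degree-9 fit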
## 6B.
The plot below shows that the CV-estimated test MSE is minimized when age is cut into 8 intervals.
set.seed(1)
cvs <- rep(NA, 10)  # NA rather than 0, so the unused first slot cannot win which.min()
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, cvs[-1], xlab = "Cuts", ylab = "Test MSE", type = "l")
d.min <- which.min(cvs)
points(d.min, cvs[d.min], col = "red", cex = 2, pch = 20)
plot(wage ~ age, data = Wage, col = "grey")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)
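As a quick check (not in the original), the breakpoints of the 8-interval step function can be inspected directly:
table(cut(Wage$age, 8))  # observation counts in each of the 8 age intervals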
## 10A
attach(College)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(leaps)
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
set.seed(1)
train <- createDataPartition(Outstate, p = 0.8, list = FALSE, times = 1)
College.train <- College[train,]
College.test <- College[-train,]
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 1, selectionFunction = "oneSE")
set.seed(1)
forward.model <- train(Outstate ~ ., data = College.train, method = "leapForward",
                       maximize = FALSE, trControl = ctrl, tuneGrid = data.frame(nvmax = 1:17))
forward.model
## Linear Regression with Forward Selection
##
## 623 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 560, 560, 562, 560, 561, 561, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 1 3169.576 0.4229785 2473.480
## 2 2673.444 0.5769138 2027.409
## 3 2315.210 0.6760545 1813.115
## 4 2119.253 0.7250021 1666.489
## 5 2051.198 0.7413953 1615.754
## 6 2011.108 0.7507054 1583.030
## 7 2032.862 0.7457135 1601.087
## 8 2054.518 0.7404374 1611.457
## 9 2054.324 0.7402399 1614.336
## 10 2033.989 0.7455431 1608.504
## 11 2036.540 0.7447230 1608.870
## 12 2033.428 0.7453446 1598.234
## 13 2022.145 0.7492188 1592.980
## 14 1999.300 0.7544045 1572.820
## 15 1996.559 0.7547738 1575.071
## 16 1994.824 0.7550069 1575.145
## 17 1995.677 0.7548594 1575.779
##
## RMSE was used to select the optimal model using the one SE rule.
## The final value used for the model was nvmax = 6.
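The one-SE rule chooses the most parsimonious model whose RMSE is within one standard error of the minimum, which is why nvmax = 6 is preferred over nvmax = 16 despite the latter's slightly lower RMSE. As an illustration (not in the original), caret's oneSE() reproduces the choice from the resampling results:
oneSE(forward.model$results, metric = "RMSE", num = 10, maximize = FALSE)  # row index of the selected nvmax; num = number of resamples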
coef(forward.model$finalModel, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3748.7268770 2885.1884620 0.9448039 41.0276564 51.9906135
## Expend Grad.Rate
## 0.1997015 28.4932592
Out-of-state tuition appears to be higher for private colleges. Tuition also rises with room and board costs, the percentage of faculty holding a PhD, the percentage of alumni who donate, instructional expenditure per student, and the graduation rate.
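As a baseline for the GAM fitted below (this check is not in the original), the forward-selected linear model can be scored on the held-out data:
lm.pred <- predict(forward.model, newdata = College.test)
mean((College.test$Outstate - lm.pred)^2)  # test MSE of the 6-predictor linear model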
## 10B.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.1
gam.model <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College.train)
par(mfrow = c(2,3))
plot(gam.model, se = TRUE, col = "blue")
## 10C.
The GAM explains about 79% of the variance in out-of-state tuition on the test set (test R-squared of roughly 0.79).
gam.pred <- predict(gam.model, College.test)
gam.res <- mean((College.test$Outstate - gam.pred)^2)  # residual (test) MSE
gam.tot <- mean((College.test$Outstate - mean(College.test$Outstate))^2)  # total variance
test.r2 <- 1 - gam.res / gam.tot
test.r2
## [1] 0.7885879
## 10D.
In the ANOVA for nonparametric effects (see the summary below), Expend is the only predictor with a clearly significant p-value (Grad.Rate is borderline at p = 0.068), so there is strong evidence of a non-linear relationship between Expend and Outstate. The GAM plots above show the same pattern.
summary(gam.model)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7238.47 -1185.55 85.31 1280.35 4604.05
##
## (Dispersion Parameter for gaussian family taken to be 3411970)
##
## Null Deviance: 9778743994 on 622 degrees of freedom
## Residual Deviance: 2050592947 on 600.9997 degrees of freedom
## AIC: 11163.26
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2634772462 2634772462 772.21 < 2.2e-16 ***
## s(Room.Board) 1 1955517765 1955517765 573.13 < 2.2e-16 ***
## s(PhD) 1 756514727 756514727 221.72 < 2.2e-16 ***
## s(perc.alumni) 1 386996490 386996490 113.42 < 2.2e-16 ***
## s(Expend) 1 681419771 681419771 199.71 < 2.2e-16 ***
## s(Grad.Rate) 1 103860089 103860089 30.44 5.123e-08 ***
## Residuals 601 2050592947 3411970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 1.9582 0.11910
## s(PhD) 3 1.3924 0.24407
## s(perc.alumni) 3 1.1768 0.31779
## s(Expend) 3 27.3661 < 2e-16 ***
## s(Grad.Rate) 3 2.3871 0.06803 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
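To corroborate the non-linearity of Expend, one could compare the fitted GAM against a nested model that enters Expend linearly (a sketch, not in the original; gam.lin is a hypothetical name):
gam.lin <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + Expend + s(Grad.Rate), data = College.train)
anova(gam.lin, gam.model, test = "F")  # F-test for the extra degrees of freedom of s(Expend)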