library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ISLR)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loaded gam 1.22-2
library(leaps)
set.seed(1)
# 15-fold CV error for polynomial fits of degree 1 through 15
cv.error <- rep(NA, 15)
for (i in 1:15) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, fit, K = 15)$delta[1]
}
plot(1:15, cv.error, type = "b",
     xlab = "Degree",
     ylab = "CV error (MSE)"
)
min.degree <- which.min(cv.error)
min.degree
## [1] 7
It appears that a degree of 7 gives us the lowest CV error, although the curve is nearly flat after the first few degrees.
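To see how flat the curve really is, here is a quick sketch that compares each degree's CV error to the minimum (it reuses the cv.error vector from above):
# ratio of each CV error to the smallest; values near 1 mean that degree
# performs essentially as well as the degree-7 minimum
round(cv.error / min(cv.error), 3)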
fit1 <- lm(wage ~ age, data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
fit6 <- lm(wage ~ poly(age, 6), data = Wage)
fit7 <- lm(wage ~ poly(age, 7), data = Wage)
anova(fit1,fit2,fit3,fit4,fit5,fit6,fit7)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6926 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8956 0.001673 **
## 4 2995 4771604 1 6070 3.8125 0.050966 .
## 5 2994 4770322 1 1283 0.8055 0.369516
## 6 2993 4766389 1 3932 2.4697 0.116165
## 7 2992 4763834 1 2555 1.6049 0.205311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I am going to use a poly(age, 4) model: the degree-4 term is borderline significant (p = .0509, essentially at the .05 cutoff), while the degree-5 through degree-7 terms are not. I am leaving out the 7th degree, as it would complicate the model with only nominal benefits.
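As a quick sanity check on that choice, here is a sketch comparing the degree-4 and degree-7 fitted curves over the range of ages (fit4 and fit7 are the models from above; age.grid is my name):
age.grid <- seq(min(Wage$age), max(Wage$age))
p4 <- predict(fit4, newdata = data.frame(age = age.grid))
p7 <- predict(fit7, newdata = data.frame(age = age.grid))
max(abs(p4 - p7))  # largest gap between the two fitted curves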
# scatterplot of wage against age with the degree-4 fit overlaid
plot(wage ~ age, data = Wage)
agelimits <- range(Wage$age)
agegrid <- seq(from = agelimits[1], to = agelimits[2])
polyfit <- lm(wage ~ poly(age, 4), data = Wage)
polypredict <- predict(polyfit, newdata = list(age = agegrid))
lines(agegrid, polypredict, col = 'blue', lwd = 2)
set.seed(1)
# 15-fold CV error for step functions of age with 2 through 15 cuts
kfold <- rep(NA, 15)
for (i in 2:15) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  kfold[i] <- cv.glm(Wage, fit, K = 15)$delta[1]
}
plot(2:15, kfold[-1], type = "b",
     xlab = "Number of cuts",
     ylab = "CV error (MSE)")
min.cuts <- which.min(kfold)
min.cuts
## [1] 8
It appears that 8 cuts give us the lowest CV error.
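For reference, these are the interval boundaries that cut() picks with 8 bins (a quick sketch):
levels(cut(Wage$age, 8))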
# refit with 8 cuts and overlay the step function on the data
plot(wage ~ age, data = Wage)
kfold.fit <- glm(wage ~ cut(age, 8), data = Wage)
# cut() is re-evaluated on the new data; the bins match the training bins
# only because agegrid spans the same range as Wage$age
kfold.predict <- predict(kfold.fit, data.frame(age = agegrid))
lines(agegrid, kfold.predict, col = 'blue', lwd = 2)
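To avoid relying on that coincidence, here is a sketch that fixes the breaks explicitly (brks, age.cut8, and safe.fit are my names):
brks <- seq(min(Wage$age), max(Wage$age), length.out = 9)  # 8 equal-width bins
Wage$age.cut8 <- cut(Wage$age, breaks = brks, include.lowest = TRUE)
safe.fit <- glm(wage ~ age.cut8, data = Wage)
safe.pred <- predict(safe.fit, data.frame(age.cut8 = cut(agegrid, breaks = brks, include.lowest = TRUE)))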
# 80/20 train/test split, stratified on the outcome by createDataPartition()
split <- 0.8
trainIndex <- createDataPartition(College$Outstate, p = split, list = FALSE)
Collegetrain <- College[trainIndex, ]
Collegetest <- College[-trainIndex, ]
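A quick check on the split (a sketch): createDataPartition() samples within quantile groups of the outcome, so both pieces should cover the full range of Outstate.
c(train = nrow(Collegetrain), test = nrow(Collegetest))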
# forward stepwise selection over all 17 predictors of Outstate
fwd.step <- regsubsets(Outstate ~ ., data = Collegetrain, method = "forward", nvmax = 18)
sumfwd.step <- summary(fwd.step)
sumfwd.step
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = Collegetrain, method = "forward",
## nvmax = 18)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " " " " " " " " " " "
## 10 ( 1 ) "*" " " "*" " " " " " " " "
## 11 ( 1 ) "*" " " "*" " " " " " " "*"
## 12 ( 1 ) "*" "*" "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" "*"
## 9 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 10 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" "*"
## 5 ( 1 ) " " "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
par(mfrow = c(1, 3))
plot(sumfwd.step$adjr2, xlab = "Number of variables", ylab = "Adjusted R-squared", type = "b")
plot(sumfwd.step$cp, xlab = "Number of variables", ylab = "Cp", type = "b")
plot(sumfwd.step$bic, xlab = "Number of variables", ylab = "BIC", type = "b")
It appears that 6 is a reasonable number of variables to choose: all three criteria have mostly leveled off by that point.
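As a numeric check on the plots, here is a quick sketch that locates each criterion's optimum (reusing sumfwd.step from above):
which.max(sumfwd.step$adjr2) # size with the highest adjusted R-squared
which.min(sumfwd.step$cp)    # size with the lowest Cp
which.min(sumfwd.step$bic)   # size with the lowest BIC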
coeffs <- coef(fwd.step, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Terminal" "perc.alumni"
## [6] "Expend" "Grad.Rate"
# fit a GAM on the six selected predictors, using smoothing splines for the
# quantitative ones; Private enters as a linear (dummy) term
gam.fit <- gam(Outstate ~ Private + s(Room.Board) + s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = Collegetrain)
par(mfrow = c(2,3))
plot(gam.fit, se = TRUE, col = 'blue')
# test-set MSE
err <- mean((Collegetest$Outstate - predict(gam.fit, newdata = Collegetest))^2)
err
## [1] 3641989
# test-set R-squared: 1 - MSE / variance of the held-out outcome
tss <- mean((Collegetest$Outstate - mean(Collegetest$Outstate))^2)
rs <- 1 - err/tss
rs
## [1] 0.7794837
We have a test R-squared of about .78, which means that roughly 78% of the variation in out-of-state tuition can be explained by our GAM.
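On the original scale, the square root of the test MSE is easier to interpret (a one-line sketch reusing err from above):
sqrt(err)  # test RMSE, in the same units as Outstate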
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Terminal) +
## s(perc.alumni) + s(Expend) + s(Grad.Rate), data = Collegetrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7387.37 -1080.65 30.96 1245.60 7782.51
##
## (Dispersion Parameter for gaussian family taken to be 3414453)
##
## Null Deviance: 10015674527 on 622 degrees of freedom
## Residual Deviance: 2052088069 on 601.0006 degrees of freedom
## AIC: 11163.72
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2848901411 2848901411 834.37 < 2.2e-16 ***
## s(Room.Board) 1 1863074984 1863074984 545.64 < 2.2e-16 ***
## s(Terminal) 1 691934312 691934312 202.65 < 2.2e-16 ***
## s(perc.alumni) 1 390893703 390893703 114.48 < 2.2e-16 ***
## s(Expend) 1 915495774 915495774 268.12 < 2.2e-16 ***
## s(Grad.Rate) 1 138830787 138830787 40.66 3.614e-10 ***
## Residuals 601 2052088069 3414453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.2819 0.07812 .
## s(Terminal) 3 2.6420 0.04854 *
## s(perc.alumni) 3 1.5776 0.19363
## s(Expend) 3 30.4245 < 2e-16 ***
## s(Grad.Rate) 3 2.5902 0.05201 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It appears that Expend (p < 2e-16) and Terminal (p = .0485) both have a non-linear relationship with Outstate at the \(\alpha\) = .05 level. There is also some evidence that Room.Board (p = .0781) and Grad.Rate (p = .0520) have non-linear relationships, but these are not significant at the same level.
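One way to back this up is a formal comparison of the GAM against a fully linear model on the same six predictors (a sketch; lin.fit is my name):
lin.fit <- gam(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate, data = Collegetrain)
anova(lin.fit, gam.fit, test = "F")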