library(ISLR2)
library(boot)
library(tidyverse)
wage <- Wage
# 10-fold CV error for polynomial fits of degree 1 to 10
set.seed(1)
cv <- numeric(10)
for (i in 1:10) {
  glmfit <- glm(wage ~ poly(age, i), data = wage)
  cv[i] <- cv.glm(wage, glmfit, K = 10)$delta[1]
}
plot(1:10, cv, type = "b", xlab = "Degree", ylab = "Test MSE")
d.min <- which.min(cv)
points(d.min, cv[d.min], col = "red", pch = 20)
# The lowest CV error occurs at degree 7, but the improvement over degree 3 is marginal, so degree 3 is used.
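# To make "marginal" concrete, one can compare each degree's CV error to the
# minimum; the 5% tolerance below is an arbitrary illustrative choice, not
# part of the exercise.
round(cv / min(cv) - 1, 4)           # relative excess CV error per degree
which(cv / min(cv) - 1 < 0.05)[1]    # smallest degree within 5% of the minimum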
glmfit <- glm(wage ~ poly(age, 3), data = wage)
agelims <- range(wage$age)
agegrid <- seq(from = agelims[1], to = agelims[2])
pred <- predict(glmfit, newdata = list(age = agegrid))
plot(wage$age, wage$wage, col = "darkgrey", xlab = "Age", ylab = "Wage")
lines(agegrid, pred, lwd = 2, col = "red")
fit1 <- lm(wage ~ age, data = wage)
fit2 <- lm(wage ~ poly(age, 2), data = wage)
fit3 <- lm(wage ~ poly(age, 3), data = wage)
fit4 <- lm(wage ~ poly(age, 4), data = wage)
fit5 <- lm(wage ~ poly(age, 5), data = wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
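# The ANOVA agrees with the CV result: the quadratic and cubic terms are highly
# significant, the quartic term is borderline (p = 0.051), and the quintic term
# is not, so a cubic polynomial is a reasonable choice.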
# 10-fold CV error for step functions with 2 to 10 cuts
cv <- rep(NA, 10)
for (i in 2:10) {
  wage$age.cut <- cut(wage$age, i)
  fit <- glm(wage ~ age.cut, data = wage)
  cv[i] <- cv.glm(wage, fit, K = 10)$delta[1]
}
plot(2:10, cv[-1], xlab = 'Cuts', ylab = 'Test MSE')
# The lowest test MSE in the plot occurs at 8 cuts.
plot(wage ~ age, data = wage, col = "darkgrey")
glmfit <- glm(wage ~ cut(age, 8), data = wage)
pred <- predict(glmfit, list(age = agegrid))
lines(agegrid, pred, col = "red", lwd = 2)
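# Optional: add +/- 2 SE bands around the step-function fit, a minimal sketch
# using predict()'s se.fit argument (not part of the original answer).
pred.se <- predict(glmfit, list(age = agegrid), se.fit = TRUE)
lines(agegrid, pred.se$fit + 2 * pred.se$se.fit, col = "red", lty = 3)
lines(agegrid, pred.se$fit - 2 * pred.se$se.fit, col = "red", lty = 3)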
library(caTools)
set.seed(1)
college <- College
sample <- sample.split(college$Apps, SplitRatio = 0.5)
train <- subset(college, sample == TRUE)
test <- subset(college, sample == FALSE)
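# Quick sanity check on the split (the exact counts depend on the seed and on
# sample.split's stratification):
nrow(train); nrow(test)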
library(leaps)
library(gam)
regfit <- regsubsets(Outstate ~ ., data = train, nvmax = 17)
reg.summary <- summary(regfit)
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables",
     ylab = "Adjusted RSq", type = "l")
which.max(reg.summary$adjr2)
## [1] 11
points(11, reg.summary$adjr2[11], col = "red", cex = 2, pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2, pch = 20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
points(6, reg.summary$bic[6], col = "red", cex = 2, pch = 20)
# BIC is minimized at 6 variables, so we stop there.
par(mfrow = c(2, 2))
plot(regfit, scale = "r2")
plot(regfit, scale = "adjr2")
plot(regfit, scale = "Cp")
plot(regfit, scale = "bic")
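# In these plots each row is a model, with black squares marking the variables
# it includes; the top row is the best model under the given criterion.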
coef(regfit, id = 6)
## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -3579.4690422 2484.7981053 1.1390901 33.2412341 53.4580937
## Expend Grad.Rate
## 0.1857791 25.8777174
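# As a baseline for the GAMs below, the 6-variable linear model's test MSE can
# be computed by hand (leaps has no predict() method for regsubsets objects, so
# the model matrix is built manually; this check is not part of the original
# answer).
test.mat <- model.matrix(Outstate ~ ., data = test)
coef6 <- coef(regfit, id = 6)
lmpred <- test.mat[, names(coef6)] %*% coef6
mean((test$Outstate - lmpred)^2)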
gam1 <- lm(Outstate ~ ns(Room.Board, 2) + ns(Terminal, 2) + ns(perc.alumni, 2) +
             ns(Expend, 5) + ns(Grad.Rate, 2) + Private, data = train)
gam2 <- gam(Outstate ~ s(Room.Board, df = 2) + s(Terminal, df = 2) +
              s(perc.alumni, df = 2) + s(Expend, df = 5) +
              s(Grad.Rate, df = 2) + Private, data = train)
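# gam1 fits fixed-knot natural splines by ordinary least squares, while gam2
# fits smoothing splines via gam()'s backfitting algorithm; the df arguments
# play the same role in both, controlling the flexibility of each term.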
par(mfrow = c(2,3))
plot(gam2, se=TRUE,col ="#1c9099")
summary(gam2)
##
## Call: gam(formula = Outstate ~ s(Room.Board, df = 2) + s(Terminal,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2) + Private, data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5625.99 -1020.40 97.92 1228.22 7659.61
##
## (Dispersion Parameter for gaussian family taken to be 3277325)
##
## Null Deviance: 6217170124 on 387 degrees of freedom
## Residual Deviance: 1222443287 on 373.0003 degrees of freedom
## AIC: 6938.783
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Room.Board, df = 2) 1 2369817502 2369817502 723.095 < 2.2e-16 ***
## s(Terminal, df = 2) 1 81469518 81469518 24.858 9.469e-07 ***
## s(perc.alumni, df = 2) 1 720940487 720940487 219.978 < 2.2e-16 ***
## s(Expend, df = 5) 1 570748780 570748780 174.151 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 101439820 101439820 30.952 5.056e-08 ***
## Private 1 233522088 233522088 71.254 7.044e-16 ***
## Residuals 373 1222443287 3277325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Room.Board, df = 2) 1 1.1340 0.2876
## s(Terminal, df = 2) 1 2.1386 0.1445
## s(perc.alumni, df = 2) 1 1.7096 0.1918
## s(Expend, df = 5) 4 20.5681 2.442e-15 ***
## s(Grad.Rate, df = 2) 1 0.5476 0.4597
## Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
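# In the nonparametric ANOVA above, only s(Expend, df = 5) has a significant
# non-linear component; the remaining smooth terms are consistent with linear
# effects.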
par(mfrow=c(2,3))
plot.Gam(gam1, se=TRUE, col="#f03b20")
# Most of the fitted functions look roughly linear, especially Grad.Rate, Room.Board, perc.alumni, and Terminal: out-of-state tuition rises as each of these predictors increases (and would fall if a fitted slope were negative). The Private panel shows that Outstate is higher at private schools.
## (c) Evaluate the model obtained on the test set, and explain the results obtained.
gampred <- predict(gam1, test)
gampred2 <- predict(gam2, test)
mean((test$Outstate - gampred)^2)
## [1] 3874156
mean((test$Outstate - gampred2)^2)
## [1] 3791404
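# To put the MSEs on an interpretable scale, a test-set R^2 can be derived
# (a standard quantity, not asked for in the exercise):
tss <- mean((test$Outstate - mean(test$Outstate))^2)
1 - mean((test$Outstate - gampred2)^2) / tss   # R^2 for the smoothing-spline GAM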
# The test MSEs are shown above; the smoothing-spline GAM (gam2) does slightly better than the natural-spline fit (gam1).
# Expend is the only variable without a clearly linear relationship to the response: it is roughly linear at lower values but flattens out as Expend grows, which matches its significant nonparametric effect in the ANOVA table.