Load packages and attach data set
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(boot)
## Warning: package 'boot' was built under R version 4.0.4
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
attach(Wage)
# Estimate the 10-fold cross-validation error of polynomial regressions of
# wage on age for degrees 1 to 10, plot the error against the degree, and
# highlight the degree with the smallest estimated error.
set.seed(1)
deltas <- rep(NA, 10)
for (i in seq_along(deltas)) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  # delta[1] is the raw (unadjusted) cross-validation estimate.
  deltas[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}

plot(seq_along(deltas), deltas, xlab = "Degree", ylab = "Test MSE", type = "l")

# Degree with the lowest CV error; reused for the highlighted point.
d.min <- which.min(deltas)
points(d.min, deltas[d.min], col = "yellow", cex = 2, pch = 20)
# Find the best degree using ANOVA: fit nested polynomial models of degree
# 1 to 5 and sequentially F-test each model against the previous one.
fit1 <- lm(wage ~ poly(age, 1), data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
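A cubic or quartic polynomial provides a reasonable fit to the data: in the ANOVA table the cubic term is significant (p ≈ 0.002), the quartic term is borderline (p ≈ 0.051), and the quintic term is not significant (p ≈ 0.37).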
# Scatter plot of wage against age with the cubic polynomial fit overlaid.
# The formula is passed as a formula object with `raw = TRUE` spelled out:
# `T` is an ordinary variable that can be reassigned, whereas `TRUE` cannot.
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3, raw = TRUE))
# Estimate the 10-fold cross-validation error of step-function fits of wage
# on age for 2 to 10 cuts. Element i of `cvs` holds the error for i cuts, so
# element 1 is never filled and stays NA.
cvs <- rep(NA, 10)
for (i in 2:10) {
  # cv.glm() refits the model on subsets of `Wage`, so the binned age has to
  # be stored as a column of the data frame itself.
  Wage[["age.cut"]] <- cut(Wage[["age"]], breaks = i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}

# Skip the unused first element when drawing the error curve.
plot(2:10, cvs[2:10], xlab = "cuts", ylab = "cv error", type = "l")

# which.min() ignores the NA and returns the index in `cvs`, which is the
# number of cuts, so it can be used directly as the x coordinate.
d.min <- which.min(cvs)
points(d.min, cvs[d.min], col = "yellow", cex = 2, pch = 20)
# Plot wage against age with the step-function fit chosen by cross-validation.
# `d.min` holds the number of cuts with the lowest CV error among the
# candidates 2 to 10. The previous hard-coded value of 12 was never
# evaluated by the cross-validation, so the plot did not show the selected
# model.
step.formula <- as.formula(sprintf("y ~ cut(x, %d)", d.min))
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = step.formula)
Load packages and attach data set
library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
library(gam)
## Warning: package 'gam' was built under R version 4.0.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.3
## Loaded gam 1.20
attach(College)
Train and test split
# Randomly split College into a 70% training set and a 30% test set.
set.seed(1) # fixed seed so the split is reproducible
n.obs <- nrow(College)
# floor() makes the integer sample size explicit instead of relying on the
# implicit truncation of a fractional `size`; seq_len() is the safe form of
# 1:n.
row.number <- sample(seq_len(n.obs), floor(0.7 * n.obs))
coll_train <- College[row.number, ]
coll_test <- College[-row.number, ]
Model
# Forward stepwise selection of predictors of Outstate on the training set,
# allowing models with up to 17 variables; the summary carries the
# Cp, BIC and adjusted R2 of each model size used in the plots.
model <- regsubsets(
  Outstate ~ .,
  data = coll_train,
  nvmax = 17,
  method = "forward"
)
model.summary <- summary(model)
Plots
# Plot Cp, BIC and adjusted R2 against the number of variables in three
# side-by-side panels. In each panel the dashed red lines mark the best
# value of the criterion plus and minus 0.2 standard deviations of the
# criterion across model sizes.
par(mfrow = c(1, 3))

# Panel 1: Mallows' Cp (lower is better).
plot(model.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(model.summary$cp)
std.cp <- sd(model.summary$cp)
abline(h = min.cp + c(0.2, -0.2) * std.cp, col = "red", lty = 2)

# Panel 2: BIC (lower is better).
plot(model.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
min.bic <- min(model.summary$bic)
std.bic <- sd(model.summary$bic)
abline(h = min.bic + c(0.2, -0.2) * std.bic, col = "red", lty = 2)

# Panel 3: adjusted R2 (higher is better), drawn on a fixed y range.
plot(
  model.summary$adjr2,
  xlab = "Number of variables",
  ylab = "Adjusted R2",
  type = "l",
  ylim = c(0.4, 0.84)
)
max.adjr2 <- max(model.summary$adjr2)
std.adjr2 <- sd(model.summary$adjr2)
abline(h = max.adjr2 + c(0.2, -0.2) * std.adjr2, col = "red", lty = 2)
# List the predictors in the six-variable model chosen by forward selection.
# NOTE(review): this selection is run on the full College data, which
# includes the rows later held out in coll_test, so the variable choice has
# seen the test set. Consider selecting on coll_train instead and
# re-checking the six predictors used in the GAM formula.
model1 <- regsubsets(Outstate ~ ., data = College, method = "forward")
model.coefficients <- coef(model1, id = 6)
names(model.coefficients)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
# Fit a GAM for Outstate on the training set using the six selected
# predictors: Private enters as a factor, the numeric predictors as smoothing
# splines (df = 5 for Expend, df = 2 for the others).
gamfit <- gam(
  Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
    s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
  data = coll_train
)

# One panel per term, with standard-error bands. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
par(mfrow = c(2, 3))
plot(gamfit, se = TRUE, col = "green")
There is clear evidence of a non-linear effect of Expend. The relationship for perc.alumni is closer to linear, and the other terms show moderate non-linearity.
# Evaluate the GAM on the held-out test set: predict Outstate for the test
# rows and report the mean squared prediction error.
gpreds <- predict(gamfit, newdata = coll_test)
testerror <- mean((gpreds - coll_test$Outstate)^2)
testerror
## [1] 3205299
# Test-set R-squared: one minus the ratio of the test MSE to the mean squared
# deviation of the test responses from their own mean.
gamtss <- mean((coll_test$Outstate - mean(coll_test$Outstate))^2)
r.squared <- 1 - testerror / gamtss
r.squared
## [1] 0.767789
The test R-squared of the model is 0.77, using a GAM with 6 predictors.
# Print the summary of the fitted GAM, including the ANOVA tables for the
# parametric and the nonparametric effects of each term.
summary(gamfit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = coll_train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7559.16 -1122.45 85.28 1279.57 7865.22
##
## (Dispersion Parameter for gaussian family taken to be 3613715)
##
## Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 1908040819 on 527.9999 degrees of freedom
## AIC: 9757.19
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2416002978 2416002978 668.57 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1720867924 1720867924 476.20 < 2.2e-16 ***
## s(PhD, df = 2) 1 614780720 614780720 170.12 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 466260430 466260430 129.03 < 2.2e-16 ***
## s(Expend, df = 5) 1 753550406 753550406 208.53 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 100352623 100352623 27.77 1.994e-07 ***
## Residuals 528 1908040819 3613715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 3.3534 0.06763 .
## s(PhD, df = 2) 1 0.6329 0.42666
## s(perc.alumni, df = 2) 1 1.0388 0.30859
## s(Expend, df = 5) 4 27.4239 < 2e-16 ***
## s(Grad.Rate, df = 2) 1 1.7611 0.18507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
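There is strong evidence of a non-linear relationship between Expend and Outstate (nonparametric p < 2e-16). For the other smooth terms the nonparametric effects are not significant at the 5% level (Room.Board is borderline, p ≈ 0.068), so a linear term appears adequate for them; their parametric (linear) effects are all highly significant.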