library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.2
library(boot)
set.seed(1)
degree <- 10
cv_e <- rep(NA, degree)
for (i in 1:degree){
fit <- glm( wage ~ poly( age, i), data = Wage)
cv_e[i] <- cv.glm(Wage, fit)$delta[1]
}
plot:
#plot(1: degree, cv_e , xlab = "Degree", ylab = "Test MSE", type ="1")
plot(1:degree, cv_e, xlab = 'Degree', ylab = 'Test MSE', type = 'l')
opt_degree <- which.min(cv_e)
points(opt_degree, cv_e[opt_degree], col = 'red', cex = 2, pch = 19)
plot(wage ~ age, data = Wage, col = "grey")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly (age, 3), data = Wage )
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = 'blue', lwd = 2)
(b) Fit a step function to predict wage using age, and perform cross- validation to choose the optimal number of cuts. Make a plot of the fit obtained.
cv_e <- rep(NA, degree)
for (i in 2:degree ){
Wage$age.cut <- cut(Wage$age, i)
fit <- glm(wage ~ age.cut, data = Wage )
cv_e[i] <- cv.glm(Wage, fit)$delta[1]
}
plot(2:degree, cv_e[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
deg.min <- which.min(cv_e)
points(deg.min,cv_e[deg.min], col = 'red', cex = 2, pch = 19 )
plot(wage ~ age, data = Wage , col = 'grey')
fit <- glm( wage ~ cut (age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds,col = 'blue', lwd = 2)
res <- cut(c(1,5,2,3,8), 2)
res
## [1] (0.993,4.5] (4.5,8.01] (0.993,4.5] (0.993,4.5] (4.5,8.01]
## Levels: (0.993,4.5] (4.5,8.01]
length(res)
## [1] 5
class(res[1])
## [1] "factor"
library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.2
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" " " " " " " " " "*" " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " " " "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3815.6574509 2880.3858979 0.9861841 43.6735045 40.4602197
## Expend Grad.Rate
## 0.1770944 30.8363935
#install.packages("gam")
library(gam)
## Warning: package 'gam' was built under R version 4.0.2
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
gam.mod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni,5) + s(Expend,5) + s(Grad.Rate, 5), data = College, subset = train )
par (mfrow = c(2,3))
plot(gam.mod , se = TRUE, col = 'red')
we can see from the graph that Expend and Grad.Rate are strong non-linear with the Outstate.
preds <- predict(gam.mod, College[test, ])
RSS <- sum((College[test, ]$Outstate -preds)^2)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS/TSS)
## [1] 0.7649038
The R squared statistic in the test set is 0.765
summary(gam.mod)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7289.5 -1004.3 18.3 1123.6 4218.8
##
## (Dispersion Parameter for gaussian family taken to be 3138798)
##
## Null Deviance: 6139053909 on 387 degrees of freedom
## Residual Deviance: 1133105994 on 361 degrees of freedom
## AIC: 6933.339
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1658551575 1658551575 528.404 < 2.2e-16 ***
## s(Room.Board, 5) 1 1093958629 1093958629 348.528 < 2.2e-16 ***
## s(Terminal, 5) 1 239592419 239592419 76.332 < 2.2e-16 ***
## s(perc.alumni, 5) 1 189302589 189302589 60.310 8.461e-14 ***
## s(Expend, 5) 1 671008681 671008681 213.779 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 87504239 87504239 27.878 2.236e-07 ***
## Residuals 361 1133105994 3138798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 3.6201 0.006576 **
## s(Terminal, 5) 4 2.3018 0.058243 .
## s(perc.alumni, 5) 4 0.8690 0.482600
## s(Expend, 5) 4 28.0768 < 2.2e-16 ***
## s(Grad.Rate, 5) 4 2.7848 0.026556 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the above results, the anova for nonparametric effects show Expend has strong non-linear relationship with the Outstate. Grad.Rate and PhD have moderate non-linear relationship with the Outstate.
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.