library(ISLR)
library(MASS)
library(ggplot2)
attach(Wage)
require(boot)
## Loading required package: boot
set.seed(1)
poly_cv <- rep(0,5)
for (i in 1:5){
glm.fit.poly <- glm(wage ~ poly(age,i),data=Wage)
poly_cv[i]<- cv.glm(Wage,glm.fit.poly,K=10)$delta[1]
}
poly_cv
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977
range_poly <- range(poly_cv)
min.cv <- min(poly_cv)
range_poly
## [1] 1594.977 1676.826
min.cv
## [1] 1594.977
plot(poly_cv, type="b")
points(which.min(poly_cv), poly_cv[which.min(poly_cv)], col="pink", pch=20, cex=2)
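As an optional sanity check (not part of the original output), the successive improvements in CV error make the flattening explicit: the large gain comes from moving from degree 1 to degree 2, after which each additional degree changes the CV error by less than 3 units.
# Optional check: change in 10-fold CV error for each added polynomial degree.
diff(poly_cv)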
set.seed(1)
wa.fit.1=lm(wage~age,data=Wage)
wa.fit.2=lm(wage~poly(age,2),data=Wage)
wa.fit.3=lm(wage~poly(age,3),data=Wage)
wa.fit.4=lm(wage~poly(age,4),data=Wage)
wa.fit.5=lm(wage~poly(age,5),data=Wage)
anova(wa.fit.1,wa.fit.2,wa.fit.3,wa.fit.4,wa.fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
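Because poly() generates orthogonal polynomials, the F-statistics above can also be read off the coefficient table of the largest fit: the squared t-statistics from wa.fit.5 equal the ANOVA F-statistics (e.g. the linear term reproduces 143.59). A quick optional check:
# Optional check: squared t-statistics of the degree-5 fit match the ANOVA F-statistics.
coef(summary(wa.fit.5))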
library(dvmisc)
## Loading required package: rbenchmark
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
get_mse(wa.fit.1)
## [1] 1675.189
get_mse(wa.fit.2)
## [1] 1599.409
get_mse(wa.fit.3)
## [1] 1594.684
get_mse(wa.fit.4)
## [1] 1593.19
get_mse(wa.fit.5)
## [1] 1593.294
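The get_mse() values match the residual sum of squares divided by the residual degrees of freedom of each fit (e.g. 4771604 / 2995 ≈ 1593.19 for the degree-4 model), so, assuming that is what dvmisc computes, the same numbers can be reproduced without the extra dependency:
# Training MSE computed directly as RSS / residual df (no dvmisc needed);
# for the degree-4 fit this reproduces the 1593.19 reported above.
sum(residuals(wa.fit.4)^2) / df.residual(wa.fit.4)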
age.fit <- range(age)
age.plot.6a <- seq(from = age.fit[1], to = age.fit[2])
preds <- predict(wa.fit.4, newdata = list(age = age.plot.6a), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 1))
plot(age, wage, xlim = age.fit, cex = .5, col = "grey")
title("Degree 4 Polynomial")
lines(age.plot.6a, preds$fit, lwd = 2, col = "purple")
matlines(age.plot.6a, se.bands, lwd = 1, col = "grey", lty = 3)
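Since ggplot2 is already loaded, an equivalent figure can be sketched with geom_smooth(), which refits the degree-4 polynomial and shades its pointwise confidence band (an optional alternative, not part of the original output):
# ggplot2 alternative: refit the degree-4 polynomial and shade the
# confidence band around the fitted curve.
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3, colour = "grey") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 4), colour = "purple") +
  ggtitle("Degree 4 Polynomial")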
set.seed(5)
cv.opt <- rep(NA,5)
for (i in 2:5){
Wage$age.cut <- cut(Wage$age,i)
glm.fit.step <- glm(wage ~ age.cut, data = Wage)
cv.opt[i]<- cv.glm(Wage,glm.fit.step,K=10)$delta[1]
}
cv.opt
## [1] NA 1733.469 1681.712 1636.621 1630.320
range.cv <- range(cv.opt, na.rm = TRUE)
plot(cv.opt, type="b")
points(which.min(cv.opt), cv.opt[which.min(cv.opt)], col="pink", pch=20, cex=2)
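Since the CV error is lowest at five intervals, it can help to inspect the breakpoints that cut() produces for that choice (a small optional check):
# Optional: the age intervals and their counts when age is cut into 5 bins.
table(cut(Wage$age, 5))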
plot(wage ~ age, data = Wage, col = "darkgrey")
glm.fit.step <- glm(wage ~ cut(age, 5), data = Wage)
# Predict over the full age grid (same range as the training data, so cut() re-creates the same bins).
preds.fitplot <- predict(glm.fit.step, newdata = list(age = age.plot.6a))
lines(age.plot.6a, preds.fitplot, col = "purple", lwd = 2)
College <- na.omit(College)
attach(College)
samp_size10 <- floor(.7 * nrow(College))
set.seed(1)
train_index <- sample(seq_len(nrow(College)), size = samp_size10)
train <- College[train_index, ]
test <- College[-train_index, ]
set.seed(1)
library(leaps)
reg.fit.10a <- regsubsets(Outstate ~ ., data = train, method = 'forward')
summary(reg.fit.10a)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
names(coef(reg.fit.10a, id = 6))
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
b.) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
set.seed(1)
reg.plot.10b <- gam(Outstate ~ Private + s(Room.Board, 5) + s(PhD, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = train)
par(mfrow = c(2,3))
plot(reg.plot.10b, se=TRUE,col ="#991c90")
c.) Evaluate the model obtained on the test set, and explain the results obtained.
preds = predict(reg.plot.10b, newdata=test)
sum(abs(test$Outstate - preds))/nrow(test)
## [1] 1337.698
On the test set, the GAM has a mean absolute error of 1337.698, i.e. its out-of-state tuition predictions are off by roughly $1,338 on average.
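For extra context (an optional addition reusing the preds object above), the test RMSE and a test R-squared can be computed alongside the MAE:
# Optional: test RMSE and R^2 for the GAM, to complement the MAE above.
test.mse <- mean((test$Outstate - preds)^2)
sqrt(test.mse)                                               # test RMSE
1 - test.mse / mean((test$Outstate - mean(test$Outstate))^2) # test R^2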
d.) For which variables, if any, is there evidence of a non-linear relationship with the response?
Based on our plots, there appears to be evidence of a non-linear relationship between Outstate and Expend.
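A more formal check than the plots is the "Anova for Nonparametric Effects" table reported by summary() on the gam object; small p-values there indicate smooth terms with significant non-linear components, so it can be used to confirm (or refute) the visual impression for Expend and the other predictors:
# Optional formal check: nonparametric-effects p-values for each smooth term.
summary(reg.plot.10b)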