library(ISLR2)
library(boot)
attach(Wage)
set.seed(5)
cv.error <- rep(0,5)
for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1675.881 1600.625 1595.877 1594.165 1594.823
plot(x=1:5, y=cv.error, xlab= "age", ylab = "CV Error", type = "b", pch = 20, lwd = 1, lty = 3)
degree.min <- which.min(cv.error)
points(degree.min, cv.error[degree.min], col = "orange", cex = 2, pch = 20)
fit.1<-lm(wage~age, data=Wage)
fit.2<-lm(wage~poly(age,2), data=Wage)
fit.3<-lm(wage~poly(age,3), data=Wage)
fit.4<-lm(wage~poly(age,4), data=Wage)
fit.5<-lm(wage~poly(age,5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age.limit<- range(age)
age.grid<- seq(from=age.limit[1], to=age.limit[2])
pred <- predict(fit.4, newdata = list(age=age.grid), se=T)
se.bands<- cbind(pred$fit+2*pred$se.fit, pred$fit -2*pred$se.fit)
plot(age, wage, xlim=age.limit, cex=.5, col ="darkgreen")
title<- ("Degree-4 polynomial")
lines(age.grid, pred$fit, lwd = 2, col="blue")
matlines (age.grid, se.bands, lwd =1, col="blue", lty = 3)
#### b) Fit a step function to predict wage using age, and perform
cross-validation to choose the optimal number of cuts. Make a plot of
the fit obtained. #### Cross-validation shows that 8 cuts will
suffice.
set.seed(10)
cv.err<- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
cv.err[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv.err
## [1] NA 1733.954 1681.195 1637.324 1632.526 1624.776 1610.919 1600.948
## [9] 1610.202 1603.532
plot(x=1:10, y=cv.err, xlab= "age", ylab = "CV Error", type = "b", pch = 20, lwd = 1, lty = 3)
degree.min <- which.min(cv.err)
points(degree.min, cv.err[degree.min], col = "blue", cex = 2, pch = 20)
step.fit <- glm(wage~cut(age,8), data=Wage)
preds<- predict(step.fit, data.frame(age=age.grid))
plot(age, wage, col = "grey")
lines(age.grid, preds, col = "purple", lwd = 2)
### Question 10. This question relates to the College data set.
attach(College)
library(leaps)
set.seed(15)
test.set <- sample(1:nrow(College), nrow(College)/2)
train <- College[-test.set,]
test<- College[test.set,]
#Forward step-wise selection
fwd.fit<- regsubsets(Outstate ~., data = College, nvmax = 17, method = "forward")
fit.summary <- summary(fwd.fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, nvmax = 17,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " " " " " " " " " " "
## 10 ( 1 ) "*" " " "*" " " " " " " " "
## 11 ( 1 ) "*" " " "*" " " " " " " "*"
## 12 ( 1 ) "*" "*" "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 10 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
coef(fwd.fit, 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3553.2345268 2768.6347025 0.9679086 35.5283359 48.4221031
## Expend Grad.Rate
## 0.2210255 29.7119093
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.2
library(splines)
gam_fit<- gam(Outstate~ Private+ s(Room.Board,df=2) + s(PhD, df=2) + s(perc.alumni,df=2) + s(Expend, df=2) + s(Grad.Rate, df=2), data = train)
par(mfrow=c(2,3))
plot(gam_fit, se=T, col="purple")
#### c) Evaluate the model obtained on the test set, and explaing the
results obtained. #### The model obtains an r-squared of ~0.78.
pred <- predict(gam_fit, newdata = test)
error <- mean((test$Outstate-pred)^2)
error
## [1] 3750164
TotSS= mean((test$Outstate -mean(test$Outstate))^2)
Rsq = 1- error/TotSS
Rsq
## [1] 0.7780796
summary(gam_fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate,
## df = 2), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7444.47 -1201.52 -45.45 1229.99 5056.03
##
## (Dispersion Parameter for gaussian family taken to be 3540380)
##
## Null Deviance: 6001737341 on 388 degrees of freedom
## Residual Deviance: 1334722893 on 376.9999 degrees of freedom
## AIC: 6983.766
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1557062605 1557062605 439.801 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1421421366 1421421366 401.488 < 2.2e-16 ***
## s(PhD, df = 2) 1 484793080 484793080 136.933 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 266302517 266302517 75.219 < 2.2e-16 ***
## s(Expend, df = 2) 1 405736370 405736370 114.603 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 72573436 72573436 20.499 8.014e-06 ***
## Residuals 377 1334722893 3540380
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 0.583 0.44544
## s(PhD, df = 2) 1 3.804 0.05185 .
## s(perc.alumni, df = 2) 1 0.522 0.47063
## s(Expend, df = 2) 1 37.796 2.005e-09 ***
## s(Grad.Rate, df = 2) 1 1.638 0.20138
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1