library(ISLR)
library(boot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
library(leaps)
# Question 6
6a.
data(Wage)
set.seed(7)
# 10-fold CV error for polynomial degrees 1 through 10
cv.error = rep(NA, 10)
for (i in 1:10) {
  glm.fit = glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
print(cv.error)
## [1] 1677.629 1600.838 1595.967 1593.759 1595.406 1594.984 1595.637 1595.991
## [9] 1593.700 1594.877
plot(1:10, cv.error, xlab = "Degree of Polynomial", ylab = "CV Error", type = "l")
points(which.min(cv.error), cv.error[which.min(cv.error)], col = "red", cex = 2, pch = 20)
From the line graph, d = 9 gives the minimum CV error, but the curve is essentially flat from d = 3 onward, so the lower-degree fits perform almost as well.
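As a quick numeric check (a sketch using the cv.error vector computed above):
d.min <- which.min(cv.error)         # degree with the lowest CV error (9 here)
round(cv.error - min(cv.error), 2)   # gap to the minimum; tiny from d = 3 onward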
# Fit polynomials of degree 1 through 10 for a sequential ANOVA comparison
set.seed(7)
fit.1 = lm(wage~poly(age, 1), data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)
fit.6 = lm(wage~poly(age, 6), data=Wage)
fit.7 = lm(wage~poly(age, 7), data=Wage)
fit.8 = lm(wage~poly(age, 8), data=Wage)
fit.9 = lm(wage~poly(age, 9), data=Wage)
fit.10 = lm(wage~poly(age, 10), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
## Model 9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.7638 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.9005 0.001669 **
## 4 2995 4771604 1 6070 3.8143 0.050909 .
## 5 2994 4770322 1 1283 0.8059 0.369398
## 6 2993 4766389 1 3932 2.4709 0.116074
## 7 2992 4763834 1 2555 1.6057 0.205199
## 8 2991 4763707 1 127 0.0796 0.777865
## 9 2990 4756703 1 7004 4.4014 0.035994 *
## 10 2989 4756701 1 3 0.0017 0.967529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the sequential ANOVA, the cubic term is highly significant (p ≈ 0.002) and the degree-9 term is marginally significant (p ≈ 0.036), so a degree-3 or degree-9 polynomial gives a reasonable fit to the data.
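Because poly() uses orthogonal polynomials, the same sequential tests can be read off the coefficient table of the largest fit: each squared t value reproduces the corresponding F statistic in the ANOVA above (ignoring the intercept row). A minimal check:
coef(summary(fit.10))[, "t value"]^2   # squared t values match the ANOVA F column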
# Degree-3 polynomial fit plotted over the observed age range
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 3), data=Wage)
lm.pred = predict(lm.fit, data.frame(age=age.grid))
lines(age.grid, lm.pred, col="blue", lwd=2)
# Degree-9 polynomial fit plotted over the observed age range
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 9), data=Wage)
lm.pred = predict(lm.fit, data.frame(age=age.grid))
lines(age.grid, lm.pred, col="blue", lwd=2)
6b.
set.seed(7)
# NA (not 0) so the unused first slot cannot win which.min() below
cvs <- rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, cvs[-1], xlab = "Cuts", ylab = "Cross-validation Error", type = "l")
points(which.min(cvs), cvs[which.min(cvs)], col = "red", cex = 2, pch = 20)
From the line plot, 8 cuts (intervals) give the lowest cross-validation error.
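A one-line confirmation using the cvs vector from above:
which.min(cvs)   # returns 8, the number of intervals minimizing the CV error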
# Step-function fit with the selected 8 intervals
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, data.frame(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
# Question 10
10a.
data(College)
set.seed(7)
college.Index <- createDataPartition(y = College$Outstate,
                                     p = 0.8, list = FALSE, times = 1)
collegeTrain<-College[college.Index,]
collegeTest<-College[-college.Index,]
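A quick sanity check on the split (College has 777 rows, so roughly 80% of them, about 623, should land in the training set):
dim(collegeTrain)   # training rows and columns
dim(collegeTest)    # held-out test rows and columns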
set.seed(7)
# Forward stepwise selection on the training set (17 predictors available)
regfit.full = regsubsets(Outstate ~ ., data = collegeTrain, method = 'forward', nvmax = 20)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = collegeTrain, method = "forward",
## nvmax = 20)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " "*" " " " " " " " "
## 10 ( 1 ) "*" " " "*" "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" "*" " " " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " "*"
## 9 ( 1 ) " " "*" " " "*" "*" " " "*"
## 10 ( 1 ) " " "*" " " "*" "*" " " "*"
## 11 ( 1 ) " " "*" " " "*" "*" " " "*"
## 12 ( 1 ) " " "*" " " "*" "*" " " "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" "*"
## 5 ( 1 ) " " "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
reg.summary=summary(regfit.full)
reg.summary$rss
## [1] 5409486904 3967921849 3198637338 2861982908 2646023259 2537067195
## [7] 2507910353 2488391448 2467278495 2350019084 2319927210 2306603884
## [13] 2295828624 2287107972 2285053714 2284758514 2284543501
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
Training RSS necessarily decreases as variables are added, so the 17-variable model has the smallest RSS by construction; the curve flattens after about six variables, which motivates the compact six-variable model examined below.
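Because training RSS cannot choose a size on its own, the penalized criteria returned by summary.regsubsets are worth a look (a sketch using the reg.summary object above):
which.max(reg.summary$adjr2)   # size favored by adjusted R^2
which.min(reg.summary$cp)      # size favored by Mallows' Cp
which.min(reg.summary$bic)     # size favored by BIC, typically the most parsimonious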
coef(regfit.full, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3700.3613840 2667.0260439 0.9496997 38.1529684 42.4867544
## Expend Grad.Rate
## 0.2278933 33.3056352
forward.mod = lm(Outstate ~ Private + Room.Board + PhD + perc.alumni +
Expend + Grad.Rate, data = collegeTrain)
summary(forward.mod)
##
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni +
## Expend + Grad.Rate, data = collegeTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7528.8 -1294.7 -109.3 1237.5 10541.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.700e+03 4.746e+02 -7.797 2.72e-14 ***
## PrivateYes 2.667e+03 2.321e+02 11.489 < 2e-16 ***
## Room.Board 9.497e-01 9.483e-02 10.015 < 2e-16 ***
## PhD 3.815e+01 6.014e+00 6.344 4.34e-10 ***
## perc.alumni 4.249e+01 8.260e+00 5.143 3.63e-07 ***
## Expend 2.279e-01 2.056e-02 11.085 < 2e-16 ***
## Grad.Rate 3.331e+01 5.946e+00 5.601 3.21e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2029 on 616 degrees of freedom
## Multiple R-squared: 0.7473, Adjusted R-squared: 0.7448
## F-statistic: 303.6 on 6 and 616 DF, p-value: < 2.2e-16
All six predictors retained by forward selection are highly significant in this linear model.
10b.
gam.fit = gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                s(Expend) + s(Grad.Rate), data = collegeTrain)
par(mfrow = c(2, 3))
plot(gam.fit,se=T,col='blue')
10c.
Test MSE for the GAM:
mean((predict(gam.fit, newdata = collegeTest) - collegeTest$Outstate)^2)
## [1] 3525787
Test MSE for the forward-selection linear model:
mean((predict(forward.mod, newdata = collegeTest) - collegeTest$Outstate)^2)
## [1] 4204301
The GAM attains a lower test MSE (about 3.53 million) than the forward-selection linear model (about 4.20 million).
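To put the two errors on a common scale, test-set R^2 can be computed from the same predictions (a sketch; err.gam, err.lm, and tss are names introduced here):
err.gam <- mean((predict(gam.fit, newdata = collegeTest) - collegeTest$Outstate)^2)
err.lm  <- mean((predict(forward.mod, newdata = collegeTest) - collegeTest$Outstate)^2)
tss     <- mean((collegeTest$Outstate - mean(collegeTest$Outstate))^2)
1 - err.gam / tss   # test R^2 for the GAM
1 - err.lm / tss    # test R^2 for the linear model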
10d.
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = collegeTrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7858.72 -1096.61 -5.51 1214.18 7989.77
##
## (Dispersion Parameter for gaussian family taken to be 3468935)
##
## Null Deviance: 10038612702 on 622 degrees of freedom
## Residual Deviance: 2084832235 on 601.0006 degrees of freedom
## AIC: 11173.58
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2577999460 2577999460 743.17 < 2.2e-16 ***
## s(Room.Board) 1 1970010191 1970010191 567.90 < 2.2e-16 ***
## s(PhD) 1 737262125 737262125 212.53 < 2.2e-16 ***
## s(perc.alumni) 1 379073222 379073222 109.28 < 2.2e-16 ***
## s(Expend) 1 796019016 796019016 229.47 < 2.2e-16 ***
## s(Grad.Rate) 1 133346145 133346145 38.44 1.05e-09 ***
## Residuals 601 2084832235 3468935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 3.7280 0.01125 *
## s(PhD) 3 2.5940 0.05174 .
## s(perc.alumni) 3 1.6803 0.17005
## s(Expend) 3 29.0774 < 2e-16 ***
## s(Grad.Rate) 3 2.3793 0.06873 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA for nonparametric effects, Expend shows a strongly non-linear effect (p < 2e-16) and Room.Board a mildly non-linear one (p ≈ 0.011); PhD and Grad.Rate are only marginally non-linear, and perc.alumni shows no evidence of non-linearity.
From the ANOVA for parametric effects, every predictor has a highly significant linear component; note that a small p-value here indicates a significant linear term, not that the relationship is purely linear.
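One way to corroborate the non-linearity of Expend is to refit the GAM with Expend entered linearly and compare the nested fits with an F-test (a sketch; gam.lin is a name introduced here):
gam.lin <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                 Expend + s(Grad.Rate), data = collegeTrain)
anova(gam.lin, gam.fit, test = "F")   # a small p-value favors the smooth s(Expend)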