require(ISLR); require(caret); require(tidyverse); require(broom); require(ggthemes)
## Loading required package: ISLR
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: ggplot2
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## Loading required package: broom
## Loading required package: ggthemes
## Warning: package 'ggthemes' was built under R version 4.0.4
require(knitr); require(kableExtra)
## Loading required package: knitr
## Loading required package: kableExtra
## Warning: package 'kableExtra' was built under R version 4.0.4
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
theme_set(theme_tufte(base_size = 14) + theme(legend.position = 'top'))
set.seed(1)
data('Wage')
set.seed(101)
inTrain <- sample(nrow(Wage), nrow(Wage) * 0.6)  # 60/40 train/validation split
train <- Wage[inTrain, ]
val <- Wage[-inTrain, ]
errors <- list()
models <- list()
for (i in 1:20) {
  models[[i]] <- lm(wage ~ poly(age, i), data = train)  # degree-i polynomial fit
  pred <- predict(models[[i]], newdata = val)
  resid <- val$wage - pred
  errors[[i]] <- sqrt(mean(resid^2))                    # validation RMSE for degree i
}
tibble(Errors = unlist(errors)) %>%
  mutate(Polynomial = row_number()) %>%
  ggplot(aes(Polynomial, Errors)) +
  geom_col(aes(fill = Polynomial == which.min(unlist(errors)))) +
  coord_cartesian(ylim = c(min(unlist(errors)) - 1*sd(unlist(errors)),
                           min(unlist(errors)) + 3*sd(unlist(errors)))) +
  scale_x_continuous(breaks = 1:length(errors)) +
  guides(fill = FALSE)
do.call(anova, models) %>%
  tidy() %>%
  mutate(Model = paste0('Model ', 1:20),
         Percent = sumsq / lag(rss) * 100) %>%
  select(Model, sumsq, Percent, rss, p.value) %>%
  rename(Error = rss, Explained = sumsq, p = p.value) %>%
  kable(digits = 2, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover')) %>%
  scroll_box(height = '500px')
| Model | Explained | Percent | Error | p |
|---|---|---|---|---|
| Model 1 | NA | NA | 2883771 | NA |
| Model 2 | 137713.65 | 4.78 | 2746057 | 0.00 |
| Model 3 | 6086.71 | 0.22 | 2739971 | 0.05 |
| Model 4 | 3232.92 | 0.12 | 2736738 | 0.15 |
| Model 5 | 1440.32 | 0.05 | 2735297 | 0.33 |
| Model 6 | 2345.57 | 0.09 | 2732952 | 0.22 |
| Model 7 | 200.11 | 0.01 | 2732752 | 0.72 |
| Model 8 | 0.16 | 0.00 | 2732752 | 0.99 |
| Model 9 | 1635.54 | 0.06 | 2731116 | 0.30 |
| Model 10 | 488.43 | 0.02 | 2730628 | 0.57 |
| Model 11 | 110.62 | 0.00 | 2730517 | 0.79 |
| Model 12 | 83.55 | 0.00 | 2730433 | 0.82 |
| Model 13 | 702.51 | 0.03 | 2729731 | 0.50 |
| Model 14 | 551.39 | 0.02 | 2729180 | 0.55 |
| Model 15 | 2927.10 | 0.11 | 2726252 | 0.17 |
| Model 16 | 2223.24 | 0.08 | 2724029 | 0.23 |
| Model 17 | 213.26 | 0.01 | 2723816 | 0.71 |
| Model 18 | 801.30 | 0.03 | 2723015 | 0.47 |
| Model 19 | 0.05 | 0.00 | 2723015 | 1.00 |
| Model 20 | 220.08 | 0.01 | 2722794 | 0.70 |
model <- lm(wage ~ poly(age, which.min(unlist(errors))), data = train)
val %>%
  mutate(predictions = predict(model, val)) %>%
  ggplot(aes(age, wage, col = 'blue')) +
  geom_point() +
  geom_line(aes(age, predictions, col = 'goldenrod2'), size = 1.5) +
  scale_color_manual(name = 'Value Type',
                     labels = c('Observed', 'Predicted'),
                     values = c('#56B4E9', '#E69F00')) +
  labs(x = 'Age', y = 'Wage',
       title = paste0('Predictions from polynomial model of degree ', which.min(unlist(errors))))
errors <- list()
for (i in 2:12) {
  step_fit <- lm(wage ~ cut(age, i), data = train)  # step function with i intervals
  pred <- predict(step_fit, newdata = val)
  resid <- val$wage - pred
  errors[[i - 1]] <- sqrt(mean(resid^2))            # errors[[1]] holds the RMSE for 2 cuts
}
tibble(Errors = unlist(errors)) %>%
  mutate(Steps = row_number() + 1) %>%
  ggplot(aes(Steps, Errors)) +
  geom_col(aes(fill = Steps == (which.min(unlist(errors)) + 1))) +
  coord_cartesian(ylim = c(min(unlist(errors)) - 1*sd(unlist(errors)),
                           min(unlist(errors)) + 3*sd(unlist(errors)))) +
  scale_x_continuous(breaks = 2:(length(errors) + 1)) +
  guides(fill = FALSE)
step_fit <- lm(wage ~ cut(age, which.min(unlist(errors)) + 1), data = train)  # + 1: errors[[1]] corresponds to 2 cuts
val %>%
  mutate(predictions = predict(step_fit, val)) %>%
  ggplot(aes(age, wage, col = 'blue')) +
  geom_point() +
  geom_line(aes(age, predictions, col = 'goldenrod2'), size = 1.5) +
  scale_color_manual(name = 'Value Type',
                     labels = c('Observed', 'Predicted'),
                     values = c('#56B4E9', '#E69F00')) +
  labs(x = 'Age', y = 'Wage')
10. This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" " " "*" " " " " " " "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " " " "*" " " " "
## 8 ( 1 ) " " "*" " " " " "*" " " " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3287.5299971 2512.8945110 1.0770517 26.8926378 60.8413480
## Expend Grad.Rate
## 0.2261472 26.1662097
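Part (c) below compares the GAM's test error against val.errors[6], the test-set MSE of the six-variable model from forward selection, but that vector is never computed in this writeup. A minimal sketch of how it could be built, assuming the fit, train, and test objects above (regsubsets has no predict method, so predictions are assembled by hand with model.matrix):

test.mat <- model.matrix(Outstate ~ ., data = College[test, ])  # design matrix for the test rows
val.errors <- rep(NA, 8)
for (i in 1:8) {
  coefi <- coef(fit, id = i)                                # coefficients of the i-variable model
  pred <- test.mat[, names(coefi)] %*% coefi                # manual predictions
  val.errors[i] <- mean((College$Outstate[test] - pred)^2)  # test-set MSE
}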
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
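No GAM code survives in this writeup; the following is a minimal sketch using the gam package, with smoothing splines of 4 degrees of freedom each (an arbitrary choice) on the continuous predictors chosen by forward selection. The name gam_fit matches the object referenced in part (c); Private enters as a plain dummy term since it is a two-level factor.

library(gam)
gam_fit <- gam(Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 4) +
                 s(Expend, 4) + s(Grad.Rate, 4),
               data = College, subset = train)
par(mfrow = c(2, 3))
plot(gam_fit, se = TRUE, col = 'blue')  # one partial-effect panel per term, with standard-error bands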
(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(gam_fit, newdata = College[test, ])  # test holds negative indices, so index College directly
error <- mean((College$Outstate[test] - preds)^2)     # test-set MSE of the GAM
val.errors[6] - error                                 # positive difference favours the GAM
The GAM achieves a lower test-set MSE than the linear model built from the same six predictors, so allowing non-linear terms for these features improves out-of-sample performance.