require(ISLR)
## Loading required package: ISLR
require(MASS)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.0.5
require(descr)
## Loading required package: descr
attach(Smarket)
freq(Direction)
## Direction
## Frequency Percent
## Down 602 48.16
## Up 648 51.84
## Total 1250 100.00
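The class frequencies above give a useful baseline: a naive rule that always predicts Up would be correct about 51.8% of the time (648/1250), which is the benchmark any classifier fit below should beat. A quick check, using the attached Direction column:
mean(Direction == "Up")  # 648/1250 = 0.5184, accuracy of always predicting "Up"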
train = Year<2005
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket, subset=Year<2005)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year <
## 2005)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(lda.fit, col="dodgerblue")
Smarket.2005=subset(Smarket,Year==2005)
lda.pred=predict(lda.fit,Smarket.2005)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
Direction.2005=Smarket$Direction[!train]
table(lda.class,Direction.2005)
## Direction.2005
## lda.class Down Up
## Down 35 35
## Up 76 106
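From the confusion matrix, the overall accuracy on the 2005 test data can be computed by hand; it matches the mean() check further below.
(35 + 106) / (35 + 35 + 76 + 106)  # correct predictions / 252 test days = 0.5595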
data.frame(lda.pred)[1:5,]
## class posterior.Down posterior.Up LD1
## 999 Up 0.4901792 0.5098208 0.08293096
## 1000 Up 0.4792185 0.5207815 0.59114102
## 1001 Up 0.4668185 0.5331815 1.16723063
## 1002 Up 0.4740011 0.5259989 0.83335022
## 1003 Up 0.4927877 0.5072123 -0.03792892
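The predicted class is simply whichever posterior probability is larger. Applying the 50% threshold by hand should reproduce the row totals of the confusion matrix above (70 days predicted Down, 182 predicted Up).
sum(lda.pred$posterior[, "Down"] >= 0.5)  # days classified as Down (should be 70)
sum(lda.pred$posterior[, "Down"] < 0.5)   # days classified as Up (should be 182)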
table(lda.pred$class,Smarket.2005$Direction)
##
## Down Up
## Down 35 35
## Up 76 106
mean(lda.pred$class==Smarket.2005$Direction)
## [1] 0.5595238
From the three methods (best subset, forward stepwise, and backward stepwise):
a. Which of the three models with k predictors has the smallest training RSS?
b. Which of the three models with k predictors has the smallest test RSS?
Best subset selection has the smallest training RSS because it considers all possible models of each size, whereas forward and backward stepwise selection may miss the best model if they commit to the wrong predictors early on. For test RSS there is no such guarantee: any of the three methods could yield the smallest test RSS, since the exhaustive search that minimizes training RSS may also overfit. A hybrid approach that combines forward and backward steps may achieve an RSS close to that of best subset selection. The drawback of best subset selection, which can make the other methods necessary even though they are not guaranteed to find the optimal model, is that with a large number of predictors it becomes computationally difficult (and eventually infeasible) because of the number of models that must be fit and compared, as the count below illustrates.
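To make the computational point concrete: best subset selection examines all 2^p models, while a forward or backward stepwise search fits roughly 1 + p(p+1)/2 models. For example, with p = 20 predictors:
p <- 20
2^p                  # models examined by best subset selection: 1,048,576
1 + p * (p + 1) / 2  # models examined by a stepwise search: 211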
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100)
data <- data.frame(x, eps)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data1 <- mutate(data, y = 4 + 9 * x + 2 * I(x^2) + I(x^3) + eps)
Plot x and y.
plot(data1$x, data1$y, xlab = "X", ylab = "Y")
Use the regsubsets() function from the leaps package:
library(leaps)
fit1 <- regsubsets(y~poly(x,10,raw=T), data=data1, nvmax=10)
sum1 <- summary(fit1)
What is the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained.
BIC is minimized by the three-variable model, while Cp and adjusted R^2 very slightly favor the four-variable model; the differences are negligible, so the best model is the cubic model with x, x^2, and x^3. The coefficients of this model (fit below with orthogonal polynomials) are approximately 6.77 for the intercept and 104.76, 24.38, and 15.24 for the three polynomial terms.
results1 <- data.frame(sum1$cp, sum1$bic, sum1$adjr2)
results1
## sum1.cp sum1.bic sum1.adjr2
## 1 873.3368122 -247.1841 0.9222139
## 2 223.7534690 -354.1125 0.9742384
## 3 2.1859433 -471.1079 0.9922844
## 4 0.6067483 -470.3769 0.9924995
## 5 2.1782005 -466.2458 0.9924556
## 6 3.9955812 -461.8434 0.9923899
## 7 5.7869063 -457.4704 0.9923250
## 8 7.1694092 -453.5553 0.9922940
## 9 9.0016690 -449.1384 0.9922231
## 10 11.0000000 -444.5351 0.9921358
plot(sum1$cp, main = "CP Plot", xlab = "Model #", ylab = "CP")
plot(sum1$bic, main = "BIC Plot", xlab = "Model #", ylab = "BIC")
plot(sum1$adjr2, main = "Adjusted R^2 Plot", xlab = "Model #", ylab = "Adjusted R^2")
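The model size favored by each criterion, and the raw-polynomial coefficients of the selected three-variable model, can also be read directly from the regsubsets summary; a quick cross-check (the values follow from the table above):
which.min(sum1$cp)     # Cp is minimized at the 4-variable model
which.min(sum1$bic)    # BIC is minimized at the 3-variable model
which.max(sum1$adjr2)  # adjusted R^2 is maximized at the 4-variable model
coef(fit1, 3)          # coefficients of the selected 3-variable (cubic) model on the raw scale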
fit2 <- lm(y ~ poly(x, 3), data1)
summary(fit2)
##
## Call:
## lm(formula = y ~ poly(x, 3), data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9765 -0.6302 -0.1227 0.5545 2.2843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.77392 0.09626 70.37 <2e-16 ***
## poly(x, 3)1 104.75971 0.96263 108.83 <2e-16 ***
## poly(x, 3)2 24.38056 0.96263 25.33 <2e-16 ***
## poly(x, 3)3 15.23729 0.96263 15.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9626 on 96 degrees of freedom
## Multiple R-squared: 0.9925, Adjusted R-squared: 0.9923
## F-statistic: 4245 on 3 and 96 DF, p-value: < 2.2e-16
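Because poly(x, 3) uses orthogonal polynomials by default, the coefficients above are not on the scale of the simulated model y = 4 + 9x + 2x^2 + x^3 + eps. As a sanity check, refitting with raw polynomial terms (fit2.raw is an illustrative name) should give coefficients close to the generating values 4, 9, 2, and 1:
fit2.raw <- lm(y ~ x + I(x^2) + I(x^3), data = data1)
coef(fit2.raw)  # intercept and coefficients on x, x^2, x^3, on the original scale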
library(MASS)
fit3 <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data1)
Forward selection starts with no predictors in the model, iteratively adds the most contributive predictor, and stops when no addition improves the selection criterion (here, AIC). Note that the call below hands stepAIC() the full model fit3, so the forward search has nothing to add; a sketch of a forward search that starts from the intercept-only model follows the output.
step.forward <- stepAIC(fit3, direction = "forward",
trace = FALSE)
summary(step.forward)
##
## Call:
## lm(formula = y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) +
## I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9774 -0.5895 -0.1238 0.4923 2.1505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.17283 0.19971 20.894 <2e-16 ***
## x 9.51409 0.59009 16.123 <2e-16 ***
## I(x^2) 0.86854 1.29174 0.672 0.503
## I(x^3) 0.06886 1.68567 0.041 0.968
## I(x^4) 1.90383 2.14977 0.886 0.378
## I(x^5) 0.55110 1.35654 0.406 0.686
## I(x^6) -1.26499 1.31956 -0.959 0.340
## I(x^7) -0.15569 0.39731 -0.392 0.696
## I(x^8) 0.31987 0.32511 0.984 0.328
## I(x^9) 0.01628 0.03817 0.426 0.671
## I(x^10) -0.02690 0.02749 -0.979 0.330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9719 on 89 degrees of freedom
## Multiple R-squared: 0.9929, Adjusted R-squared: 0.9921
## F-statistic: 1250 on 10 and 89 DF, p-value: < 2.2e-16
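Because stepAIC() was started from the full model fit3, direction = "forward" has nothing to add and the output above is simply the full ten-term fit. A forward search as described earlier would start from an intercept-only model and take the full formula as the upper scope; a minimal sketch (fit0 and step.forward2 are illustrative names):
fit0 <- lm(y ~ 1, data = data1)  # intercept-only starting model
step.forward2 <- stepAIC(fit0, direction = "forward",
                         scope = list(lower = ~ 1, upper = formula(fit3)),
                         trace = FALSE)
summary(step.forward2)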
Backward selection (or backward elimination) starts with all predictors in the model (the full model), iteratively removes the least contributive predictor, and stops when removing any remaining predictor would no longer improve the AIC.
step.backward <- stepAIC(fit3, direction = "backward",
trace = FALSE)
summary(step.backward)
##
## Call:
## lm(formula = y ~ x + I(x^4) + I(x^5) + I(x^6) + I(x^8) + I(x^10),
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9874 -0.5725 -0.1219 0.5213 2.1722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.26752 0.13079 32.628 < 2e-16 ***
## x 9.89354 0.15010 65.911 < 2e-16 ***
## I(x^4) 3.06370 0.61180 5.008 2.61e-06 ***
## I(x^5) 0.18397 0.01403 13.113 < 2e-16 ***
## I(x^6) -1.78860 0.51582 -3.468 0.000797 ***
## I(x^8) 0.41326 0.13715 3.013 0.003330 **
## I(x^10) -0.03249 0.01155 -2.812 0.005998 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9709 on 93 degrees of freedom
## Multiple R-squared: 0.9926, Adjusted R-squared: 0.9922
## F-statistic: 2087 on 6 and 93 DF, p-value: < 2.2e-16
The results from forward and backward stepwise selection differ from those given by best subset selection, which is not unusual. Since the methods search the space of models in different ways, it is not surprising that they end up with different final models.
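One simple way to put the competing fits side by side is to compare them on AIC, since both are fit to the same data1; a minimal sketch:
AIC(fit2, step.backward)  # lower AIC indicates the better trade-off of fit and complexity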