library(ISLR2)
library(leaps)
set.seed(1)
# Define parameters
n <- 1000
p <- 20
# Generate matrix
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
# Generate beta
beta <- rnorm(p)
# Generate epsilon
epsilon <- rnorm(n, mean = 0, sd = 0.5)
# set y equal to equation
Y <- X %*% beta + epsilon
# create data frame
df <- as.data.frame(X)
df$Y <- Y
# Split into training and test
train_set <- sample(1:n, 100)
test_set <- setdiff(1:n, train_set)
# Create training and testing sets
training_set <- df[train_set, ]
testing_set <- df[test_set, ]
# Create model matrix for train
train_matrix <- model.matrix(Y ~ ., data=training_set)
# regsubsets
regfit.best <- regsubsets(Y ~ ., data = training_set, nvmax = 19)
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = training_set, nvmax = 19)
## 20 Variables (and intercept)
## Forced in Forced out
## V1 FALSE FALSE
## V2 FALSE FALSE
## V3 FALSE FALSE
## V4 FALSE FALSE
## V5 FALSE FALSE
## V6 FALSE FALSE
## V7 FALSE FALSE
## V8 FALSE FALSE
## V9 FALSE FALSE
## V10 FALSE FALSE
## V11 FALSE FALSE
## V12 FALSE FALSE
## V13 FALSE FALSE
## V14 FALSE FALSE
## V15 FALSE FALSE
## V16 FALSE FALSE
## V17 FALSE FALSE
## V18 FALSE FALSE
## V19 FALSE FALSE
## V20 FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 ( 1 ) " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " " "*" " " "*" " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " " "*" " " "*" " " " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " "*" " " " " "*" "*" "*" " " " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " "*" " " " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " "*" "*" " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 9 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 10 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " " " " " " "
## 11 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " "*" " " " "
## 12 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " "*" " " " "
## 13 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " " " "*" " " " "
## 14 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" " " " "
## 15 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 16 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 17 ( 1 ) " " " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 18 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 19 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## V18 V19 V20
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) "*" " " " "
## 4 ( 1 ) "*" " " " "
## 5 ( 1 ) "*" " " "*"
## 6 ( 1 ) "*" " " "*"
## 7 ( 1 ) "*" " " "*"
## 8 ( 1 ) "*" " " "*"
## 9 ( 1 ) "*" " " "*"
## 10 ( 1 ) "*" " " "*"
## 11 ( 1 ) "*" " " "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
## 18 ( 1 ) "*" "*" "*"
## 19 ( 1 ) "*" "*" "*"
# Training Set MSE
val.errors <- rep(NA, 19)
for (i in 1:19) {
coefi <- coef(regfit.best, id = i)
pred <- train_matrix[, names(coefi), drop = FALSE] %*% coefi
val.errors[i] <- mean((training_set$Y - pred)^2)
}
plot(val.errors, type = "b", pch = 19, col = "blue",
xlab = "Number of Predictors", ylab = "Training Set MSE",
main = "Best Subset Selection - Training MSE")

# Test MSE
test_matrix <- model.matrix(Y~ ., data = testing_set)
test.errors <- rep(NA, 19)
for (i in 1:19) {
coefi <- coef(regfit.best, id = i)
pred <- test_matrix[, names(coefi), drop = FALSE] %*% coefi
test.errors[i] <- mean((testing_set$Y - pred)^2)
}
plot(test.errors, type = "b", pch = 19, col = "blue",
xlab = "Number of Predictors", ylab = "Testing Set MSE",
main = "Best Subset Selection - Test MSE")

#Re-doing how the test and training data is split
set.seed(50)
# Define parameters
n <- 1000
p <- 20
# Generate matrix
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
# Generate beta
beta <- rnorm(p)
# Generate epsilon
epsilon <- rnorm(n, mean = 0, sd = .001)
# set y equal to equation
Y <- X %*% beta + epsilon
# create data frame
df <- as.data.frame(X)
df$Y <- Y
# Split into training and test
train_set <- sample(1:n, 200)
test_set <- setdiff(1:n, train_set)
# Create training and testing sets
training_set <- df[train_set, ]
testing_set <- df[test_set, ]
# Create model matrix for train
train_matrix <- model.matrix(Y ~ ., data=training_set)
# regsubsets
regfit.best <- regsubsets(Y ~ ., data = training_set, nvmax = 19)
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = training_set, nvmax = 19)
## 20 Variables (and intercept)
## Forced in Forced out
## V1 FALSE FALSE
## V2 FALSE FALSE
## V3 FALSE FALSE
## V4 FALSE FALSE
## V5 FALSE FALSE
## V6 FALSE FALSE
## V7 FALSE FALSE
## V8 FALSE FALSE
## V9 FALSE FALSE
## V10 FALSE FALSE
## V11 FALSE FALSE
## V12 FALSE FALSE
## V13 FALSE FALSE
## V14 FALSE FALSE
## V15 FALSE FALSE
## V16 FALSE FALSE
## V17 FALSE FALSE
## V18 FALSE FALSE
## V19 FALSE FALSE
## V20 FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" " " " " "*" " " " "
## 4 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " "*" " " " " "*" " " " "
## 5 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*" " " " " "*" " " " "
## 6 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) " " " " " " "*" " " "*" " " " " " " " " "*" "*" " " " " "*" " " "*"
## 8 ( 1 ) " " " " " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 9 ( 1 ) " " "*" " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 10 ( 1 ) " " "*" " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 11 ( 1 ) " " "*" " " "*" " " "*" "*" " " " " " " "*" "*" " " "*" "*" " " "*"
## 12 ( 1 ) " " "*" " " "*" " " "*" "*" " " " " " " "*" "*" " " "*" "*" " " "*"
## 13 ( 1 ) " " "*" " " "*" " " "*" "*" "*" " " " " "*" "*" " " "*" "*" " " "*"
## 14 ( 1 ) " " "*" " " "*" " " "*" "*" "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" " " "*" " " "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## V18 V19 V20
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " "*"
## 3 ( 1 ) " " " " "*"
## 4 ( 1 ) " " " " "*"
## 5 ( 1 ) " " " " "*"
## 6 ( 1 ) " " " " "*"
## 7 ( 1 ) " " " " "*"
## 8 ( 1 ) " " " " "*"
## 9 ( 1 ) " " " " "*"
## 10 ( 1 ) " " "*" "*"
## 11 ( 1 ) " " "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
## 18 ( 1 ) "*" "*" "*"
## 19 ( 1 ) "*" "*" "*"
# Training Set MSE
val.errors <- rep(NA, 19)
for (i in 1:19) {
coefi <- coef(regfit.best, id = i)
pred <- train_matrix[, names(coefi), drop = FALSE] %*% coefi
val.errors[i] <- mean((training_set$Y - pred)^2)
}
plot(val.errors, type = "b", pch = 19, col = "blue",
xlab = "Number of Predictors", ylab = "Training Set MSE",
main = "Best Subset Selection - Training MSE")

# Test MSE
test_matrix <- model.matrix(Y~ ., data = testing_set)
test.errors <- rep(NA, 19)
for (i in 1:19) {
coefi <- coef(regfit.best, id = i)
pred <- test_matrix[, names(coefi), drop = FALSE] %*% coefi
test.errors[i] <- mean((testing_set$Y - pred)^2)
}
plot(test.errors, type = "b", pch = 19, col = "blue",
xlab = "Number of Predictors", ylab = "Testing Set MSE",
main = "Best Subset Selection - Test MSE")

which.min(test.errors)
## [1] 19
#part g