library(ISLR2)
library(leaps)

set.seed(1)

# Define parameters
n <- 1000 
p <- 20    

# Generate matrix
X <- matrix(rnorm(n * p), nrow = n, ncol = p)

# Generate beta
beta <- rnorm(p)

# Generate epsilon
epsilon <- rnorm(n, mean = 0, sd = 0.5)

# set y equal to equation
Y <- X %*% beta + epsilon

# create data frame
df <- as.data.frame(X)
df$Y <- Y

# Split into training and test
train_set <- sample(1:n, 100)  
test_set <- setdiff(1:n, train_set)  

# Create training and testing sets
training_set <- df[train_set, ]
testing_set <- df[test_set, ]

# Create model matrix for train
train_matrix <- model.matrix(Y ~ ., data=training_set)

# regsubsets
regfit.best <- regsubsets(Y ~ ., data = training_set, nvmax = 19)
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = training_set, nvmax = 19)
## 20 Variables  (and intercept)
##     Forced in Forced out
## V1      FALSE      FALSE
## V2      FALSE      FALSE
## V3      FALSE      FALSE
## V4      FALSE      FALSE
## V5      FALSE      FALSE
## V6      FALSE      FALSE
## V7      FALSE      FALSE
## V8      FALSE      FALSE
## V9      FALSE      FALSE
## V10     FALSE      FALSE
## V11     FALSE      FALSE
## V12     FALSE      FALSE
## V13     FALSE      FALSE
## V14     FALSE      FALSE
## V15     FALSE      FALSE
## V16     FALSE      FALSE
## V17     FALSE      FALSE
## V18     FALSE      FALSE
## V19     FALSE      FALSE
## V20     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
##           V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11 V12 V13 V14 V15 V16 V17
## 1  ( 1 )  " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 2  ( 1 )  " " " " " " "*" " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 3  ( 1 )  " " " " " " "*" " " " " " " " " "*" " " " " " " " " " " " " " " " "
## 4  ( 1 )  " " " " " " "*" " " " " "*" " " "*" " " " " " " " " " " " " " " " "
## 5  ( 1 )  " " " " " " "*" " " " " "*" " " "*" " " " " " " " " " " " " " " " "
## 6  ( 1 )  " " " " " " "*" " " " " "*" "*" "*" " " " " " " " " " " " " " " " "
## 7  ( 1 )  " " " " " " "*" " " " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 8  ( 1 )  " " " " " " "*" "*" " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 9  ( 1 )  " " " " "*" "*" "*" " " "*" "*" "*" " " "*" " " " " " " " " " " " "
## 10  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " " " " " " "
## 11  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " "*" " " " "
## 12  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" " " " " " " "*" " " " "
## 13  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " " " "*" " " " "
## 14  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" " " " "
## 15  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 16  ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 17  ( 1 ) " " " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 18  ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 19  ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
##           V18 V19 V20
## 1  ( 1 )  " " " " " "
## 2  ( 1 )  " " " " " "
## 3  ( 1 )  "*" " " " "
## 4  ( 1 )  "*" " " " "
## 5  ( 1 )  "*" " " "*"
## 6  ( 1 )  "*" " " "*"
## 7  ( 1 )  "*" " " "*"
## 8  ( 1 )  "*" " " "*"
## 9  ( 1 )  "*" " " "*"
## 10  ( 1 ) "*" " " "*"
## 11  ( 1 ) "*" " " "*"
## 12  ( 1 ) "*" "*" "*"
## 13  ( 1 ) "*" "*" "*"
## 14  ( 1 ) "*" "*" "*"
## 15  ( 1 ) "*" "*" "*"
## 16  ( 1 ) "*" "*" "*"
## 17  ( 1 ) "*" "*" "*"
## 18  ( 1 ) "*" "*" "*"
## 19  ( 1 ) "*" "*" "*"
# Training Set MSE
val.errors <- rep(NA, 19)

for (i in 1:19) {
  coefi <- coef(regfit.best, id = i)
  pred <- train_matrix[, names(coefi), drop = FALSE] %*% coefi
  val.errors[i] <- mean((training_set$Y - pred)^2)
}

plot(val.errors, type = "b", pch = 19, col = "blue",
     xlab = "Number of Predictors", ylab = "Training Set MSE",
     main = "Best Subset Selection - Training MSE")

# Test MSE

test_matrix <- model.matrix(Y~ ., data = testing_set)
test.errors <- rep(NA, 19)

for (i in 1:19) {
  coefi <- coef(regfit.best, id = i)
  pred <- test_matrix[, names(coefi), drop = FALSE] %*% coefi 
  test.errors[i] <- mean((testing_set$Y - pred)^2)
}

plot(test.errors, type = "b", pch = 19, col = "blue",
     xlab = "Number of Predictors", ylab = "Testing Set MSE",
     main = "Best Subset Selection - Test MSE")

#Re-doing how the test and training data is split
set.seed(50)

# Define parameters
n <- 1000 
p <- 20    

# Generate matrix
X <- matrix(rnorm(n * p), nrow = n, ncol = p)

# Generate beta
beta <- rnorm(p)

# Generate epsilon
epsilon <- rnorm(n, mean = 0, sd = .001)

# set y equal to equation
Y <- X %*% beta + epsilon

# create data frame
df <- as.data.frame(X)
df$Y <- Y

# Split into training and test
train_set <- sample(1:n, 200)  
test_set <- setdiff(1:n, train_set)  

# Create training and testing sets
training_set <- df[train_set, ]
testing_set <- df[test_set, ]

# Create model matrix for train
train_matrix <- model.matrix(Y ~ ., data=training_set)

# regsubsets
regfit.best <- regsubsets(Y ~ ., data = training_set, nvmax = 19)
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = training_set, nvmax = 19)
## 20 Variables  (and intercept)
##     Forced in Forced out
## V1      FALSE      FALSE
## V2      FALSE      FALSE
## V3      FALSE      FALSE
## V4      FALSE      FALSE
## V5      FALSE      FALSE
## V6      FALSE      FALSE
## V7      FALSE      FALSE
## V8      FALSE      FALSE
## V9      FALSE      FALSE
## V10     FALSE      FALSE
## V11     FALSE      FALSE
## V12     FALSE      FALSE
## V13     FALSE      FALSE
## V14     FALSE      FALSE
## V15     FALSE      FALSE
## V16     FALSE      FALSE
## V17     FALSE      FALSE
## V18     FALSE      FALSE
## V19     FALSE      FALSE
## V20     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
##           V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11 V12 V13 V14 V15 V16 V17
## 1  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " " " " " " " "*" " " " "
## 2  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " " " " " " " "*" " " " "
## 3  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " "*" " " " " "*" " " " "
## 4  ( 1 )  " " " " " " " " " " "*" " " " " " " " " " " "*" " " " " "*" " " " "
## 5  ( 1 )  " " " " " " " " " " "*" " " " " " " " " "*" "*" " " " " "*" " " " "
## 6  ( 1 )  " " " " " " " " " " "*" " " " " " " " " "*" "*" " " " " "*" " " "*"
## 7  ( 1 )  " " " " " " "*" " " "*" " " " " " " " " "*" "*" " " " " "*" " " "*"
## 8  ( 1 )  " " " " " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 9  ( 1 )  " " "*" " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 10  ( 1 ) " " "*" " " "*" " " "*" " " " " " " " " "*" "*" " " "*" "*" " " "*"
## 11  ( 1 ) " " "*" " " "*" " " "*" "*" " " " " " " "*" "*" " " "*" "*" " " "*"
## 12  ( 1 ) " " "*" " " "*" " " "*" "*" " " " " " " "*" "*" " " "*" "*" " " "*"
## 13  ( 1 ) " " "*" " " "*" " " "*" "*" "*" " " " " "*" "*" " " "*" "*" " " "*"
## 14  ( 1 ) " " "*" " " "*" " " "*" "*" "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 15  ( 1 ) " " "*" " " "*" " " "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 16  ( 1 ) "*" "*" " " "*" " " "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 17  ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 18  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 19  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
##           V18 V19 V20
## 1  ( 1 )  " " " " " "
## 2  ( 1 )  " " " " "*"
## 3  ( 1 )  " " " " "*"
## 4  ( 1 )  " " " " "*"
## 5  ( 1 )  " " " " "*"
## 6  ( 1 )  " " " " "*"
## 7  ( 1 )  " " " " "*"
## 8  ( 1 )  " " " " "*"
## 9  ( 1 )  " " " " "*"
## 10  ( 1 ) " " "*" "*"
## 11  ( 1 ) " " "*" "*"
## 12  ( 1 ) "*" "*" "*"
## 13  ( 1 ) "*" "*" "*"
## 14  ( 1 ) "*" "*" "*"
## 15  ( 1 ) "*" "*" "*"
## 16  ( 1 ) "*" "*" "*"
## 17  ( 1 ) "*" "*" "*"
## 18  ( 1 ) "*" "*" "*"
## 19  ( 1 ) "*" "*" "*"
# Training Set MSE
val.errors <- rep(NA, 19)

for (i in 1:19) {
  coefi <- coef(regfit.best, id = i)
  pred <- train_matrix[, names(coefi), drop = FALSE] %*% coefi
  val.errors[i] <- mean((training_set$Y - pred)^2)
}

plot(val.errors, type = "b", pch = 19, col = "blue",
     xlab = "Number of Predictors", ylab = "Training Set MSE",
     main = "Best Subset Selection - Training MSE")

# Test MSE

test_matrix <- model.matrix(Y~ ., data = testing_set)
test.errors <- rep(NA, 19)

for (i in 1:19) {
  coefi <- coef(regfit.best, id = i)
  pred <- test_matrix[, names(coefi), drop = FALSE] %*% coefi 
  test.errors[i] <- mean((testing_set$Y - pred)^2)
}

plot(test.errors, type = "b", pch = 19, col = "blue",
     xlab = "Number of Predictors", ylab = "Testing Set MSE",
     main = "Best Subset Selection - Test MSE")

which.min(test.errors)
## [1] 19
#part g