# Greeshma Ganji
# ISTE 780
# Summer 2023
# PART - I
# 1) Splitting the data set into a training set and a test set.
library(ISLR)
data(College)
set.seed(11)
train <- sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
# 2) Fitting a linear model using least squares on the training set
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mse.lm <- mean((pred.lm - College.test$Apps)^2)
print(paste("The test MSE for the linear model is:", mse.lm))
## [1] "The test MSE for the linear model is: 1026095.92589412"
rmse.lm <- sqrt(mse.lm)
print(paste("The test RMSE for the linear model is:", rmse.lm))
## [1] "The test RMSE for the linear model is: 1012.96393119109"
# 3) Fitting a ridge regression model on the training set, with lambda chosen by cross-validation
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
grid <- 10 ^ seq(4, -2, length = 100)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.01
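# Note that 0.01 is the smallest value in our grid, so the CV-optimal lambda may
# lie below the grid's lower edge (the same holds for the lasso below). A quick
# check, as a sketch that uses glmnet's default lambda path instead of our grid:
cv.ridge.default <- cv.glmnet(train.mat, College.train$Apps, alpha = 0)
cv.ridge.default$lambda.min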
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mse.ridge <- mean((pred.ridge - College.test$Apps)^2)
print(paste("The test MSE for the Ridge model is:", mse.ridge))
## [1] "The test MSE for the Ridge model is: 1026068.65314469"
rmse.ridge <- sqrt(mse.ridge)
print(paste("The test RMSE for the Ridge model is:", rmse.ridge))
## [1] "The test RMSE for the Ridge model is: 1012.95046924551"
# 4) Fitting a lasso model on the training set, with lambda chosen by cross-validation.
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mse.lasso <- mean((pred.lasso - College.test$Apps)^2)
print(paste("The test MSE for the Lasso model is:", mse.lasso))
## [1] "The test MSE for the Lasso model is: 1026035.85789427"
rmse.lasso <- sqrt(mse.lasso)
print(paste("The test RMSE for the Lasso model is:", rmse.lasso))
## [1] "The test RMSE for the Lasso model is: 1012.93428113292"
# The test MSE for the lasso (1026035.86) is marginally lower than for both ridge regression (1026068.65) and least squares (1026095.93).
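# A compact side-by-side view of the three test MSEs, using the objects computed above:
sort(c(lm = mse.lm, ridge = mse.ridge, lasso = mse.lasso))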
predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 37.86520037
## (Intercept) .
## PrivateYes -551.14946609
## Accept 1.74980812
## Enroll -1.36005786
## Top10perc 65.55655577
## Top25perc -22.52640339
## F.Undergrad 0.10181853
## P.Undergrad 0.01789131
## Outstate -0.08706371
## Room.Board 0.15384585
## Books -0.12227313
## Personal 0.16194591
## PhD -14.29638634
## Terminal -1.03118224
## S.F.Ratio 4.47956819
## perc.alumni -0.45456280
## Expend 0.05618050
## Grad.Rate 9.07242834
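# At this lambda the lasso shrinks almost nothing away: every predictor in the
# table above keeps a nonzero coefficient (only the duplicated intercept column
# from model.matrix is zeroed out). A sketch to count the nonzero entries:
lasso.coef <- predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
sum(lasso.coef != 0)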
# 5) Commenting on the results obtained.
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
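# Because the three test MSEs are nearly identical (all about 1.026e6), the three
# test R^2 values are essentially the same, with the lasso marginally best; all
# models predict applications with comparable accuracy. A quick side-by-side view:
c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2)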
###########################################################################################################
# PART - II
###########################################################################################################
# 1) Trying out some of the regression methods (best subset selection, the lasso, and ridge regression) to predict the per-capita crime rate (crim) in the Boston data set
library(MASS)
library(leaps)
data(Boston)
set.seed(1)
# Best subset selection, with the subset size chosen by 10-fold cross-validation
# regsubsets() has no predict() method, so we define one: rebuild the model
# matrix for the new data and multiply it by the coefficients of the id-th model.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    pred <- predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

# Cross-validation selects a 12-variable model, with a CV estimate of the test MSE equal to 41.0345657.
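# The two numbers quoted above can be read off programmatically (a quick sketch):
which.min(mean.cv.errors)  # size of the selected subset
min(mean.cv.errors)        # its CV error estimate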
# Lasso regression
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.out <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda equal to 0.0467489, with a CV estimate of the test MSE equal to 42.134324.
# Ridge regression
cv.out <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda equal to 0.5374992, with a CV estimate of the test MSE equal to 42.9834518.
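# For both cv.glmnet fits above, the quoted lambda and CV MSE can be read straight
# off the fitted object (a quick sketch; cv.out currently holds the ridge fit, and
# the same two lines applied to the lasso fit give its values):
cv.out$lambda.min  # selected lambda
min(cv.out$cvm)    # CV estimate of the test MSE at that lambda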
# 2) Proposing a model
# The best subset selection model is the most suitable choice here: its CV error
# estimate (41.03) is lower than that of the lasso (42.13) and ridge regression (42.98).
# 3) Does your chosen model involve all of the features in the dataset? Why or why not?
# No. Cross-validation chose a 12-variable subset, so one of the 13 predictors is
# dropped. Best subset selection and the lasso retain only the most predictive
# features (ridge regression, by contrast, shrinks coefficients without dropping
# any), which helps guard against overfitting.
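# To produce the final proposed model, one would refit best subset selection on
# the full Boston data and pull out the 12-variable fit (a minimal sketch):
final.fit <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
coef(final.fit, 12)  # coefficients of the proposed 12-variable model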