# Greeshma Ganji
# ISTE 780
# Summer 2023
# PART - I
# 1) Splitting the data set into a training set and a test set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
data(College)
set.seed(11)
train <- sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
# 2) Fitting a linear model using least squares on the training set
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mse.lm <- mean((pred.lm - College.test$Apps)^2)
print(paste("The test MSE for the linear model is:", mse.lm))
## [1] "The test MSE for the linear model is: 1026095.92589412"
rmse.lm <- sqrt(mse.lm)
print(paste("The test RMSE for the linear model is:", rmse.lm))
## [1] "The test RMSE for the linear model is: 1012.96393119109"
# 3) Fitting a ridge regression model on the training set, with lambda chosen by cross-validation
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
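# A grid of 100 candidate lambda values, log-spaced from 10^4 down to 10^-2.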
grid <- 10 ^ seq(4, -2, length = 100)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.01
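# A quick sanity check (minimal sketch; cv.ridge.auto is a name introduced
# here): the selected lambda sits at the lower boundary of our grid, so
# cross-validation favors an almost unpenalized fit. Letting cv.glmnet pick
# its own lambda sequence shows whether a wider grid would change the choice.
cv.ridge.auto <- cv.glmnet(train.mat, College.train$Apps, alpha = 0)
cv.ridge.auto$lambda.min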
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mse.ridge <- mean((pred.ridge - College.test$Apps)^2)
print(paste("The test MSE for the Ridge model is:", mse.ridge))
## [1] "The test MSE for the Ridge model is: 1026024.47410069"
rmse.ridge <- sqrt(mse.ridge)
print(paste("The test RMSE for the Ridge model is:", rmse.ridge))
## [1] "The test RMSE for the Ridge model is: 1012.92866190107"
# 4) Fitting a lasso model on the training set, with lambda chosen by cross-validation
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid)
cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mse.lasso <- mean((pred.lasso - College.test$Apps)^2)
print(paste("The test MSE for the Lasso model is:", mse.lasso))
## [1] "The test MSE for the Lasso model is: 1025747.28886339"
rmse.lasso <- sqrt(mse.lasso)
print(paste("The test RMSE for the Lasso model is:", rmse.lasso))
## [1] "The test RMSE for the Lasso model is: 1012.79182898727"
# The test MSE for the lasso is slightly lower than for both ridge regression and least squares, though all three are nearly identical.
predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 36.48617213
## (Intercept) .
## PrivateYes -551.21004144
## Accept 1.74942382
## Enroll -1.35724665
## Top10perc 65.52205560
## Top25perc -22.49751046
## F.Undergrad 0.10145747
## P.Undergrad 0.01799015
## Outstate -0.08704867
## Room.Board 0.15393509
## Books -0.12218268
## Personal 0.16193433
## PhD -14.29636462
## Terminal -1.03093119
## S.F.Ratio 4.48649882
## perc.alumni -0.45435363
## Expend 0.05618901
## Grad.Rate 9.06970922
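# With lambda this small, the lasso shrinks but barely selects: the printout
# above shows essentially every coefficient still non-zero. A minimal sketch
# to count the non-zero entries (lasso.coef is a name introduced here):
lasso.coef <- predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
sum(lasso.coef != 0)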
# 5) Commenting on the results obtained.
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
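# The R-squared values above are computed but never displayed; a minimal
# sketch to print them:
print(paste("Test R-squared, linear model:", lm.r2))
print(paste("Test R-squared, ridge:", ridge.r2))
print(paste("Test R-squared, lasso:", lasso.r2))
# Since the three test MSEs are nearly identical (all about 1.026e6), the
# three R-squared values are nearly identical as well: each model explains
# the same share of the variance in Apps, and regularization buys little here.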
###########################################################################################################
# PART - II
###########################################################################################################
# 1) Trying out some of the regression methods
library(MASS)
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
data(Boston)
set.seed(1)
# Best subset selection
# Helper: a predict() method for regsubsets objects (leaps does not supply one).
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])  # formula used in the original fit
  mat <- model.matrix(form, newdata)    # design matrix for the new data
  coefi <- coef(object, id = id)        # coefficients of the id-variable model
  xvars <- names(coefi)
  mat[, xvars] %*% coefi                # predictions from the selected subset
}
k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
# 10-fold cross-validation: for each fold, fit best subsets on the other
# nine folds and record the held-out MSE for every model size.
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    pred <- predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

# Cross-validation selects a 12-variable model, with a CV estimate of the test MSE equal to 41.0345657.
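# A minimal sketch to read the selected size and its CV error directly from
# mean.cv.errors instead of the plot:
which.min(mean.cv.errors)
mean.cv.errors[which.min(mean.cv.errors)]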
# Lasso regression
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.out <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)

# Cross-validation selects lambda = 0.0467489; the corresponding CV estimate of the test MSE is 42.134324.
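# A minimal sketch to pull the quoted figures out of the cv.glmnet object
# (cvm holds the mean cross-validated error for each lambda); the same two
# lines apply to the ridge fit below:
cv.out$lambda.min
cv.out$cvm[which(cv.out$lambda == cv.out$lambda.min)]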
# Ridge regression
cv.out <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.out)

# Cross-validation selects lambda = 0.5374992; the corresponding CV estimate of the test MSE is 42.9834518.
# 2) Proposing a model
# The best subset selection model is the most suitable: it attains the lowest cross-validation error of the three methods (41.03, versus 42.13 for the lasso and 42.98 for ridge).
# 3) Does your chosen model involve all of the features in the data set? Why or why not?
# No. The data set has 13 predictors of crim, and the model chosen by cross-validated best subset selection keeps only 12 of them.
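# A minimal sketch (refitting best subsets on the full data set, the usual
# follow-up once CV has chosen the size; best.full is a name introduced
# here) to see which 12 predictors the chosen model keeps:
best.full <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
coef(best.full, 12)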