# Greeshma Ganji
# ISTE 780
# Summer 2023

# PART - I

# 1) Splitting the data set into a training set and a test set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
data(College)
set.seed(11)
train <- sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]

# 2) Fitting a linear model using least squares on the training set
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mse.lm <- mean((pred.lm - College.test$Apps)^2)
print(paste("The test MSE for the linear model is:", mse.lm))
## [1] "The test MSE for the linear model is: 1026095.92589412"
rmse.lm <- sqrt(mse.lm)
print(paste("The test RMSE for the linear model is:", rmse.lm))
## [1] "The test RMSE for the linear model is: 1012.96393119109"
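# To put the RMSE in context (a side check, not part of the required output),
# compare it with the scale and spread of the response in the test set:
mean(College.test$Apps)   # average number of applications in the test set
sd(College.test$Apps)     # standard deviation of Apps in the test set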
# 3) Fitting a ridge regression model on the training set, with lambda chosen by cross-validation.
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
grid <- 10 ^ seq(4, -2, length = 100)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.01
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mse.ridge <- mean((pred.ridge - College.test$Apps)^2)
print(paste("The test MSE for the Ridge model is:", mse.ridge))
## [1] "The test MSE for the Ridge model is: 1026024.47410069"
rmse.ridge <- sqrt(mse.ridge)
print(paste("The test RMSE for the Ridge model is:", rmse.ridge))
## [1] "The test RMSE for the Ridge model is: 1012.92866190107"
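# For a side-by-side look with the lasso coefficients printed below, the ridge
# coefficients at the selected lambda can be extracted the same way (a side
# check; ridge shrinks coefficients but never sets them exactly to zero):
predict(fit.ridge, s = bestlam.ridge, type = "coefficients")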
# 4) Fitting a lasso model on the training set, with lambda chosen by cross-validation.
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid)

cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mse.lasso <- mean((pred.lasso - College.test$Apps)^2)
print(paste("The test MSE for the Lasso model is:", mse.lasso))
## [1] "The test MSE for the Lasso model is: 1025747.28886339"
rmse.lasso <- sqrt(mse.lasso)
print(paste("The test RMSE for the Lasso model is:", rmse.lasso))
## [1] "The test RMSE for the Lasso model is: 1012.79182898727"
# The test MSE for the lasso is slightly lower than for both least squares and ridge regression, though all three are nearly identical.
predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)   36.48617213
## (Intercept)    .         
## PrivateYes  -551.21004144
## Accept         1.74942382
## Enroll        -1.35724665
## Top10perc     65.52205560
## Top25perc    -22.49751046
## F.Undergrad    0.10145747
## P.Undergrad    0.01799015
## Outstate      -0.08704867
## Room.Board     0.15393509
## Books         -0.12218268
## Personal       0.16193433
## PhD          -14.29636462
## Terminal      -1.03093119
## S.F.Ratio      4.48649882
## perc.alumni   -0.45435363
## Expend         0.05618901
## Grad.Rate      9.06970922
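# With the selected lambda this small (0.01), the lasso leaves essentially all
# coefficients nonzero. A quick sketch to count them:
lasso.coef <- predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
sum(lasso.coef != 0)   # number of nonzero entries (including the intercept)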
# 5) Commenting on the results obtained.
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
# The three test MSEs differ by less than 0.05%, so the test R-squared values
# are essentially identical: all three models predict Apps about equally well,
# and regularization buys almost nothing over least squares on this split.
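# A compact side-by-side comparison built from the quantities computed above
# (a convenience sketch, not required by the assignment):
results <- data.frame(
  model = c("least squares", "ridge", "lasso"),
  MSE = c(mse.lm, mse.ridge, mse.lasso),
  RMSE = c(rmse.lm, rmse.ridge, rmse.lasso),
  R2 = c(lm.r2, ridge.r2, lasso.r2)
)
print(results)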


###########################################################################################################
# PART - II
###########################################################################################################

# 1) Trying out some of the regression methods

library(MASS)
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
data(Boston)
set.seed(1)
# Best subset selection
# regsubsets() has no predict() method, so define one: rebuild the design
# matrix from the original formula and multiply by the coefficients of the
# id-th best model.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])   # recover the formula from the call
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id = id)         # coefficients of the id-variable model
  xvars <- names(coefi)
  mat[, xvars] %*% coefi                 # predictions
}

k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    pred <- predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

# Cross-validation selects the 12-variable model, with a CV estimate of the test MSE equal to 41.0345657.
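# The selected size and its CV error can be read off mean.cv.errors directly
# (a verification sketch for the numbers quoted above):
which.min(mean.cv.errors)   # number of variables with the lowest CV error
min(mean.cv.errors)         # CV estimate of the test MSE at that size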

# Lasso regression
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.out <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda of 0.0467489, with a CV estimate of the test MSE equal to 42.134324.
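# The quoted values come straight from the cv.glmnet object (a verification
# sketch):
cv.out$lambda.min                                # selected lambda
cv.out$cvm[cv.out$lambda == cv.out$lambda.min]   # CV MSE at that lambda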

# Ridge regression
cv.out <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda of 0.5374992, with a CV estimate of the test MSE equal to 42.9834518.
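# The same check for the ridge fit (note cv.out now holds the ridge results):
cv.out$lambda.min
cv.out$cvm[cv.out$lambda == cv.out$lambda.min]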

# 2) Proposing a model.
# The best subset selection model is the most suitable of the three, because it has the lowest
# cross-validation error (41.03, vs. 42.13 for the lasso and 42.98 for ridge regression).

# 3) Does your chosen model involve all of the features in the data set? Why or why not?
# No. Cross-validation chose the 12-variable model, so best subset selection drops one of the 13 available predictors.
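# To see which predictor the 12-variable model drops, best subset selection
# can be refit on the full data set and the chosen model inspected (a
# follow-up sketch; the exhaustive search itself does not depend on the CV
# folds):
best.full <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
coef(best.full, id = 12)   # coefficients of the 12-variable model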