# Greeshma Ganji
# ISTE 780
# Summer 2023
# PART - I
# 1) Splitting the data set into a training set and a test set.
library(ISLR)
data(College)
set.seed(11)
train <- sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
# 2) Fitting a linear model using least squares on the training set
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mse.lm <- mean((pred.lm - College.test$Apps)^2)
print(paste("The test MSE for the linear model is:", mse.lm))
## [1] "The test MSE for the linear model is: 1026095.92589412"
rmse.lm <- sqrt(mse.lm)
print(paste("The test RMSE for the linear model is:", rmse.lm))
## [1] "The test RMSE for the linear model is: 1012.96393119109"
# 3) Fitting a ridge regression model on the training set, with lambda chosen by cross-validation
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
grid <- 10 ^ seq(4, -2, length = 100)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.01
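# Note that 0.01 is the smallest value in our grid, so the CV-optimal lambda may
# lie below the grid's lower edge (the same holds for the lasso below). A quick
# check, as a sketch that uses glmnet's default lambda path instead of our grid:
cv.ridge.default <- cv.glmnet(train.mat, College.train$Apps, alpha = 0)
cv.ridge.default$lambda.min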
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mse.ridge <- mean((pred.ridge - College.test$Apps)^2)
print(paste("The test MSE for the Ridge model is:", mse.ridge))
## [1] "The test MSE for the Ridge model is: 1026068.65314469"
rmse.ridge <- sqrt(mse.ridge)
print(paste("The test RMSE for the Ridge model is:", rmse.ridge))
## [1] "The test RMSE for the Ridge model is: 1012.95046924551"
# 4) Fitting a lasso model on the training set, with lambda chosen by cross-validation.
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mse.lasso <- mean((pred.lasso - College.test$Apps)^2)
print(paste("The test MSE for the Lasso model is:", mse.lasso))
## [1] "The test MSE for the Lasso model is: 1026035.85789427"
rmse.lasso <- sqrt(mse.lasso)
print(paste("The test RMSE for the Lasso model is:", rmse.lasso))
## [1] "The test RMSE for the Lasso model is: 1012.93428113292"
# The test MSE for the lasso (1026035.86) is marginally lower than for both ridge regression (1026068.65) and least squares (1026095.93).
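# A compact side-by-side view of the three test MSEs, using the objects computed above:
sort(c(lm = mse.lm, ridge = mse.ridge, lasso = mse.lasso))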
predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 37.86520037
## (Intercept) .
## PrivateYes -551.14946609
## Accept 1.74980812
## Enroll -1.36005786
## Top10perc 65.55655577
## Top25perc -22.52640339
## F.Undergrad 0.10181853
## P.Undergrad 0.01789131
## Outstate -0.08706371
## Room.Board 0.15384585
## Books -0.12227313
## Personal 0.16194591
## PhD -14.29638634
## Terminal -1.03118224
## S.F.Ratio 4.47956819
## perc.alumni -0.45456280
## Expend 0.05618050
## Grad.Rate 9.07242834
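# At this lambda the lasso shrinks almost nothing away: every predictor in the
# table above keeps a nonzero coefficient (only the duplicated intercept column
# from model.matrix is zeroed out). A sketch to count the nonzero entries:
lasso.coef <- predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
sum(lasso.coef != 0)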
# 5) Commenting on the results obtained.
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
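# Because the three test MSEs are nearly identical (all about 1.026e6), the three
# test R^2 values are essentially the same, with the lasso marginally best; all
# models predict applications with comparable accuracy. A quick side-by-side view:
c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2)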
###########################################################################################################
# PART - II
###########################################################################################################
# 1) Trying out some of the regression methods (best subset selection, the lasso, and ridge regression) to predict the per-capita crime rate (crim) in the Boston data set
library(MASS)
library(leaps)
data(Boston)
set.seed(1)
# Best subset selection, with the subset size chosen by 10-fold cross-validation
# regsubsets() has no predict() method, so we define one: rebuild the model
# matrix for the new data and multiply it by the coefficients of the id-th model.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    pred <- predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

# Cross-validation selects a 12-variable model, with a CV estimate of the test MSE equal to 41.0345657.
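# The two numbers quoted above can be read off programmatically (a quick sketch):
which.min(mean.cv.errors)  # size of the selected subset
min(mean.cv.errors)        # its CV error estimate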
# Lasso regression
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.out <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda equal to 0.0467489, with a CV estimate of the test MSE equal to 42.134324.
# Ridge regression
cv.out <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.out)

# Cross-validation selects a lambda equal to 0.5374992, with a CV estimate of the test MSE equal to 42.9834518.
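# For both cv.glmnet fits above, the quoted lambda and CV MSE can be read straight
# off the fitted object (a quick sketch; cv.out currently holds the ridge fit, and
# the same two lines applied to the lasso fit give its values):
cv.out$lambda.min  # selected lambda
min(cv.out$cvm)    # CV estimate of the test MSE at that lambda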
# 2) Proposing a model
# The best subset selection model is the most suitable choice here: its CV error
# estimate (41.03) is lower than that of the lasso (42.13) and ridge regression (42.98).
# 3) Does your chosen model involve all of the features in the dataset? Why or why not?
# No. Cross-validation chose a 12-variable subset, so one of the 13 predictors is
# dropped. Best subset selection and the lasso retain only the most predictive
# features (ridge regression, by contrast, shrinks coefficients without dropping
# any), which helps guard against overfitting.
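# To produce the final proposed model, one would refit best subset selection on
# the full Boston data and pull out the 12-variable fit (a minimal sketch):
final.fit <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
coef(final.fit, 12)  # coefficients of the proposed 12-variable model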