rm(list=ls())

#Library######
library(readr)
library(tidyverse) 
library(dplyr) 
library(DT) 
library(RColorBrewer) 
library(rio) 
library(dbplyr) 
library(psych) 
library(FSA) 
library(knitr)
library(plotrix)
library(kableExtra)
library(ISLR)
library(data.table)
library(magrittr)
library(ggplot2)
library(summarytools)
library(hrbrthemes)
library(cowplot)
library(reshape2)
library(scales)
library(zoo)
library(corrplot)
library(lares)
library(leaps)
library(MASS)
library(car)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(tidyr)
library(readxl)
library(forcats)
library(caret)
library(pROC)
library(glmnet)
library(Metrics)


#Dataset
Project4_data = data.frame(College) #College data from the ISLR package


Regularization


Introduction

Regularization techniques reduce model error by fitting a function adequately to the training data while preventing the coefficients from overfitting it. The multiple linear regression formula is:

\(Y = \beta_0 + \beta_1X_1 + \beta_2X_2 +...+\beta_nX_n + \epsilon\)

Where \(Y\) is the response variable and the \(\beta\) terms are the coefficients of the predictors. A model can also learn the noise in the training data, leaving coefficients that are unreliable and do not generalize to new data. Overfitting occurs when a dataset has a small number of observations, when multicollinearity exists, or when too many predictors are included in the model, and it should be avoided whenever feasible. Regularization prevents overfitting by penalizing large regression coefficients (Gero, 2023).
The fitting process uses a loss function, the residual sum of squares (RSS), and the coefficients are chosen to minimize it.
\[RSS = \sum^{n}_{i=1}\Big(y_i - \beta_0 - \sum^{p}_{j=1}\beta_jx_{ij}\Big)^2\]
Task 1: Splitting dataset into Train and Test
#table representation of the statistical description
describe(Project4_data)%>% 
  kable(caption = "<center>Table 1: Statistical Description of College Dataset</center>",
        align = "c")%>%
  kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
           general_title = "Data Source:",
           symbol = c("Categorical Variable"))
Table 1: Statistical Description of College Dataset
vars n mean sd median trimmed mad min max range skew kurtosis se
Private* 1 777 1.727156 0.4457084 2.0 1.783307 0.00000 1.0 2.0 1.0 -1.0179902 -0.9649328 0.0159897
Apps 2 777 3001.638353 3870.2014844 1558.0 2193.008026 1463.32620 81.0 48094.0 48013.0 3.7093849 26.5184313 138.8427049
Accept 3 777 2018.804376 2451.1139710 1110.0 1510.287319 1008.16800 72.0 26330.0 26258.0 3.4045428 18.7526403 87.9332239
Enroll 4 777 779.972973 929.1761901 434.0 575.953451 354.34140 35.0 6392.0 6357.0 2.6800857 8.7368340 33.3340101
Top10perc 5 777 27.558559 17.6403644 23.0 25.130016 13.34340 1.0 96.0 95.0 1.4077650 2.1728286 0.6328445
Top25perc 6 777 55.796654 19.8047776 54.0 55.121990 20.75640 9.0 100.0 91.0 0.2583399 -0.5744647 0.7104924
F.Undergrad 7 777 3699.907336 4850.4205309 1707.0 2574.884430 1441.08720 139.0 31643.0 31504.0 2.6003876 7.6120676 174.0078673
P.Undergrad 8 777 855.298584 1522.4318873 353.0 536.361156 449.22780 1.0 21836.0 21835.0 5.6703938 54.5249401 54.6169397
Outstate 9 777 10440.669241 4023.0164841 9990.0 10181.658106 4121.62800 2340.0 21700.0 19360.0 0.5073133 -0.4255258 144.3249124
Room.Board 10 777 4357.526383 1096.6964156 4200.0 4301.704655 1005.20280 1780.0 8124.0 6344.0 0.4755141 -0.2012779 39.3437648
Books 11 777 549.380952 165.1053601 500.0 535.219904 148.26000 96.0 2340.0 2244.0 3.4715806 28.0632782 5.9231218
Personal 12 777 1340.642214 677.0714536 1200.0 1268.345104 593.04000 250.0 6800.0 6550.0 1.7357745 7.0446395 24.2898031
PhD 13 777 72.660232 16.3281547 75.0 73.922954 17.79120 8.0 103.0 95.0 -0.7652067 0.5442923 0.5857693
Terminal 14 777 79.702703 14.7223585 82.0 81.102729 14.82600 24.0 100.0 76.0 -0.8133924 0.2244365 0.5281617
S.F.Ratio 15 777 14.089704 3.9583491 13.6 13.935795 3.40998 2.5 39.8 37.3 0.6648606 2.5228017 0.1420050
perc.alumni 16 777 22.743887 12.3918015 21.0 21.857143 13.34340 0.0 64.0 64.0 0.6045500 -0.1113466 0.4445534
Expend 17 777 9660.171171 5221.7684399 8377.0 8823.704655 2730.94920 3186.0 56233.0 53047.0 3.4459767 18.5875365 187.3298993
Grad.Rate 18 777 65.463320 17.1777099 65.0 65.601926 17.79120 10.0 118.0 108.0 -0.1133384 -0.2187930 0.6162469
Data Source: US Colleges from the 1995 issue of US News and World Report.
* Categorical Variable
#Splitting dataset into Train and Test######
set.seed(130)

traintestind <- createDataPartition(Project4_data$Private, p=0.70, list = FALSE)

traindata <- Project4_data[traintestind,] #assigning 70% of data to train
testdata <- Project4_data[-traintestind,] #assigning remaining 30% of data to test

head(traindata) %>%
  kable(caption = "<center>Table 2: Head of Train dataset of College Dataset</center>",
        align = "c")%>%
  kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
           general_title = "Data Source:")
Table 2: Head of Train dataset of College Dataset
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Adelphi University Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
Adrian College Yes 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
Agnes Scott College Yes 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
Alaska Pacific University Yes 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15
Albertson College Yes 587 479 158 38 62 678 41 13500 3335 500 675 67 73 9.4 11 9727 55
Albertus Magnus College Yes 353 340 103 17 45 416 230 13290 5720 500 1500 90 93 11.5 26 8861 63
Data Source: US Colleges from the 1995 issue of US News and World Report.
head(testdata)%>%
  kable(caption = "<center>Table 3: Head of Test dataset of College Dataset</center>",
        align = "c")%>%
  kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
           general_title = "Data Source:")
Table 3: Head of Test dataset of College Dataset
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Abilene Christian University Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
Amherst College Yes 4302 992 418 83 96 1593 5 19760 5300 660 1598 93 98 8.4 63 21424 100
Antioch University Yes 713 661 252 25 44 712 23 15476 3336 400 1100 69 82 11.3 35 42926 48
Appalachian State University No 7313 4664 1910 20 63 9940 1035 6806 2540 96 2000 83 96 18.3 14 5854 70
Aquinas College Yes 619 516 219 20 51 1251 767 11208 4124 350 1615 55 65 12.7 25 6584 65
Arkansas Tech University No 1734 1729 951 12 52 3602 939 3460 2650 450 1000 57 60 19.6 5 4739 48
Data Source: US Colleges from the 1995 issue of US News and World Report.
#Converting the data frames into matrices so they are compatible with glmnet()
#model.matrix() is used because Private is a categorical variable (it creates the PrivateYes dummy column)
traindata_x <- model.matrix(Grad.Rate ~., traindata)[,-1] 
testdata_x <- model.matrix(Grad.Rate ~., testdata)[,-1]

traindata_y = traindata$Grad.Rate
testdata_y = testdata$Grad.Rate
Observation:

In this task, I loaded the College dataset and used describe() to produce descriptive statistics. The dataset has 18 variables; Private is categorical while the others are continuous or discrete. I will use this dataset to run the regularization regressions, Lasso (L1) and Ridge (L2), but first I split it into two datasets: the Train set holds 70% of the College data and the Test set holds the remaining 30%. The head of each dataset is displayed with kable().
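As a quick sanity check on the partition, the row counts and class balance can be compared against the full dataset; a minimal sketch, assuming the traindata and testdata objects created above:

#Verify the 70/30 split produced by createDataPartition()
nrow(traindata) / nrow(Project4_data)  #should be close to 0.70
nrow(testdata) / nrow(Project4_data)   #should be close to 0.30

#Confirm the proportion of Private colleges is preserved in both partitions
prop.table(table(traindata$Private))
prop.table(table(testdata$Private))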

Ridge Regression:

L2 regularization is used in the Ridge regression technique, which applies a penalty proportional to the square of the size of the coefficients. The coefficients in Ridge regression will approach 0 but will never equal zero.

L2 regularization:
Adds a penalty equal to the sum of the squared coefficient magnitudes.
All coefficients are shrunk toward zero, but none are eliminated (Gero, 2023).
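Formally, in the same notation as the RSS defined in the Introduction, the Ridge coefficients minimize the penalized loss

\[\sum^{n}_{i=1}\Big(y_i - \beta_0 - \sum^{p}_{j=1}\beta_jx_{ij}\Big)^2 + \lambda\sum^{p}_{j=1}\beta_j^2 = RSS + \lambda\sum^{p}_{j=1}\beta_j^2\]

where \(\lambda \ge 0\) is the tuning parameter chosen by cross-validation below; larger values of \(\lambda\) shrink the coefficients more strongly toward zero.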
#Ridge Regression#####

#Best lambda value

set.seed(130)
crssval.ridge = cv.glmnet(traindata_x, traindata_y, nfolds = 10, alpha = 0) #Alpha 0 = Ridge, 1 = Lasso

plot(crssval.ridge)

#Finding optimal Lambda value

minlamb.ridge = log(crssval.ridge$lambda.min) #minimum Lambda value
selamb.ridge = log(crssval.ridge$lambda.1se) #1Standard Error Lambda value

loglamb = rbind(minlamb.ridge,selamb.ridge)
col_n <- c("Lambda log Value")   
row_n <- c("Lambda Min log value","Lambda 1 Standard Error log value")
  dimnames(loglamb) <- list(row_n,col_n)
  
kable(loglamb,
  caption = "<center>Lambda log values of Ridge Regression</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table a, Sub task 1",
           general_title = "Task 1: ")
Lambda log values of Ridge Regression
Lambda log Value
Lambda Min log value 1.212158
Lambda 1 Standard Error log value 3.258900
Task 1: Table a, Sub task 1
##Ridge Regression Model on Lambda Min######

modl.miniridge = glmnet(traindata_x, traindata_y, alpha = 0, lambda = crssval.ridge$lambda.min) #Alpha 0 = Ridge, 1 = Lasso
#print(modl.miniridge)
Dflaridmi = as.numeric(modl.miniridge$df[1])
Dfratlaridmi = as.numeric(modl.miniridge$dev.ratio[1] * 100)
Dflaridmivl = as.numeric(modl.miniridge$lambda[1])

minirdg = cbind(Dflaridmi,
                round(Dfratlaridmi,3),
                round(Dflaridmivl,3))

col_ridmin <- c("Degree of Freedom", "% Dev", "Lambda")   
row_ridmin <- c("Min Ridge Regression")
  dimnames(minirdg) <- list(row_ridmin,col_ridmin)
  
kable(minirdg,
  caption = "<center>Ridge Min Regression on Train data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table b, Sub task 1",
           general_title = "Task 1: ")
Ridge Min Regression on Train data
Degree of Freedom % Dev Lambda
Min Ridge Regression 17 46.201 3.361
Task 1: Table b, Sub task 1
###Regression Coefficient Lambda Min######

modl.miniridge_coef = as.data.frame(as.matrix(coef(modl.miniridge)))

kable(modl.miniridge_coef,
      caption = "<center>Min Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Ridge Regression Coefficient Min",
           general_title = "Task 1: ")
Min Model Coefficient
Coefficient
(Intercept) 35.7641835
PrivateYes 3.8414462
Apps 0.0003850
Accept 0.0003265
Enroll 0.0003817
Top10perc 0.0917166
Top25perc 0.0852349
F.Undergrad 0.0000420
P.Undergrad -0.0013521
Outstate 0.0006718
Room.Board 0.0020998
Books -0.0012379
Personal -0.0028284
PhD 0.0396931
Terminal -0.0335658
S.F.Ratio 0.0703800
perc.alumni 0.2912263
Expend -0.0001139
Task 1: Ridge Regression Coefficient Min
##Ridge Regression Model by Lambda 1SE######

modl.seridge = glmnet(traindata_x, traindata_y, alpha = 0, lambda = crssval.ridge$lambda.1se) #alpha = 0 fits Ridge (L2)
#print(modl.seridge)
Dflaridse = as.numeric(modl.seridge$df[1])
Dfratlaridse = as.numeric(modl.seridge$dev.ratio[1] * 100)
Dflaridsevl = as.numeric(modl.seridge$lambda[1])

se1rdg = cbind(Dflaridse,
                round(Dfratlaridse,3),
                round(Dflaridsevl,3))

col_ridse <- c("Degree of Freedom", "% Dev", "Lambda")   
row_ridse <- c("Lambda 1SE Ridge Regression")
  dimnames(se1rdg) <- list(row_ridse,col_ridse)
  
kable(se1rdg,
  caption = "<center>Ridge 1SE Regression on Train data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table c, Sub task 1",
           general_title = "Task 1: ")
Ridge 1SE Regression on Train data
Degree of Freedom % Dev Lambda
Lambda 1SE Ridge Regression 17 40.708 26.021
Task 1: Table c, Sub task 1
##Regression Coefficient Lambda 1SE######
modl.seridge_coef = as.data.frame(as.matrix(coef(modl.seridge)))

kable(modl.seridge_coef,
      caption = "<center>1SE Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Ridge Regression Coefficient 1SE",
           general_title = "Task 1: ")
1SE Model Coefficient
Coefficient
(Intercept) 42.7249657
PrivateYes 2.5934946
Apps 0.0001683
Accept 0.0001864
Enroll 0.0001015
Top10perc 0.0725714
Top25perc 0.0651564
F.Undergrad -0.0000197
P.Undergrad -0.0007152
Outstate 0.0004360
Room.Board 0.0012771
Books -0.0007472
Personal -0.0018947
PhD 0.0329862
Terminal 0.0219444
S.F.Ratio -0.0900863
perc.alumni 0.1596575
Expend 0.0001158
Task 1: Ridge Regression Coefficient 1SE
##Regression Coefficient with no Regularization######
noreg.rdg = lm(Grad.Rate ~ ., data = traindata)
#coef(noreg.rdg)
noreg.rdg_coef = as.data.frame(as.matrix(coef(noreg.rdg)))

kable(noreg.rdg_coef,
      caption = "<center>No Regularization Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Regression Coefficient, No Regularization",
           general_title = "Task 1: ")
No Regularization Model Coefficient
Coefficient
(Intercept) 34.6946731
PrivateYes 4.0971439
Apps 0.0010680
Accept -0.0007622
Enroll 0.0006467
Top10perc 0.0844861
Top25perc 0.0857064
F.Undergrad 0.0002285
P.Undergrad -0.0017100
Outstate 0.0009027
Room.Board 0.0024830
Books -0.0007731
Personal -0.0030138
PhD 0.0862238
Terminal -0.1062937
S.F.Ratio 0.0981707
perc.alumni 0.3499178
Expend -0.0004000
Task 1: Regression Coefficient, No Regularization
##RMSE of the model with no regularization (baseline OLS)#######
pred.noreg.rdg = predict(noreg.rdg, newdata = testdata) #OLS model fitted on Train, predicted on the Test data
Full.rmse.rdg = rmse(testdata$Grad.Rate, pred.noreg.rdg)

#print(Full.rmse.rdg)

###Prediction Test dataset#######
testridge_pred = predict(modl.seridge, newx = testdata_x)

test.ridgermse = rmse(testdata_y, testridge_pred)


###Prediction Train dataset######

trainridge_pred = predict(modl.seridge, newx= traindata_x)

train.ridgermse = rmse(traindata_y, trainridge_pred) #root mean square error

##Comparing RMSE values#######
#print(Full.rmse.rdg) 
#print(train.ridgermse)
#print(test.ridgermse)

CompRMSE = cbind(round(Full.rmse.rdg,3),
                round(train.ridgermse,3),
                round(test.ridgermse,3))

col_comprmse <- c("RMSE No Regularization (Test)", "RMSE Ridge 1SE (Train)", "RMSE Ridge 1SE (Test)")
row_comprmse <- c("RMSE values")
  dimnames(CompRMSE) <- list(row_comprmse,col_comprmse)

kable(CompRMSE,
  caption = "<center>Comparison of RMSE values of Ridge Min, 1SE, and No Regression on Train, Test, and Full data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table d, Sub task 5 & 6",
           general_title = "Task 1: ")
Comparison of RMSE values: No Regularization vs. Ridge 1SE on Train and Test data
RMSE No Regularization (Test) RMSE Ridge 1SE (Train) RMSE Ridge 1SE (Test)
RMSE values 12.413 13.539 12.553
Task 1: Table d, Sub task 5 & 6
Observation:

In this project, I use graduation rate (Grad.Rate) as the dependent variable and the remaining variables as the predictors of graduation rate.
Ridge Regression is tested on the Train and Test datasets to evaluate the model's performance and to check for overfitting. Before proceeding with the model, I examine the log values of lambda: the lambda min and lambda 1SE values are 1.21 and 3.26 respectively.
The lambda min value is the value of the regularization parameter that produces the smallest cross-validated mean squared error (Hastie, 2001); in other words, it is the lambda that gives the best prediction performance as measured by the MSE.
The lambda 1SE value is the largest regularization parameter whose cross-validated error is within one standard error of the error at lambda min (Hastie, 2001); it gives prediction performance almost as good as the best while overfitting the training data less.
Both the lambda min and 1SE values are useful for choosing the level of regularization in Ridge Regression, as they help balance the trade-off between overfitting and underfitting.
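Both penalties can also be inspected directly from the cross-validation object; a minimal sketch, assuming the crssval.ridge object created above:

#Ridge coefficients at the two candidate penalties, extracted from the cv.glmnet object
coef(crssval.ridge, s = "lambda.min") #coefficients at the MSE-minimizing lambda
coef(crssval.ridge, s = "lambda.1se") #coefficients at the more conservative 1SE lambda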

Using cross-validation, I extract the log values of lambda, and the cv.glmnet plot shows how many variables remain in the model at the lambda min and lambda 1SE values. In the plot, these cross-validation values are marked with dotted vertical lines at 1.21 and 3.26 respectively. The plot counts 17 variables from the College dataset, meaning all the other variables affect graduation rate. The same values are reported in Task 1: Table a.

Task 1: Table b reports the Ridge model at lambda min: the degrees of freedom, deviance ratio, and lambda value obtained from the Train dataset with glmnet(). The coefficient table that follows shows the glmnet() coefficients on the Train dataset. The same procedure is repeated for lambda 1SE and for the model with no regularization.
The Root Mean Squared Error (RMSE) measures the difference between predicted and actual values and is a commonly used evaluation metric for regression models. It is calculated as the square root of the mean of the squared differences between predicted and actual values; a lower RMSE indicates that the model fits the data better and therefore makes more accurate predictions (Gero, 2023).
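For reference, rmse() from the Metrics package is simply the square root of the mean squared residual; a minimal sketch reproducing the Test-set value by hand, assuming the testdata_y and testridge_pred objects from the code above:

#RMSE computed manually, equivalent to rmse(testdata_y, testridge_pred)
sqrt(mean((testdata_y - testridge_pred)^2))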
The last table, Task 1: Table d, compares three RMSE values: the model with no regularization evaluated on the Test data, and the Ridge 1SE model evaluated on the Train and Test data. Because the differences among these values are negligible, the model is not overfitting.

In all three coefficient tables, PrivateYes and perc.alumni influence graduation rate the most, followed by Top10perc, Top25perc, the student-to-faculty ratio (except in the lambda 1SE model), and PhD.
To summarize, graduation rate is influenced by the type of university (PrivateYes), the percentage of alumni who donate, the percentage of new students from the top 10% of their high-school class, the percentage from the top 25%, and the percentage of faculty with PhDs.

Lasso:

The Least Absolute Shrinkage and Selection Operator, known as Lasso, is a popular regularization method.

L1 regularization:
Adds a penalty proportional to the sum of the absolute values of the coefficients, restricting their size.
L1 can shrink coefficients exactly to zero and thereby remove variables from the model.
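Formally, in the same notation as the Ridge objective, the Lasso coefficients minimize

\[RSS + \lambda\sum^{p}_{j=1}|\beta_j|\]

The absolute-value penalty is what allows individual coefficients to reach exactly zero, so Lasso performs variable selection as well as shrinkage.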
#LASSO######

#Best lambda value

set.seed(130)
crssval.lasso = cv.glmnet(traindata_x, traindata_y, nfolds = 10, alpha = 1)
plot(crssval.lasso)

#Finding optimal Lambda value

minlamb.lasso = log(crssval.lasso$lambda.min) #minimum Lambda value
se1lamb.lasso = log(crssval.lasso$lambda.1se) #1Standard Error Lambda value


loglamblasso = rbind(minlamb.lasso,se1lamb.lasso)
col_lasso <- c("Lambda log Value: Lasso")   
row_lasso <- c("Lambda Min log value","Lambda 1 Standard Error log value")
  dimnames(loglamblasso) <- list(row_lasso,col_lasso)
  
kable(round(loglamblasso,3),
  caption = "<center>Lambda log values of Lasso Regression</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table a, Sub task 1",
           general_title = "Task 2: ")
Lambda log values of Lasso Regression
Lambda log Value: Lasso
Lambda Min log value -0.672
Lambda 1 Standard Error log value 0.631
Task 2: Table a, Sub task 1
#Lasso Regression Model on Lambda Min

modl.minilasso = glmnet(traindata_x, traindata_y, alpha = 1, lambda = crssval.lasso$lambda.min)
#print(modl.minilasso)

Dflamlassomi = as.numeric(modl.minilasso$df[1])
Dfratlassomi = as.numeric(modl.minilasso$dev.ratio[1] * 100)
Dflamlassomivl = as.numeric(modl.minilasso$lambda[1])

lassominrdg = cbind(Dflamlassomi,
                round(Dfratlassomi,2),
                round(Dflamlassomivl,3))

col_lassomin <- c("Degree of Freedom", "% Dev", "Lambda")   
row_lassomin <- c("Lambda Min Lasso Regression")
  dimnames(lassominrdg) <- list(row_lassomin,col_lassomin)

kable(lassominrdg,
  caption = "<center>Lasso Min Regression on Train data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table b, Sub task 1",
           general_title = "Task 2: ")
Lasso Min Regression on Train data
Degree of Freedom % Dev Lambda
Lambda Min Lasso Regression 9 45.51 0.511
Task 2: Table b, Sub task 1
#Regression Coefficient Lambda Min

lammin.lasso_coef = as.data.frame(as.matrix(coef(modl.minilasso)))

kable(lammin.lasso_coef,
      caption = "<center>Lasso Lamba Min Regularization Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Lasso Regression Coefficient, Lamba Min",
           general_title = "Task 2: ")
Lasso Lamba Min Regularization Model Coefficient
Coefficient
(Intercept) 37.1259921
PrivateYes 2.0139038
Apps 0.0005527
Accept 0.0000000
Enroll 0.0000000
Top10perc 0.0505645
Top25perc 0.0937967
F.Undergrad 0.0000000
P.Undergrad -0.0011761
Outstate 0.0007233
Room.Board 0.0018791
Books 0.0000000
Personal -0.0026094
PhD 0.0000000
Terminal 0.0000000
S.F.Ratio 0.0000000
perc.alumni 0.3198977
Expend 0.0000000
Task 2: Lasso Regression Coefficient, Lamba Min
#lasso Regression Model by Lambda 1SE

modl.selasso = glmnet(traindata_x, traindata_y, alpha = 1, lambda = crssval.lasso$lambda.1se) #alpha = 1 fits Lasso (L1)
#print(modl.selasso)
Dflamlassose1 = as.numeric(modl.selasso$df[1])
Dfratlassose1 = as.numeric(modl.selasso$dev.ratio[1] * 100)
Dflamlassose1vl = as.numeric(modl.selasso$lambda[1])

lassose1rdg = cbind(Dflamlassose1,
                round(Dfratlassose1,2),
                round(Dflamlassose1vl,3))

col_lassose1 <- c("Degree of Freedom", "% Dev", "Lambda")   
row_lassose1 <- c("Lambda 1SE Lasso Regression")
  dimnames(lassose1rdg) <- list(row_lassose1,col_lassose1)

kable(lassose1rdg,
  caption = "<center>Lasso 1SE Regression on Train data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table c, Sub task 2",
           general_title = "Task 2: ")
Lasso 1SE Regression on Train data
Degree of Freedom % Dev Lambda
Lambda 1SE Lasso Regression 7 40.98 1.879
Task 2: Table c, Sub task 2
#Regression Coefficient Lambda 1SE


lam1se.lasso_coef = as.data.frame(as.matrix(coef(modl.selasso)))

kable(lam1se.lasso_coef,
      caption = "<center>Lasso Lamba 1SE Regularization Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Lasso Regression Coefficient, Lamba 1SE",
           general_title = "Task 2: ")
Lasso Lamba 1SE Regularization Model Coefficient
Coefficient
(Intercept) 40.6998598
PrivateYes 0.0000000
Apps 0.0000000
Accept 0.0000000
Enroll 0.0000000
Top10perc 0.0760849
Top25perc 0.0631056
F.Undergrad 0.0000000
P.Undergrad -0.0001614
Outstate 0.0009352
Room.Board 0.0011986
Books 0.0000000
Personal -0.0013104
PhD 0.0000000
Terminal 0.0000000
S.F.Ratio 0.0000000
perc.alumni 0.2610037
Expend 0.0000000
Task 2: Lasso Regression Coefficient, Lamba 1SE
#Regression Coefficient with no Regularization
noreg.lasso = lm(Grad.Rate ~ ., data = traindata)

noreg.lasso_coef = as.data.frame(as.matrix(coef(noreg.lasso)))

kable(noreg.lasso_coef,
      caption = "<center>No Regularization Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Regression Coefficient, No Regularization",
           general_title = "Task 2: ")
No Regularization Model Coefficient
Coefficient
(Intercept) 34.6946731
PrivateYes 4.0971439
Apps 0.0010680
Accept -0.0007622
Enroll 0.0006467
Top10perc 0.0844861
Top25perc 0.0857064
F.Undergrad 0.0002285
P.Undergrad -0.0017100
Outstate 0.0009027
Room.Board 0.0024830
Books -0.0007731
Personal -0.0030138
PhD 0.0862238
Terminal -0.1062937
S.F.Ratio 0.0981707
perc.alumni 0.3499178
Expend -0.0004000
Task 2: Regression Coefficient, No Regularization
#RMSE of the model with no regularization (baseline OLS)
pred.noreg = predict(noreg.lasso, newdata = testdata) #OLS model fitted on Train, predicted on the Test data
Full.rmse = rmse(testdata$Grad.Rate, pred.noreg)
#print(Full.rmse)

#Prediction Test dataset
testlasso_pred = predict(modl.selasso, newx = testdata_x)

test.lassormse = rmse(testdata_y, testlasso_pred)


#Prediction Train dataset

trainlasso_pred = predict(modl.selasso, newx= traindata_x)

train.lassormse = rmse(traindata_y, trainlasso_pred) #root mean square error

#Comparing RMSE values
#print(Full.rmse) 
#print(train.lassormse)
#print(test.lassormse)

CompRMSE_lasso = cbind(round(Full.rmse,3),
                round(train.lassormse,3),
                round(test.lassormse,3))

col_comprmse_lasso <- c("RMSE No Regularization (Test)", "RMSE Lasso 1SE (Train)", "RMSE Lasso 1SE (Test)")
row_comprmse_lasso <- c("RMSE values")
  dimnames(CompRMSE_lasso) <- list(row_comprmse_lasso,col_comprmse_lasso)

kable(CompRMSE_lasso,
  caption = "<center>Comparison of RMSE values of Ridge Min, 1SE, and No Regression on Train, Test, and Full data</center>",
       align = "c") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Table d, Sub task 5 & 6",
           general_title = "Task 2: ")
Comparison of RMSE values: No Regularization vs. Lasso 1SE on Train and Test data
RMSE No Regularization (Test) RMSE Lasso 1SE (Train) RMSE Lasso 1SE (Test)
RMSE values 12.413 13.508 12.604
Task 2: Table d, Sub task 5 & 6
Observation:

Continuing the regularization, Lasso regression is applied to the Train and Test datasets to understand how graduation rate is influenced by the other variables. Before proceeding, we need the lambda min and lambda 1SE values from cross-validation.
In Lasso Regression, the regularization term penalizes large coefficients of the features, and the size of the penalty is controlled by the lambda parameter. The goal is to find a balance between fitting the data well and having a simple model with small coefficients (Gero, 2023).
Lambda min is the value of the lambda parameter that gives the minimum cross-validated mean squared error (MSE); it is the value that would be chosen if the goal were simply to minimize the MSE (Hastie, 2001).
The 1SE rule uses the standard error of the cross-validated MSE estimate: the lambda 1SE value gives a model whose MSE is within one standard error of the MSE at lambda min (Hastie, 2001).
The significance of the lambda min and 1SE values in Lasso Regression is that they help determine the best trade-off between model fit and complexity. Choosing the lambda with the lowest MSE may lead to overfitting, while choosing a much larger lambda may lead to underfitting; the 1SE value accounts for the uncertainty in the MSE estimate and yields a more robust model that is less sensitive to the particular data split (Hastie, 2001; Gero, 2023).
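The shrinkage of individual coefficients toward zero as lambda grows can be visualized with the full Lasso path; a minimal sketch, assuming the traindata_x, traindata_y, and crssval.lasso objects created earlier:

#Full Lasso solution path: each curve traces one coefficient as a function of log(lambda)
lassopath = glmnet(traindata_x, traindata_y, alpha = 1)
plot(lassopath, xvar = "lambda", label = TRUE)
abline(v = log(crssval.lasso$lambda.min), lty = 2) #lambda min
abline(v = log(crssval.lasso$lambda.1se), lty = 2) #lambda 1SE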

Using cross-validation, I extract the log values of lambda, and the cv.glmnet plot shows how many variables remain in the model at the lambda min and lambda 1SE values, marked with dotted vertical lines at -0.67 and 0.63 respectively. The plot counts between 7 and 11 variables from the College dataset around these values, meaning these variables affect graduation rate. The same values are reported in Task 2: Table a. Task 2: Table b reports the Lasso model at lambda min: the degrees of freedom, deviance ratio, and lambda value obtained from the Train dataset with glmnet(), and the coefficient table that follows shows the glmnet() coefficients on the Train dataset. The same procedure is repeated for lambda 1SE and for the model with no regularization. As in Task 1, a lower RMSE indicates a better fit and more accurate predictions (Gero, 2023). The last table, Task 2: Table d, compares three RMSE values: the model with no regularization on the Test data, and the Lasso 1SE model on the Train and Test data. Because the differences among these values are negligible, the model is not overfitting.
Furthermore, in all three coefficient tables, perc.alumni influences graduation rate strongly, followed by Top25perc. The noticeable difference between Ridge and Lasso is that Lasso eliminates some variables and keeps only the most influential ones, and the variables eliminated differ between lambda min and lambda 1SE. In the Task 2 lambda min coefficient table, Accept, Enroll, F.Undergrad, Books, PhD, Terminal, S.F.Ratio, and Expend are eliminated. In the lambda 1SE coefficient table, Private(Yes) and Apps are eliminated in addition to those dropped at lambda min.
Surprisingly, the Private(Yes) variable has the largest impact on graduation rate in the Ridge models, yet Lasso eliminates it at the 1SE lambda. To summarize, graduation rate is influenced by the percentage of alumni who donate in all three models, and Private(Yes) is an influential variable in every model except the Lasso 1SE model. At lambda 1SE, 7 variables are retained as influencing graduation rate; at lambda min, 9 variables are retained.
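The surviving predictors at each penalty can also be listed programmatically instead of being read off the coefficient tables; a minimal sketch, assuming the modl.minilasso and modl.selasso objects fitted above:

#Predictors with non-zero Lasso coefficients at each penalty
min_coef = coef(modl.minilasso)
se_coef = coef(modl.selasso)
rownames(min_coef)[as.vector(min_coef != 0)] #intercept plus 9 predictors at lambda min
rownames(se_coef)[as.vector(se_coef != 0)]   #intercept plus 7 predictors at lambda 1SE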

Task 2: Table d shows the RMSE values of the three models; the differences among them are negligible, so we can conclude that the Lasso models are not overfitting.

Task 12 & 13:
#Optimal model using step()

optmod = step(lm(Grad.Rate ~ ., data = Project4_data), direction = "both") #finding optimal model
## Start:  AIC=3972.98
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + Books + 
##     Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## - S.F.Ratio    1      36.8 123332 3971.2
## - Books        1      94.1 123389 3971.6
## - Top10perc    1      95.1 123390 3971.6
## - Accept       1     105.8 123401 3971.6
## - Enroll       1     142.2 123437 3971.9
## - F.Undergrad  1     224.5 123519 3972.4
## - Terminal     1     233.4 123528 3972.4
## <none>                     123295 3973.0
## - PhD          1     383.4 123678 3973.4
## - Private      1     645.3 123940 3975.0
## - Personal     1     758.7 124054 3975.7
## - Top25perc    1     981.7 124277 3977.1
## - Apps         1    1403.4 124698 3979.8
## - Expend       1    1424.2 124719 3979.9
## - Room.Board   1    1705.5 125000 3981.7
## - P.Undergrad  1    2348.7 125644 3985.6
## - Outstate     1    3086.3 126381 3990.2
## - perc.alumni  1    5241.6 128537 4003.3
## 
## Step:  AIC=3971.21
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + Books + 
##     Personal + PhD + Terminal + perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## - Books        1      91.7 123423 3969.8
## - Top10perc    1      93.4 123425 3969.8
## - Accept       1     109.1 123441 3969.9
## - Enroll       1     141.2 123473 3970.1
## - F.Undergrad  1     216.4 123548 3970.6
## - Terminal     1     237.6 123569 3970.7
## <none>                     123332 3971.2
## - PhD          1     400.2 123732 3971.7
## + S.F.Ratio    1      36.8 123295 3973.0
## - Private      1     613.0 123945 3973.1
## - Personal     1     788.4 124120 3974.2
## - Top25perc    1     978.6 124310 3975.3
## - Apps         1    1426.8 124759 3978.1
## - Room.Board   1    1705.0 125037 3979.9
## - Expend       1    1854.2 125186 3980.8
## - P.Undergrad  1    2356.0 125688 3983.9
## - Outstate     1    3055.0 126387 3988.2
## - perc.alumni  1    5207.4 128539 4001.3
## 
## Step:  AIC=3969.79
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + Personal + 
##     PhD + Terminal + perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## - Top10perc    1      86.7 123510 3968.3
## - Accept       1     110.1 123533 3968.5
## - Enroll       1     140.9 123564 3968.7
## - F.Undergrad  1     218.1 123641 3969.2
## - Terminal     1     279.1 123703 3969.5
## <none>                     123423 3969.8
## - PhD          1     469.5 123893 3970.7
## + Books        1      91.7 123332 3971.2
## - Private      1     599.2 124023 3971.5
## + S.F.Ratio    1      34.3 123389 3971.6
## - Personal     1     908.6 124332 3973.5
## - Top25perc    1     965.8 124389 3973.8
## - Apps         1    1425.9 124849 3976.7
## - Room.Board   1    1638.9 125062 3978.0
## - Expend       1    1885.5 125309 3979.6
## - P.Undergrad  1    2365.4 125789 3982.5
## - Outstate     1    3107.2 126531 3987.1
## - perc.alumni  1    5293.4 128717 4000.4
## 
## Step:  AIC=3968.33
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + 
##     perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## - Enroll       1     181.1 123691 3967.5
## - Accept       1     208.0 123718 3967.6
## - F.Undergrad  1     227.5 123738 3967.8
## - Terminal     1     315.0 123825 3968.3
## <none>                     123510 3968.3
## - PhD          1     533.2 124043 3969.7
## + Top10perc    1      86.7 123423 3969.8
## + Books        1      85.0 123425 3969.8
## + S.F.Ratio    1      32.9 123477 3970.1
## - Private      1     628.0 124138 3970.3
## - Personal     1     902.6 124413 3972.0
## - Room.Board   1    1602.8 125113 3976.3
## - Expend       1    1816.3 125326 3977.7
## - Apps         1    1835.9 125346 3977.8
## - P.Undergrad  1    2491.0 126001 3981.8
## - Outstate     1    3241.9 126752 3986.5
## - Top25perc    1    4063.5 127574 3991.5
## - perc.alumni  1    5334.0 128844 3999.2
## 
## Step:  AIC=3967.47
## Grad.Rate ~ Private + Apps + Accept + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + 
##     perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## - F.Undergrad  1      53.0 123744 3965.8
## - Accept       1      95.1 123786 3966.1
## <none>                     123691 3967.5
## - Terminal     1     338.0 124029 3967.6
## + Enroll       1     181.1 123510 3968.3
## + Top10perc    1     126.9 123564 3968.7
## - PhD          1     545.3 124236 3968.9
## + Books        1      83.2 123608 3968.9
## + S.F.Ratio    1      31.5 123660 3969.3
## - Private      1     631.2 124322 3969.4
## - Personal     1     895.3 124586 3971.1
## - Room.Board   1    1523.1 125214 3975.0
## - Expend       1    1715.1 125406 3976.2
## - Apps         1    1720.1 125411 3976.2
## - P.Undergrad  1    2613.8 126305 3981.7
## - Outstate     1    3190.0 126881 3985.3
## - Top25perc    1    4126.3 127817 3991.0
## - perc.alumni  1    5621.6 129313 4000.0
## 
## Step:  AIC=3965.8
## Grad.Rate ~ Private + Apps + Accept + Top25perc + P.Undergrad + 
##     Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + 
##     Expend
## 
##               Df Sum of Sq    RSS    AIC
## - Accept       1     251.7 123996 3965.4
## <none>                     123744 3965.8
## - Terminal     1     350.2 124094 3966.0
## + Top10perc    1     102.4 123642 3967.2
## - PhD          1     549.6 124294 3967.2
## + Books        1      85.4 123659 3967.3
## + F.Undergrad  1      53.0 123691 3967.5
## + S.F.Ratio    1      26.2 123718 3967.6
## + Enroll       1       6.5 123738 3967.8
## - Private      1     749.8 124494 3968.5
## - Personal     1     973.0 124717 3969.9
## - Room.Board   1    1572.3 125317 3973.6
## - Expend       1    1750.9 125495 3974.7
## - Apps         1    1775.8 125520 3974.9
## - P.Undergrad  1    3195.5 126940 3983.6
## - Outstate     1    3415.9 127160 3985.0
## - Top25perc    1    4094.9 127839 3989.1
## - perc.alumni  1    5579.5 129324 3998.1
## 
## Step:  AIC=3965.38
## Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + 
##     Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
## 
##               Df Sum of Sq    RSS    AIC
## <none>                     123996 3965.4
## + Accept       1     251.7 123744 3965.8
## - Terminal     1     391.0 124387 3965.8
## + F.Undergrad  1     209.5 123786 3966.1
## + Top10perc    1     196.1 123800 3966.2
## - PhD          1     524.0 124520 3966.7
## + Books        1      86.5 123909 3966.8
## + Enroll       1      63.4 123932 3967.0
## + S.F.Ratio    1      25.0 123971 3967.2
## - Private      1     785.5 124781 3968.3
## - Personal     1     992.0 124988 3969.6
## - Expend       1    1512.5 125508 3972.8
## - Room.Board   1    1705.6 125701 3974.0
## - Outstate     1    3221.2 127217 3983.3
## - P.Undergrad  1    3449.0 127445 3984.7
## - Top25perc    1    4503.8 128500 3991.1
## - Apps         1    5016.2 129012 3994.2
## - perc.alumni  1    5748.0 129744 3998.6
#summary(optmod) 

optimodel = lm(formula = Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + 
     Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend, data = Project4_data) #optimal model selected by step()

optimodel_coef = data.frame((coef(optimodel))) #coefficient of model 

kable(optimodel_coef,
      caption = "<center> Table 13.1 Stepwise Selection Fit Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni +      Expend",
           general_title = "Task 13: ")
Table 13.1 Stepwise Selection Fit Model Coefficient
Coefficient
(Intercept) 33.4888648
PrivateYes 3.5847682
Apps 0.0008950
Top25perc 0.1697318
P.Undergrad -0.0016749
Outstate 0.0010061
Room.Board 0.0018799
Personal -0.0018516
PhD 0.0997365
Terminal -0.0950484
perc.alumni 0.2887259
Expend -0.0003942
Task 13: Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
pred.optimodel = predict(optimodel, newdata = Project4_data) #predictions from the stepwise model on the full dataset
pred.optimodel.rmse = rmse(Project4_data$Grad.Rate, pred.optimodel) #RMSE of actual vs. predicted values

kable(pred.optimodel.rmse,
  caption = "<center>Table 13.2 RMSE value</center>",
      align = "c",
      col.names = "RMSE value") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni +      Expend",
           general_title = "Task 13: ")
Table 13.2 RMSE value
RMSE value
12.63261
Task 13: Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
Observation:

In this task, I use the stepwise selection function in both directions to find the best model for graduation rate and compare its RMSE with the Ridge and Lasso results.
Running step(lm()) compares seven candidate models and selects the one with the lowest AIC, which indicates the best fit among those compared. In Table 13.1, the coefficients of the stepwise-selected model show that Private(Yes), the percentage of new students from the top 25% of their high-school class, the percentage of faculty with PhDs, and the percentage of alumni who donate influence graduation rate.
Table 13.2 shows the RMSE of this model; compared with the Lasso and Ridge results, the difference in RMSE is negligible, so I can conclude that the model is not overfitting.
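To close the comparison, the RMSE values computed in Tasks 1 and 2 and in this task can be placed side by side; a minimal sketch, assuming the test.ridgermse, test.lassormse, and pred.optimodel.rmse objects created earlier are still in the workspace:

#Side-by-side comparison of the models fitted in this report
data.frame(Model = c("Ridge 1SE (Test)", "Lasso 1SE (Test)", "Stepwise lm (full data)"),
           RMSE = round(c(test.ridgermse, test.lassormse, pred.optimodel.rmse), 3))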

Reference:


Bluman, A. (2014). Elementary statistics: A step by step approach. McGraw-Hill Education.
Gero, E. (2023). ALY6015_Module3_Logistic_Regression [Lecture recording]. University. https://canvas.northeastern.edu/
Hastie, T., Tibshirani, R., & Friedman, J. H. (2001). The elements of statistical learning: Data mining, inference, and prediction. Springer.
Kabacoff, R. (2015). R in Action. Manning Publications Co.