rm(list=ls())
#Library######
library(readr)
library(tidyverse)
library(dplyr)
library(DT)
library(RColorBrewer)
library(rio)
library(dbplyr)
library(psych)
library(FSA)
library(knitr)
library(plotrix)
library(kableExtra)
library(ISLR)
library(data.table)
library(magrittr)
library(ggplot2)
library(summarytools)
library(hrbrthemes)
library(cowplot)
library(reshape2)
library(scales)
library(zoo)
library(corrplot)
library(lares)
library(leaps)
library(MASS)
library(car)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(tidyr)
library(readxl)
library(forcats)
library(caret)
library(pROC)
library(glmnet)
library(Metrics)
#Dataset
Project4_data = data.frame(College) #College dataset from the ISLR package
Regularization
Introduction
Regularization techniques are used to decrease model error by fitting
functions adequately on the provided training set while avoiding
overfitting of the parameters. The multiple linear regression model is:
\[Y = \beta_0 + \beta_1X_1 + \beta_2X_2 + ... + \beta_nX_n + \epsilon\]
where \(Y\) is the response variable and \(\beta\) represents the
coefficient of each predictor. The model can often learn the noise, and
the coefficients become unreliable, which means they cannot generalize
effectively to new data. Overfitting occurs when a dataset has a small
number of observations, when multicollinearity exists, or when too many
predictors are included in the model. Whenever feasible, overfitting
should be avoided. Regularization penalizes high-valued regression
coefficients to prevent overfitting (Gero, 2023).
A loss function described as the residual sum of squares, or RSS, is
used in the fitting process, and the coefficients are chosen to
minimize this loss function.
\[RSS = \sum^{n}_{i=1}\left(y_i - \beta_0 - \sum^{p}_{j=1}\beta_j x_{ij}\right)^2\]
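For reference, the Ridge and Lasso methods used below add a penalty term to this loss; in their standard form (with \(\lambda\) as the tuning parameter chosen by cross-validation) the two penalized objectives are:
\[\text{Ridge (L2):}\quad RSS + \lambda\sum^{p}_{j=1}\beta_j^2\]
\[\text{Lasso (L1):}\quad RSS + \lambda\sum^{p}_{j=1}|\beta_j|\]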
Task 1: Splitting dataset into Train and Test
#table representation of the statistical description
describe(Project4_data)%>%
kable(caption = "<center>Table 1: Statistical Description of College Dataset</center>",
align = "c")%>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
general_title = "Data Source:",
symbol = c("Categorical Variable"))
Table 1: Statistical Description of College Dataset

| Variable | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Private* | 1 | 777 | 1.727156 | 0.4457084 | 2.0 | 1.783307 | 0.00000 | 1.0 | 2.0 | 1.0 | -1.0179902 | -0.9649328 | 0.0159897 |
| Apps | 2 | 777 | 3001.638353 | 3870.2014844 | 1558.0 | 2193.008026 | 1463.32620 | 81.0 | 48094.0 | 48013.0 | 3.7093849 | 26.5184313 | 138.8427049 |
| Accept | 3 | 777 | 2018.804376 | 2451.1139710 | 1110.0 | 1510.287319 | 1008.16800 | 72.0 | 26330.0 | 26258.0 | 3.4045428 | 18.7526403 | 87.9332239 |
| Enroll | 4 | 777 | 779.972973 | 929.1761901 | 434.0 | 575.953451 | 354.34140 | 35.0 | 6392.0 | 6357.0 | 2.6800857 | 8.7368340 | 33.3340101 |
| Top10perc | 5 | 777 | 27.558559 | 17.6403644 | 23.0 | 25.130016 | 13.34340 | 1.0 | 96.0 | 95.0 | 1.4077650 | 2.1728286 | 0.6328445 |
| Top25perc | 6 | 777 | 55.796654 | 19.8047776 | 54.0 | 55.121990 | 20.75640 | 9.0 | 100.0 | 91.0 | 0.2583399 | -0.5744647 | 0.7104924 |
| F.Undergrad | 7 | 777 | 3699.907336 | 4850.4205309 | 1707.0 | 2574.884430 | 1441.08720 | 139.0 | 31643.0 | 31504.0 | 2.6003876 | 7.6120676 | 174.0078673 |
| P.Undergrad | 8 | 777 | 855.298584 | 1522.4318873 | 353.0 | 536.361156 | 449.22780 | 1.0 | 21836.0 | 21835.0 | 5.6703938 | 54.5249401 | 54.6169397 |
| Outstate | 9 | 777 | 10440.669241 | 4023.0164841 | 9990.0 | 10181.658106 | 4121.62800 | 2340.0 | 21700.0 | 19360.0 | 0.5073133 | -0.4255258 | 144.3249124 |
| Room.Board | 10 | 777 | 4357.526383 | 1096.6964156 | 4200.0 | 4301.704655 | 1005.20280 | 1780.0 | 8124.0 | 6344.0 | 0.4755141 | -0.2012779 | 39.3437648 |
| Books | 11 | 777 | 549.380952 | 165.1053601 | 500.0 | 535.219904 | 148.26000 | 96.0 | 2340.0 | 2244.0 | 3.4715806 | 28.0632782 | 5.9231218 |
| Personal | 12 | 777 | 1340.642214 | 677.0714536 | 1200.0 | 1268.345104 | 593.04000 | 250.0 | 6800.0 | 6550.0 | 1.7357745 | 7.0446395 | 24.2898031 |
| PhD | 13 | 777 | 72.660232 | 16.3281547 | 75.0 | 73.922954 | 17.79120 | 8.0 | 103.0 | 95.0 | -0.7652067 | 0.5442923 | 0.5857693 |
| Terminal | 14 | 777 | 79.702703 | 14.7223585 | 82.0 | 81.102729 | 14.82600 | 24.0 | 100.0 | 76.0 | -0.8133924 | 0.2244365 | 0.5281617 |
| S.F.Ratio | 15 | 777 | 14.089704 | 3.9583491 | 13.6 | 13.935795 | 3.40998 | 2.5 | 39.8 | 37.3 | 0.6648606 | 2.5228017 | 0.1420050 |
| perc.alumni | 16 | 777 | 22.743887 | 12.3918015 | 21.0 | 21.857143 | 13.34340 | 0.0 | 64.0 | 64.0 | 0.6045500 | -0.1113466 | 0.4445534 |
| Expend | 17 | 777 | 9660.171171 | 5221.7684399 | 8377.0 | 8823.704655 | 2730.94920 | 3186.0 | 56233.0 | 53047.0 | 3.4459767 | 18.5875365 | 187.3298993 |
| Grad.Rate | 18 | 777 | 65.463320 | 17.1777099 | 65.0 | 65.601926 | 17.79120 | 10.0 | 118.0 | 108.0 | -0.1133384 | -0.2187930 | 0.6162469 |

Data Source: US Colleges from the 1995 issue of US News and World Report. (* Categorical Variable)
#Splitting dataset into Train and Test######
set.seed(130)
traintestind <- createDataPartition(Project4_data$Private, p=0.70, list = FALSE)
traindata <- Project4_data[traintestind,] #assigning 70% of data to train
testdata <- Project4_data[-traintestind,] #assigning remaining 30% of data to test
head(traindata) %>%
kable(caption = "<center>Table 2: Head of Train dataset of College Dataset</center>",
align = "c")%>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
general_title = "Data Source:")
Table 2: Head of Train dataset of College Dataset

| College | Private | Apps | Accept | Enroll | Top10perc | Top25perc | F.Undergrad | P.Undergrad | Outstate | Room.Board | Books | Personal | PhD | Terminal | S.F.Ratio | perc.alumni | Expend | Grad.Rate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Adelphi University | Yes | 2186 | 1924 | 512 | 16 | 29 | 2683 | 1227 | 12280 | 6450 | 750 | 1500 | 29 | 30 | 12.2 | 16 | 10527 | 56 |
| Adrian College | Yes | 1428 | 1097 | 336 | 22 | 50 | 1036 | 99 | 11250 | 3750 | 400 | 1165 | 53 | 66 | 12.9 | 30 | 8735 | 54 |
| Agnes Scott College | Yes | 417 | 349 | 137 | 60 | 89 | 510 | 63 | 12960 | 5450 | 450 | 875 | 92 | 97 | 7.7 | 37 | 19016 | 59 |
| Alaska Pacific University | Yes | 193 | 146 | 55 | 16 | 44 | 249 | 869 | 7560 | 4120 | 800 | 1500 | 76 | 72 | 11.9 | 2 | 10922 | 15 |
| Albertson College | Yes | 587 | 479 | 158 | 38 | 62 | 678 | 41 | 13500 | 3335 | 500 | 675 | 67 | 73 | 9.4 | 11 | 9727 | 55 |
| Albertus Magnus College | Yes | 353 | 340 | 103 | 17 | 45 | 416 | 230 | 13290 | 5720 | 500 | 1500 | 90 | 93 | 11.5 | 26 | 8861 | 63 |

Data Source: US Colleges from the 1995 issue of US News and World Report.
head(testdata)%>%
kable(caption = "<center>Table 3: Head of Test dataset of College Dataset</center>",
align = "c")%>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "US Colleges from the 1995 issue of US News and World Report.\n",
general_title = "Data Source:")
Table 3: Head of Test dataset of College Dataset

| College | Private | Apps | Accept | Enroll | Top10perc | Top25perc | F.Undergrad | P.Undergrad | Outstate | Room.Board | Books | Personal | PhD | Terminal | S.F.Ratio | perc.alumni | Expend | Grad.Rate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Abilene Christian University | Yes | 1660 | 1232 | 721 | 23 | 52 | 2885 | 537 | 7440 | 3300 | 450 | 2200 | 70 | 78 | 18.1 | 12 | 7041 | 60 |
| Amherst College | Yes | 4302 | 992 | 418 | 83 | 96 | 1593 | 5 | 19760 | 5300 | 660 | 1598 | 93 | 98 | 8.4 | 63 | 21424 | 100 |
| Antioch University | Yes | 713 | 661 | 252 | 25 | 44 | 712 | 23 | 15476 | 3336 | 400 | 1100 | 69 | 82 | 11.3 | 35 | 42926 | 48 |
| Appalachian State University | No | 7313 | 4664 | 1910 | 20 | 63 | 9940 | 1035 | 6806 | 2540 | 96 | 2000 | 83 | 96 | 18.3 | 14 | 5854 | 70 |
| Aquinas College | Yes | 619 | 516 | 219 | 20 | 51 | 1251 | 767 | 11208 | 4124 | 350 | 1615 | 55 | 65 | 12.7 | 25 | 6584 | 65 |
| Arkansas Tech University | No | 1734 | 1729 | 951 | 12 | 52 | 3602 | 939 | 3460 | 2650 | 450 | 1000 | 57 | 60 | 19.6 | 5 | 4739 | 48 |

Data Source: US Colleges from the 1995 issue of US News and World Report.
#Converting dataframe into matrix to make glmnet compatible
#model.matrix() is used as Private is Categorical variable
traindata_x <- model.matrix(Grad.Rate ~., traindata)[,-1]
testdata_x <- model.matrix(Grad.Rate ~., testdata)[,-1]
traindata_y = traindata$Grad.Rate
testdata_y = testdata$Grad.Rate
Observation:
In this task, I loaded the College dataset and used describe() to
produce descriptive statistics for the dataset. The dataset has 18
variables; Private is a categorical variable, while the others are
continuous or discrete. I will use this dataset to run the
regularization regressions, Lasso (L1) and Ridge (L2), but before that
I split the dataset in two: the Train dataset holds 70% of the College
observations and the Test dataset holds the remaining 30%. The head
observations of the Train and Test datasets are displayed using kable().
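As a quick sanity check, the split proportions can be confirmed directly from the objects created above; a minimal sketch (the exact counts depend on the seed used in createDataPartition()):
#Sanity check on the 70/30 split
nrow(traindata) / nrow(Project4_data) #should be close to 0.70
nrow(testdata) / nrow(Project4_data)  #should be close to 0.30
#createDataPartition() also keeps the Private factor balanced across splits
prop.table(table(traindata$Private))
prop.table(table(testdata$Private))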
Ridge Regression:
Ridge regression uses L2 regularization, which applies a penalty
proportional to the square of the magnitude of the coefficients. In
Ridge regression the coefficients shrink toward zero but never become
exactly zero.
L2 regularization:
Adds a penalty equal to the square of the coefficient magnitudes.
Each coefficient is shrunk by the same factor, and none are
eliminated (Gero, 2023).
#Ridge Regression#####
#Best lambda value
set.seed(130)
crssval.ridge = cv.glmnet(traindata_x, traindata_y, nfolds = 10, alpha = 0) #Alpha 0 = Ridge, 1 = Lasso
plot(crssval.ridge)
(Figure: cross-validated MSE versus log(lambda) for the Ridge model, with dotted vertical lines at lambda.min and lambda.1se)
#Finding optimal Lambda value
minlamb.ridge = log(crssval.ridge$lambda.min) #minimum Lambda value
selamb.ridge = log(crssval.ridge$lambda.1se) #1Standard Error Lambda value
loglamb = rbind(minlamb.ridge,selamb.ridge)
col_n <- c("Lambda log Value")
row_n <- c("Lambda Min log value","Lambda 1 Standard Error log value")
dimnames(loglamb) <- list(row_n,col_n)
kable(loglamb,
caption = "<center>Lambda log values of Ridge Regression</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table a, Sub task 1",
general_title = "Task 1: ")
Lambda log values of Ridge Regression

| | Lambda log Value |
|---|---|
| Lambda Min log value | 1.212158 |
| Lambda 1 Standard Error log value | 3.258900 |

Task 1: Table a, Sub task 1
##Ridge Regression Model on Lambda Min######
modl.miniridge = glmnet(traindata_x, traindata_y, alpha = 0, lambda = crssval.ridge$lambda.min) #Alpha 0 = Ridge, 1 = Lasso
#print(modl.miniridge)
Dflaridmi = as.numeric(modl.miniridge$df[1])
Dfratlaridmi = as.numeric(modl.miniridge$dev.ratio[1] * 100)
Dflaridmivl = as.numeric(modl.miniridge$lambda[1])
minirdg = cbind(Dflaridmi,
round(Dfratlaridmi,3),
round(Dflaridmivl,3))
col_ridmin <- c("Degree of Freedom", "% Dev", "Lambda")
row_ridmin <- c("Min Ridge Regression")
dimnames(minirdg) <- list(row_ridmin,col_ridmin)
kable(minirdg,
caption = "<center>Ridge Min Regression on Train data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table b, Sub task 1",
general_title = "Task 1: ")
Ridge Min Regression on Train data

| | Degree of Freedom | % Dev | Lambda |
|---|---|---|---|
| Min Ridge Regression | 17 | 46.201 | 3.361 |

Task 1: Table b, Sub task 1
###Regression Coefficient Lambda Min######
modl.miniridge_coef = as.data.frame(as.matrix(coef(modl.miniridge)))
kable(modl.miniridge_coef,
caption = "<center>Min Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Ridge Regression Coefficient Min",
general_title = "Task 1: ")
Min Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 35.7641835 |
| PrivateYes | 3.8414462 |
| Apps | 0.0003850 |
| Accept | 0.0003265 |
| Enroll | 0.0003817 |
| Top10perc | 0.0917166 |
| Top25perc | 0.0852349 |
| F.Undergrad | 0.0000420 |
| P.Undergrad | -0.0013521 |
| Outstate | 0.0006718 |
| Room.Board | 0.0020998 |
| Books | -0.0012379 |
| Personal | -0.0028284 |
| PhD | 0.0396931 |
| Terminal | -0.0335658 |
| S.F.Ratio | 0.0703800 |
| perc.alumni | 0.2912263 |
| Expend | -0.0001139 |

Task 1: Ridge Regression Coefficient Min
##Ridge Regression Model by Lambda 1SE######
modl.seridge = glmnet(traindata_x, traindata_y, alpha = 0, lambda = crssval.ridge$lambda.1se) #alpha = 0 fits Ridge (L2 penalty)
#print(modl.seridge)
Dflaridse = as.numeric(modl.seridge$df[1])
Dfratlaridse = as.numeric(modl.seridge$dev.ratio[1] * 100)
Dflaridsevl = as.numeric(modl.seridge$lambda[1])
se1rdg = cbind(Dflaridse,
round(Dfratlaridse,3),
round(Dflaridsevl,3))
col_ridse <- c("Degree of Freedom", "% Dev", "Lambda")
row_ridse <- c("Lambda 1SE Ridge Regression")
dimnames(se1rdg) <- list(row_ridse,col_ridse)
kable(se1rdg,
caption = "<center>Ridge 1SE Regression on Train data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table c, Sub task 1",
general_title = "Task 1: ")
Ridge 1SE Regression on Train data

| | Degree of Freedom | % Dev | Lambda |
|---|---|---|---|
| Lambda 1SE Ridge Regression | 17 | 40.708 | 26.021 |

Task 1: Table c, Sub task 1
##Regression Coefficient Lambda 1SE######
modl.seridge_coef = as.data.frame(as.matrix(coef(modl.seridge)))
kable(modl.seridge_coef,
caption = "<center>1SE Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Ridge Regression Coefficient 1SE",
general_title = "Task 1: ")
1SE Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 42.7249657 |
| PrivateYes | 2.5934946 |
| Apps | 0.0001683 |
| Accept | 0.0001864 |
| Enroll | 0.0001015 |
| Top10perc | 0.0725714 |
| Top25perc | 0.0651564 |
| F.Undergrad | -0.0000197 |
| P.Undergrad | -0.0007152 |
| Outstate | 0.0004360 |
| Room.Board | 0.0012771 |
| Books | -0.0007472 |
| Personal | -0.0018947 |
| PhD | 0.0329862 |
| Terminal | 0.0219444 |
| S.F.Ratio | -0.0900863 |
| perc.alumni | 0.1596575 |
| Expend | 0.0001158 |

Task 1: Ridge Regression Coefficient 1SE
##Regression Coefficient with no Regularization######
noreg.rdg = lm(Grad.Rate ~ ., data = traindata)
#coef(noreg.rdg)
noreg.rdg_coef = as.data.frame(as.matrix(coef(noreg.rdg)))
kable(noreg.rdg_coef,
caption = "<center>No Regularization Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Regression Coefficient, No Regularization",
general_title = "Task 1: ")
No Regularization Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 34.6946731 |
| PrivateYes | 4.0971439 |
| Apps | 0.0010680 |
| Accept | -0.0007622 |
| Enroll | 0.0006467 |
| Top10perc | 0.0844861 |
| Top25perc | 0.0857064 |
| F.Undergrad | 0.0002285 |
| P.Undergrad | -0.0017100 |
| Outstate | 0.0009027 |
| Room.Board | 0.0024830 |
| Books | -0.0007731 |
| Personal | -0.0030138 |
| PhD | 0.0862238 |
| Terminal | -0.1062937 |
| S.F.Ratio | 0.0981707 |
| perc.alumni | 0.3499178 |
| Expend | -0.0004000 |

Task 1: Regression Coefficient, No Regularization
##view RMSE on Full data#######
pred.noreg.rdg = predict(noreg.rdg, new = testdata)
Full.rmse.rdg = rmse(testdata$Grad.Rate, pred.noreg.rdg)
#print(Full.rmse.rdg)
###Prediction Test dataset#######
testridge_pred = predict(modl.seridge, newx = testdata_x)
test.ridgermse = rmse(testdata_y, testridge_pred)
###Prediction Train dataset######
trainridge_pred = predict(modl.seridge, newx= traindata_x)
train.ridgermse = rmse(traindata_y, trainridge_pred) #root mean square error
##Comparing RMSE values#######
#print(Full.rmse.rdg)
#print(train.ridgermse)
#print(test.ridgermse)
CompRMSE = cbind(round(Full.rmse.rdg,3),
round(train.ridgermse,3),
round(test.ridgermse,3))
col_comprmse <- c("RMSE value Full dataset", "RMSE value Train dataset", "RMSE value Test dataset")
row_comprmse <- c("RMSE values")
dimnames(CompRMSE) <- list(row_comprmse,col_comprmse)
kable(CompRMSE,
caption = "<center>Comparison of RMSE values of Ridge Min, 1SE, and No Regression on Train, Test, and Full data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table d, Sub task 5 & 6",
general_title = "Task 1: ")
Comparison of RMSE values of Ridge Min, 1SE, and No Regression on Train, Test, and Full data

| | RMSE value Full dataset | RMSE value Train dataset | RMSE value Test dataset |
|---|---|---|---|
| RMSE values | 12.413 | 13.539 | 12.553 |

Task 1: Table d, Sub task 5 & 6
Observation:
In this project, I use Graduation Rate as the dependent variable and
examine which independent variables affect it. Ridge Regression has
been fitted on the Train dataset and evaluated on the Test dataset to
determine the performance of the model and to identify whether it
overfits. Before proceeding with the model, I examine the log value of
Lambda: the Lambda min and 1SE log values are 1.21 and 3.25,
respectively.
The significance of the lambda min value is that it is the value of the
regularization parameter that produces the smallest cross-validated
mean squared error (Hastie, 2001); in other words, it is the value of
lambda that gives the best prediction performance as measured by the
MSE.
The significance of the 1SE value is that it is the largest value of
the regularization parameter whose prediction performance is within one
standard error of the performance at lambda min (Hastie, 2001); it
gives a prediction performance almost as good as the best one, but with
less overfitting to the training data.
Both the lambda min and 1SE values are useful in determining the
appropriate level of regularization in Ridge Regression, as they help
to balance the trade-off between overfitting and underfitting.
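For completeness, the raw (non-log) lambda values and the corresponding cross-validated errors can be read straight from the cv.glmnet object; a minimal sketch using the crssval.ridge object fitted above:
#Inspect the cross-validation results stored in crssval.ridge
crssval.ridge$lambda.min #lambda giving the lowest cross-validated MSE
crssval.ridge$lambda.1se #largest lambda within 1 SE of the minimum CV error
#cvm holds the mean cross-validated error for every lambda tried
min(crssval.ridge$cvm)   #cross-validated MSE at lambda.min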
Using cross-validation, I extract the log value of Lambda, which can be
plotted to understand the number of variables retained at the Lambda
minimum and 1 Standard Error values. In the plot, the cross-validated
choices are marked with dotted vertical lines. The minimum and
1 Standard Error log values are 1.21 and 3.25, respectively. The number
of variables retained from the College dataset is 17, meaning all of
the other variables are used to model Graduation Rate. The same is
represented in Task 1: Table a.
Task 1: Table b presents the Lambda Min Ridge values. The Degrees of
Freedom, Deviance Ratio, and Lambda values are captured from the Train
dataset using glmnet(). Furthermore, the Task 1 coefficient table shows
the coefficient values of the glmnet() fit on the Train dataset. The
same procedure is repeated for the 1SE model and for the model with no
regularization.
The Root Mean Squared Error (RMSE) value is a
measure of the difference between the predicted values and the actual
values. It is a commonly used evaluation metric for regression models.
The RMSE value represents the average difference between the predicted
and actual values, and it is calculated by taking the square root of the
mean of the squared differences between the predicted and actual values.
A lower RMSE value indicates that the model has a better fit to the data
and is therefore making more accurate predictions (Gero, 2023).
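To make the metric concrete, here is a minimal sketch of the RMSE calculation, equivalent to Metrics::rmse(), using the ridge test predictions computed above:
#RMSE is the square root of the mean squared difference between actual and predicted values
manual_rmse <- sqrt(mean((testdata_y - testridge_pred)^2))
manual_rmse #should match rmse(testdata_y, testridge_pred)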
The last table, Task 1: Table d, compares three RMSE values: the RMSE
on the Full dataset, the Train dataset, and the Test dataset. As the
difference between these three values is negligible, the model is not
overfitting.
In all three coefficient tables, PrivateYes and perc.alumni influence
the graduation rate most strongly, followed by Top10perc, Top25perc,
the Student-to-Faculty ratio (except in the Lambda 1SE model), and PhD.
To summarize, Graduation Rate is influenced by the type of university
(PrivateYes), the percentage of alumni who donate, the percentage of
new students from the top 10% of their H.S. class, the percentage of
new students from the top 25% of their H.S. class, and the percentage
of faculty with PhDs.
Lasso:
The Least Absolute Shrinkage and Selection Operator, commonly known as
Lasso, is a popular regularization method.
L1 regularization:
Adds a penalty proportional to the absolute value of the coefficient
magnitudes, which restricts their size.
L1 can shrink coefficients all the way to zero and hence remove
variables from the model.
#LASSO######
#Best lambda value
set.seed(130)
crssval.lasso = cv.glmnet(traindata_x, traindata_y, nfolds = 10, alpha = 1)
plot(crssval.lasso)
(Figure: cross-validated MSE versus log(lambda) for the Lasso model, with dotted vertical lines at lambda.min and lambda.1se)
#Finding optimal Lambda value
minlamb.lasso = log(crssval.lasso$lambda.min) #minimum Lambda value
se1lamb.lasso = log(crssval.lasso$lambda.1se) #1Standard Error Lambda value
loglamblasso = rbind(minlamb.lasso,se1lamb.lasso)
col_lasso <- c("Lambda log Value: Lasso")
row_lasso <- c("Lambda Min log value","Lambda 1 Standard Error log value")
dimnames(loglamblasso) <- list(row_lasso,col_lasso)
kable(round(loglamblasso,3),
caption = "<center>Lambda log values of Lasso Regression</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table a, Sub task 1",
general_title = "Task 2: ")
Lambda log values of Lasso Regression

| | Lambda log Value: Lasso |
|---|---|
| Lambda Min log value | -0.672 |
| Lambda 1 Standard Error log value | 0.631 |

Task 2: Table a, Sub task 1
#Lasso Regression Model on Lambda Min
modl.minilasso = glmnet(traindata_x, traindata_y, alpha = 1, lambda = crssval.lasso$lambda.min)
#print(modl.minilasso)
Dflamlassomi = as.numeric(modl.minilasso$df[1])
Dfratlassomi = as.numeric(modl.minilasso$dev.ratio[1] * 100)
Dflamlassomivl = as.numeric(modl.minilasso$lambda[1])
lassominrdg = cbind(Dflamlassomi,
round(Dfratlassomi,2),
round(Dflamlassomivl,3))
col_lassomin <- c("Degree of Freedom", "% Dev", "Lambda")
row_lassomin <- c("Lasso Min Regression")
dimnames(lassominrdg) <- list(row_lassomin,col_lassomin)
kable(lassominrdg,
caption = "<center>Lasso Min Regression on Train data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table b, Sub task 1",
general_title = "Task 2: ")
Lasso Min Regression on Train data

| | Degree of Freedom | % Dev | Lambda |
|---|---|---|---|
| Lasso Min Regression | 9 | 45.51 | 0.511 |

Task 2: Table b, Sub task 1
#Regression Coefficient Lambda Min
lammin.lasso_coef = as.data.frame(as.matrix(coef(modl.minilasso)))
kable(lammin.lasso_coef,
caption = "<center>Lasso Lambda Min Regularization Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Lasso Regression Coefficient, Lambda Min",
general_title = "Task 2: ")
Lasso Lambda Min Regularization Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 37.1259921 |
| PrivateYes | 2.0139038 |
| Apps | 0.0005527 |
| Accept | 0.0000000 |
| Enroll | 0.0000000 |
| Top10perc | 0.0505645 |
| Top25perc | 0.0937967 |
| F.Undergrad | 0.0000000 |
| P.Undergrad | -0.0011761 |
| Outstate | 0.0007233 |
| Room.Board | 0.0018791 |
| Books | 0.0000000 |
| Personal | -0.0026094 |
| PhD | 0.0000000 |
| Terminal | 0.0000000 |
| S.F.Ratio | 0.0000000 |
| perc.alumni | 0.3198977 |
| Expend | 0.0000000 |

Task 2: Lasso Regression Coefficient, Lambda Min
#lasso Regression Model by Lambda 1SE
modl.selasso = glmnet(traindata_x, traindata_y, alpha = 1, lambda = crssval.lasso$lambda.1se) #alpha = 1 fits Lasso (L1 penalty)
#print(modl.selasso)
Dflamlassose1 = as.numeric(modl.selasso$df[1])
Dfratlassose1 = as.numeric(modl.selasso$dev.ratio[1] * 100)
Dflamlassose1vl = as.numeric(modl.selasso$lambda[1])
lassose1rdg = cbind(Dflamlassose1,
round(Dfratlassose1,2),
round(Dflamlassose1vl,3))
col_lassose1 <- c("Degree of Freedom", "% Dev", "Lambda")
row_lassose1 <- c("Lasso 1SE Regression")
dimnames(lassose1rdg) <- list(row_lassose1,col_lassose1)
kable(lassose1rdg,
caption = "<center>Lasso 1SE Regression on Train data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table c, Sub task 2",
general_title = "Task 2: ")
Lasso 1SE Regression on Train data

| | Degree of Freedom | % Dev | Lambda |
|---|---|---|---|
| Lasso 1SE Regression | 7 | 40.98 | 1.879 |

Task 2: Table c, Sub task 2
#Regression Coefficient Lambda 1SE
lam1se.lasso_coef = as.data.frame(as.matrix(coef(modl.selasso)))
kable(lam1se.lasso_coef,
caption = "<center>Lasso Lambda 1SE Regularization Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Lasso Regression Coefficient, Lambda 1SE",
general_title = "Task 2: ")
Lasso Lambda 1SE Regularization Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 40.6998598 |
| PrivateYes | 0.0000000 |
| Apps | 0.0000000 |
| Accept | 0.0000000 |
| Enroll | 0.0000000 |
| Top10perc | 0.0760849 |
| Top25perc | 0.0631056 |
| F.Undergrad | 0.0000000 |
| P.Undergrad | -0.0001614 |
| Outstate | 0.0009352 |
| Room.Board | 0.0011986 |
| Books | 0.0000000 |
| Personal | -0.0013104 |
| PhD | 0.0000000 |
| Terminal | 0.0000000 |
| S.F.Ratio | 0.0000000 |
| perc.alumni | 0.2610037 |
| Expend | 0.0000000 |

Task 2: Lasso Regression Coefficient, Lambda 1SE
#Regression Coefficient with no Regularization
noreg.lasso = lm(Grad.Rate ~ ., data = traindata)
noreg.lasso_coef = as.data.frame(as.matrix(coef(noreg.lasso)))
kable(noreg.lasso_coef,
caption = "<center>No Regularization Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Regression Coefficient, No Regularization",
general_title = "Task 2: ")
No Regularization Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 34.6946731 |
| PrivateYes | 4.0971439 |
| Apps | 0.0010680 |
| Accept | -0.0007622 |
| Enroll | 0.0006467 |
| Top10perc | 0.0844861 |
| Top25perc | 0.0857064 |
| F.Undergrad | 0.0002285 |
| P.Undergrad | -0.0017100 |
| Outstate | 0.0009027 |
| Room.Board | 0.0024830 |
| Books | -0.0007731 |
| Personal | -0.0030138 |
| PhD | 0.0862238 |
| Terminal | -0.1062937 |
| S.F.Ratio | 0.0981707 |
| perc.alumni | 0.3499178 |
| Expend | -0.0004000 |

Task 2: Regression Coefficient, No Regularization
#view RMSE on Full data
pred.noreg = predict(noreg.lasso, new = testdata)
Full.rmse = rmse(testdata$Grad.Rate, pred.noreg)
#print(Full.rmse)
#Prediction Test dataset
testlasso_pred = predict(modl.selasso, newx = testdata_x)
test.lassormse = rmse(testdata_y, testlasso_pred)
#Prediction Train dataset
trainlasso_pred = predict(modl.selasso, newx= traindata_x)
train.lassormse = rmse(traindata_y, trainlasso_pred) #root mean square error
#Comparing RMSE values
#print(Full.rmse)
#print(train.lassormse)
#print(test.lassormse)
CompRMSE_lasso = cbind(round(Full.rmse,3),
round(train.lassormse,3),
round(test.lassormse,3))
col_comprmse_lasso <- c("RMSE value Full dataset", "RMSE value Train dataset", "RMSE value Test dataset")
row_comprmse_lasso <- c("RMSE values")
dimnames(CompRMSE_lasso) <- list(row_comprmse_lasso,col_comprmse_lasso)
kable(CompRMSE_lasso,
caption = "<center>Comparison of RMSE values of Lasso Min, 1SE, and No Regression on Train, Test, and Full data</center>",
align = "c") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Table d, Sub task 5 & 6",
general_title = "Task 2: ")
Comparison of RMSE values of Lasso Min, 1SE, and No Regression on Train, Test, and Full data

| | RMSE value Full dataset | RMSE value Train dataset | RMSE value Test dataset |
|---|---|---|---|
| RMSE values | 12.413 | 13.508 | 12.604 |

Task 2: Table d, Sub task 5 & 6
Observation:
Continuing with regularization, Lasso regression is applied to the
Train and Test datasets to understand how Graduation Rate is influenced
by the other variables. Before proceeding, we need to find Lambda Min
and Lambda 1SE using cross-validation.
In Lasso Regression, the regularization term is a penalty for having
large coefficients on the features, and the size of the penalty is
controlled by the lambda parameter. The goal is to find a balance
between fitting the data well and having a simple model with small
coefficients on the features (Gero, 2023).
Lambda min is the value of the lambda parameter that gives
the minimum cross-validated mean square error (MSE) for the model. This
is the value that would be chosen if the goal were to minimize the MSE
(Hastie, 2001).
The one-standard-error (1SE) rule accounts for the uncertainty in the
estimated MSE across the cross-validation folds. The 1SE value of
lambda is the largest lambda that gives a model whose MSE is within one
standard error of the MSE of the model at lambda min (Hastie, 2001).
The significance of Lambda min and 1SE values in
Lasso Regression is that they can help to determine the best trade-off
between model fit and complexity. Choosing the lambda value that gives
the lowest MSE may result in overfitting, while choosing a higher lambda
value may result in underfitting. The 1SE value provides a measure of
the uncertainty in the MSE and can be used to choose a more robust model
that is less sensitive to the choice of data (Hastie, 2001; Gero,
2023).
Using cross-validation, I extract the log value of Lambda, which can be
plotted to understand the number of variables retained at the Lambda
minimum and 1 Standard Error values. In the plot, the cross-validated
choices are marked with dotted vertical lines. The minimum and
1 Standard Error log values are -0.67 and 0.63, respectively. The
number of variables retained from the College dataset is between 7 and
11, meaning these variables affect Graduation Rate. The same is
represented in Task 2: Table a. Task 2: Table b presents the Lambda Min
Lasso values. The Degrees of Freedom, Deviance Ratio, and Lambda values
are captured from the Train dataset using glmnet(). Furthermore, the
Task 2 coefficient tables show the coefficient values of the glmnet()
fits on the Train dataset. The same procedure is repeated for the 1SE
model and for the model with no regularization. The Root Mean Squared
Error (RMSE) is a measure of the difference between predicted and
actual values and is a commonly used evaluation metric for regression
models; a lower RMSE indicates a better fit and therefore more accurate
predictions (Gero, 2023). The last table, Task 2: Table d, compares
three RMSE values: the RMSE on the Full dataset, the Train dataset, and
the Test dataset. As the difference between these three values is
negligible, the model is not overfitting.
Furthermore, in all three coefficient tables perc.alumni influences the
graduation rate most strongly, followed by Top25perc. The noticeable
difference between Ridge and Lasso is that Lasso eliminates some
variables and keeps only the most influential ones, and this
elimination differs between Lambda Min and Lambda 1SE. In the Task 2
Lambda Min coefficients, the Accept, Enroll, F.Undergrad, Books, PhD,
Terminal, S.F.Ratio, and Expend variables are eliminated. In the Lambda
1SE coefficient table, in addition to those eliminated at Lambda Min,
the Private(Yes) and Apps variables are also eliminated.
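The retained variables can also be listed directly from the cross-validation object; a minimal sketch using crssval.lasso from above:
#List the predictors with nonzero coefficients at each lambda
coef_min <- coef(crssval.lasso, s = "lambda.min")
coef_1se <- coef(crssval.lasso, s = "lambda.1se")
rownames(coef_min)[as.matrix(coef_min) != 0] #variables kept at lambda.min
rownames(coef_1se)[as.matrix(coef_1se) != 0] #variables kept at lambda.1se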
Surprisingly, the Private(Yes) variable has the largest impact on
Graduation Rate in Ridge Regression, but in Lasso at the 1SE Lambda
value it is eliminated. To summarize, Graduation Rate is influenced by
the percentage of alumni who donate in all three regression models,
and except at the 1SE Lambda, Private(Yes) is also an influential
variable. At the Lambda 1SE value a total of 7 variables are retained
as influencing Graduation Rate, and at the Lambda Min value 9
variables.
The Task 2: Table d shows the RMSE values of three models and the
difference between them is negligible, so we can conclude that Lasso
models are not overfitting.
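As a final check, the test-set RMSE values of the regularized and unregularized fits can be placed side by side; a minimal sketch reusing the RMSE objects computed earlier in this report:
#Side-by-side comparison of the test-set RMSE values computed above
rmse_summary <- data.frame(Model = c("No Regularization (OLS)", "Ridge (lambda.1se)", "Lasso (lambda.1se)"),
                           Test.RMSE = c(round(Full.rmse, 3), round(test.ridgermse, 3), round(test.lassormse, 3)))
rmse_summary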
Task 12 & 13:
#Optimal model using step()
optmod = step(lm(Grad.Rate ~ ., data = Project4_data), direction = "both") #finding optimal model
## Start: AIC=3972.98
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + Books +
## Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## - S.F.Ratio 1 36.8 123332 3971.2
## - Books 1 94.1 123389 3971.6
## - Top10perc 1 95.1 123390 3971.6
## - Accept 1 105.8 123401 3971.6
## - Enroll 1 142.2 123437 3971.9
## - F.Undergrad 1 224.5 123519 3972.4
## - Terminal 1 233.4 123528 3972.4
## <none> 123295 3973.0
## - PhD 1 383.4 123678 3973.4
## - Private 1 645.3 123940 3975.0
## - Personal 1 758.7 124054 3975.7
## - Top25perc 1 981.7 124277 3977.1
## - Apps 1 1403.4 124698 3979.8
## - Expend 1 1424.2 124719 3979.9
## - Room.Board 1 1705.5 125000 3981.7
## - P.Undergrad 1 2348.7 125644 3985.6
## - Outstate 1 3086.3 126381 3990.2
## - perc.alumni 1 5241.6 128537 4003.3
##
## Step: AIC=3971.21
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + Books +
## Personal + PhD + Terminal + perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## - Books 1 91.7 123423 3969.8
## - Top10perc 1 93.4 123425 3969.8
## - Accept 1 109.1 123441 3969.9
## - Enroll 1 141.2 123473 3970.1
## - F.Undergrad 1 216.4 123548 3970.6
## - Terminal 1 237.6 123569 3970.7
## <none> 123332 3971.2
## - PhD 1 400.2 123732 3971.7
## + S.F.Ratio 1 36.8 123295 3973.0
## - Private 1 613.0 123945 3973.1
## - Personal 1 788.4 124120 3974.2
## - Top25perc 1 978.6 124310 3975.3
## - Apps 1 1426.8 124759 3978.1
## - Room.Board 1 1705.0 125037 3979.9
## - Expend 1 1854.2 125186 3980.8
## - P.Undergrad 1 2356.0 125688 3983.9
## - Outstate 1 3055.0 126387 3988.2
## - perc.alumni 1 5207.4 128539 4001.3
##
## Step: AIC=3969.79
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + Personal +
## PhD + Terminal + perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## - Top10perc 1 86.7 123510 3968.3
## - Accept 1 110.1 123533 3968.5
## - Enroll 1 140.9 123564 3968.7
## - F.Undergrad 1 218.1 123641 3969.2
## - Terminal 1 279.1 123703 3969.5
## <none> 123423 3969.8
## - PhD 1 469.5 123893 3970.7
## + Books 1 91.7 123332 3971.2
## - Private 1 599.2 124023 3971.5
## + S.F.Ratio 1 34.3 123389 3971.6
## - Personal 1 908.6 124332 3973.5
## - Top25perc 1 965.8 124389 3973.8
## - Apps 1 1425.9 124849 3976.7
## - Room.Board 1 1638.9 125062 3978.0
## - Expend 1 1885.5 125309 3979.6
## - P.Undergrad 1 2365.4 125789 3982.5
## - Outstate 1 3107.2 126531 3987.1
## - perc.alumni 1 5293.4 128717 4000.4
##
## Step: AIC=3968.33
## Grad.Rate ~ Private + Apps + Accept + Enroll + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal +
## perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## - Enroll 1 181.1 123691 3967.5
## - Accept 1 208.0 123718 3967.6
## - F.Undergrad 1 227.5 123738 3967.8
## - Terminal 1 315.0 123825 3968.3
## <none> 123510 3968.3
## - PhD 1 533.2 124043 3969.7
## + Top10perc 1 86.7 123423 3969.8
## + Books 1 85.0 123425 3969.8
## + S.F.Ratio 1 32.9 123477 3970.1
## - Private 1 628.0 124138 3970.3
## - Personal 1 902.6 124413 3972.0
## - Room.Board 1 1602.8 125113 3976.3
## - Expend 1 1816.3 125326 3977.7
## - Apps 1 1835.9 125346 3977.8
## - P.Undergrad 1 2491.0 126001 3981.8
## - Outstate 1 3241.9 126752 3986.5
## - Top25perc 1 4063.5 127574 3991.5
## - perc.alumni 1 5334.0 128844 3999.2
##
## Step: AIC=3967.47
## Grad.Rate ~ Private + Apps + Accept + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal +
## perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## - F.Undergrad 1 53.0 123744 3965.8
## - Accept 1 95.1 123786 3966.1
## <none> 123691 3967.5
## - Terminal 1 338.0 124029 3967.6
## + Enroll 1 181.1 123510 3968.3
## + Top10perc 1 126.9 123564 3968.7
## - PhD 1 545.3 124236 3968.9
## + Books 1 83.2 123608 3968.9
## + S.F.Ratio 1 31.5 123660 3969.3
## - Private 1 631.2 124322 3969.4
## - Personal 1 895.3 124586 3971.1
## - Room.Board 1 1523.1 125214 3975.0
## - Expend 1 1715.1 125406 3976.2
## - Apps 1 1720.1 125411 3976.2
## - P.Undergrad 1 2613.8 126305 3981.7
## - Outstate 1 3190.0 126881 3985.3
## - Top25perc 1 4126.3 127817 3991.0
## - perc.alumni 1 5621.6 129313 4000.0
##
## Step: AIC=3965.8
## Grad.Rate ~ Private + Apps + Accept + Top25perc + P.Undergrad +
## Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni +
## Expend
##
## Df Sum of Sq RSS AIC
## - Accept 1 251.7 123996 3965.4
## <none> 123744 3965.8
## - Terminal 1 350.2 124094 3966.0
## + Top10perc 1 102.4 123642 3967.2
## - PhD 1 549.6 124294 3967.2
## + Books 1 85.4 123659 3967.3
## + F.Undergrad 1 53.0 123691 3967.5
## + S.F.Ratio 1 26.2 123718 3967.6
## + Enroll 1 6.5 123738 3967.8
## - Private 1 749.8 124494 3968.5
## - Personal 1 973.0 124717 3969.9
## - Room.Board 1 1572.3 125317 3973.6
## - Expend 1 1750.9 125495 3974.7
## - Apps 1 1775.8 125520 3974.9
## - P.Undergrad 1 3195.5 126940 3983.6
## - Outstate 1 3415.9 127160 3985.0
## - Top25perc 1 4094.9 127839 3989.1
## - perc.alumni 1 5579.5 129324 3998.1
##
## Step: AIC=3965.38
## Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate +
## Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
##
## Df Sum of Sq RSS AIC
## <none> 123996 3965.4
## + Accept 1 251.7 123744 3965.8
## - Terminal 1 391.0 124387 3965.8
## + F.Undergrad 1 209.5 123786 3966.1
## + Top10perc 1 196.1 123800 3966.2
## - PhD 1 524.0 124520 3966.7
## + Books 1 86.5 123909 3966.8
## + Enroll 1 63.4 123932 3967.0
## + S.F.Ratio 1 25.0 123971 3967.2
## - Private 1 785.5 124781 3968.3
## - Personal 1 992.0 124988 3969.6
## - Expend 1 1512.5 125508 3972.8
## - Room.Board 1 1705.6 125701 3974.0
## - Outstate 1 3221.2 127217 3983.3
## - P.Undergrad 1 3449.0 127445 3984.7
## - Top25perc 1 4503.8 128500 3991.1
## - Apps 1 5016.2 129012 3994.2
## - perc.alumni 1 5748.0 129744 3998.6
#summary(optmod)
optimodel = lm(formula = Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad +
Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend, data = Project4_data) #optimal model
optimodel_coef = data.frame((coef(optimodel))) #coefficient of model
kable(optimodel_coef,
caption = "<center> Table 13.1 Stepwise Selection Fit Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend",
general_title = "Task 13: ")
Table 13.1 Stepwise Selection Fit Model Coefficient

| | Coefficient |
|---|---|
| (Intercept) | 33.4888648 |
| PrivateYes | 3.5847682 |
| Apps | 0.0008950 |
| Top25perc | 0.1697318 |
| P.Undergrad | -0.0016749 |
| Outstate | 0.0010061 |
| Room.Board | 0.0018799 |
| Personal | -0.0018516 |
| PhD | 0.0997365 |
| Terminal | -0.0950484 |
| perc.alumni | 0.2887259 |
| Expend | -0.0003942 |

Task 13: Model: Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
pred.optimodel = predict(optimodel, new = Project4_data) #predict model
pred.optimodel.rmse = rmse(Project4_data$Grad.Rate, pred.optimodel) #RMSE value on actual and predict
kable(pred.optimodel.rmse,
caption = "<center>Table 13.2 RMSE value</center>",
align = "c",
col.names = "RMSE value") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Model:Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend",
general_title = "Task 13: ")
Table 13.2 RMSE value

| RMSE value |
|---|
| 12.63261 |

Task 13: Model: Grad.Rate ~ Private + Apps + Top25perc + P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + perc.alumni + Expend
Observation:
In this task, I use the stepwise selection function in both directions
to find the best model for Graduation Rate and compare its RMSE with
Ridge and Lasso.
After running step(lm()), I can compare seven candidate models and
select the one with the lowest AIC; the lowest AIC indicates the
best-fitting model among those compared. In Table 13.1, the
coefficients of the stepwise-selected model show that Private(Yes), the
percentage of new students from the top 25% of their H.S. class, the
percentage of faculty with PhDs, and the percentage of alumni who
donate influence the Graduation Rate.
Table 13.2 shows the RMSE of this model; compared to Lasso and Ridge, I
can conclude that the model is not overfit, as the difference between
the RMSE values is negligible.
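Because the RMSE in Table 13.2 is computed on the full dataset, a fairer comparison with the Ridge and Lasso results would refit the selected formula on the training split and score it on the held-out test split; a minimal sketch, assuming the traindata and testdata objects from Task 1 are still in the workspace:
#Evaluate the stepwise-selected formula on the held-out test data
optimodel_train <- lm(formula(optimodel), data = traindata) #refit on the 70% training split
pred_test_step <- predict(optimodel_train, newdata = testdata)
rmse(testdata$Grad.Rate, pred_test_step) #directly comparable to the Ridge/Lasso test RMSE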
Reference:
Bluman, A. (2014). Elementary statistics: A step by step approach. McGraw-Hill Education.
Gero, E. (2023). ALY6015_Module3_Logistic_Regression [Lecture recording]. Canvas@Northeastern University. https://canvas.northeastern.edu/
Hastie, T., Tibshirani, R., & Friedman, J. H. (2001). The elements of statistical learning: Data mining, inference, and prediction. Springer.
Kabacoff, R. (2015). R in Action. Manning Publications Co.