Machine Learning Ex3

Eldad Aviv: 206836165

Exercise

Pre Process

Load Libraries

library(dplyr)
library(ggplot2)

library(ISLR)

library(caret)
library(rsample)
library(yardstick)
library(recipes)

Use the “U.S. News and World Report’s College Data” dataset (‘College’ in ISLR). this dataset contains 777 observations of US colleges with the following variables:

data("College", package = "ISLR")
head(College) 
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55

Lets predict Grad.Rate (Graduation rate) from these 17 variables.

1) Split to train and test. use 0.7 for the train data

set.seed(123444) 
splits <- initial_split(College, prop = 0.7)
College.train <- training(splits)
College.test <- testing(splits)

2) Then, use each of the learned methods to answer this task. That is:

i. Best Subset Selection

Show best regression model to predict Grad.Rate with 1-8 parameters

regfit.full <- leaps::regsubsets(Grad.Rate ~ ., data = College.train)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Grad.Rate ~ ., data = College.train)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Outstate        FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) " "        " "  " "    " "    " "       "*"       " "        
## 3  ( 1 ) " "        " "  " "    " "    " "       "*"       " "        
## 4  ( 1 ) " "        " "  " "    " "    " "       "*"       " "        
## 5  ( 1 ) " "        "*"  " "    " "    " "       "*"       " "        
## 6  ( 1 ) " "        "*"  " "    " "    " "       "*"       " "        
## 7  ( 1 ) " "        "*"  " "    " "    " "       "*"       " "        
## 8  ( 1 ) "*"        "*"  " "    " "    " "       "*"       " "        
##          P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         "*"      " "        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         "*"      " "        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"      " "        " "   "*"      " " " "      " "      
## 4  ( 1 ) " "         "*"      " "        " "   "*"      " " " "      " "      
## 5  ( 1 ) "*"         "*"      " "        " "   " "      " " " "      " "      
## 6  ( 1 ) "*"         "*"      " "        " "   "*"      " " " "      " "      
## 7  ( 1 ) "*"         "*"      "*"        " "   "*"      " " " "      " "      
## 8  ( 1 ) "*"         "*"      "*"        " "   "*"      " " " "      " "      
##          perc.alumni Expend
## 1  ( 1 ) " "         " "   
## 2  ( 1 ) " "         " "   
## 3  ( 1 ) " "         " "   
## 4  ( 1 ) "*"         " "   
## 5  ( 1 ) "*"         " "   
## 6  ( 1 ) "*"         " "   
## 7  ( 1 ) "*"         " "   
## 8  ( 1 ) "*"         " "

Best Subset: Results & Plot

# Assuming reg.summary is the summary of regfit.full
reg.summary <- summary(regfit.full)

# Extract relevant components from reg.summary
rsq <- reg.summary$rsq
rss <- reg.summary$rss
adjr2 <- reg.summary$adjr2
cp <- reg.summary$cp
bic <- reg.summary$bic

# Combine the components into a data frame
reg.summary_df <- data.frame(model_size = 1:length(rsq), rsq = rsq, rss = rss, adjr2 = adjr2, cp = cp, bic = bic)

# Pivot the data frame to long format
reg.summary_df <- tidyr::pivot_longer(reg.summary_df, cols = -model_size, names_to = "Index", values_to = "value")

# Print the resulting data frame to verify
print(reg.summary_df)
## # A tibble: 40 × 3
##    model_size Index      value
##         <int> <chr>      <dbl>
##  1          1 rsq        0.303
##  2          1 rss   114200.   
##  3          1 adjr2      0.302
##  4          1 cp       152.   
##  5          1 bic     -184.   
##  6          2 rsq        0.374
##  7          2 rss   102620.   
##  8          2 adjr2      0.372
##  9          2 cp        84.1  
## 10          2 bic     -236.   
## # ℹ 30 more rows
# Plotting
ggplot(reg.summary_df, aes(x = model_size, y = value)) +
  facet_wrap(~Index, scales = "free", ncol = 3) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:length(rsq), minor_breaks = NULL) +
  theme_minimal() +
  labs(x = "Model Size", y = "Value")

Report:

Model Size Impact: The performance metrics suggest that increasing the number of predictors generally improves model fit, but there is a point of diminishing returns.

Optimal Model Size: Based on BIC and Cp, an optimal model size likely lies around 5-6 predictors, balancing model fit and complexity.

ii. Ridge regression

Pre Process:

# Dummy Coding:
rec <- recipe(Grad.Rate ~ ., data = College.train) |> 
  step_dummy(all_factor_predictors())
# Scaling:
rec <- rec |> 
  step_scale(all_numeric_predictors())

Tuning: Test 50 optional Lambda values range from 0.25 to 1024

Train Control: Use 5-fold Cross Validation

tg <- expand.grid(
  alpha = 0, # Alpha(0-1) # 0 = ridge, 1 = lasso 
  lambda = 2 ^ seq(10, -2, length = 50) # [0, Inf]
)

tc <- trainControl(method = "cv", number = 5)

Train Model:

set.seed(1)
rigreg_fit <- train(
  x = rec,
  data = College.train,
  method = "glmnet",
  tuneGrid = tg,
  trControl =  tc
)
## Loading required namespace: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-8
rigreg_fit
## glmnet 
## 
## 543 samples
##  17 predictor
## 
## Recipe steps: dummy, scale 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 435, 436, 434, 433, 434 
## Resampling results across tuning parameters:
## 
##   lambda        RMSE      Rsquared   MAE      
##      0.2500000  13.15738  0.4334813   9.840874
##      0.2962522  13.15738  0.4334813   9.840874
##      0.3510616  13.15738  0.4334813   9.840874
##      0.4160111  13.15738  0.4334813   9.840874
##      0.4929769  13.15738  0.4334813   9.840874
##      0.5841820  13.15738  0.4334813   9.840874
##      0.6922609  13.15738  0.4334813   9.840874
##      0.8203354  13.15738  0.4334813   9.840874
##      0.9721047  13.15744  0.4334948   9.841268
##      1.1519528  13.14490  0.4343909   9.835751
##      1.3650744  13.13235  0.4353021   9.830603
##      1.6176254  13.12025  0.4361997   9.826409
##      1.9169006  13.10880  0.4370766   9.822453
##      2.2715443  13.09839  0.4379158   9.819318
##      2.6918004  13.08945  0.4386964   9.817676
##      3.1898076  13.08253  0.4393939   9.817515
##      3.7799505  13.07822  0.4399835   9.820988
##      4.4792752  13.07694  0.4404590   9.828086
##      5.3079812  13.07955  0.4407862   9.839702
##      6.2900053  13.08694  0.4409360   9.854772
##      7.4537126  13.09996  0.4408859   9.876659
##      8.8327161  13.11952  0.4406165   9.906774
##     10.4668477  13.14645  0.4401265   9.944062
##     12.4033082  13.18192  0.4393941   9.988502
##     14.6980313  13.22689  0.4384314  10.042480
##     17.4172986  13.28247  0.4372499  10.106641
##     20.6396548  13.34962  0.4358814  10.179897
##     24.4581758  13.42972  0.4343441  10.264028
##     28.9831573  13.52345  0.4326923  10.371159
##     34.3453008  13.63190  0.4309561  10.487436
##     40.6994890  13.75598  0.4291753  10.614097
##     48.2292588  13.89594  0.4273962  10.758432
##     57.1521035  14.05127  0.4256617  10.912526
##     67.7257544  14.22132  0.4239964  11.079077
##     80.2556253  14.40419  0.4224254  11.255463
##     95.1036345  14.59781  0.4209577  11.437809
##    112.6986584  14.79947  0.4196030  11.619849
##    133.5489192  15.00514  0.4183785  11.805880
##    158.2566649  15.21108  0.4172804  11.993246
##    187.5355648  15.41415  0.4162993  12.177008
##    222.2313233  15.61067  0.4154312  12.357330
##    263.3461078  15.79844  0.4146644  12.531203
##    312.0674955  15.97536  0.4139911  12.695010
##    369.8027761  16.13877  0.4134129  12.845386
##    438.2196006  16.28862  0.4129106  12.982200
##    519.2941501  16.42444  0.4124770  13.106092
##    615.3682172  16.54625  0.4121036  13.216374
##    729.2168469  16.65497  0.4117815  13.315462
##    864.1284923  16.75110  0.4115058  13.402926
##   1024.0000000  16.83507  0.4112732  13.479363
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 4.479275.
rigreg_fit$bestTune # gives the row number for best lambda
##    alpha   lambda
## 18     0 4.479275
ggplot(rigreg_fit) +
  scale_x_continuous(trans = scales::transform_log(),
                     breaks = scales::breaks_log())

(bestlambda <- rigreg_fit$bestTune$lambda)
## [1] 4.479275

Tuning Report:

RMSE was used to select the optimal model using the smallest value. The final values used for the model were alpha = 0 and lambda = 4.479275.

Results:

#Predict:
College.test$ridge.pred <- predict(rigreg_fit, newdata = College.test)
# Uses the best lambda

# EVALUATE the RMSE on the TEST set, associated with this value of lambda:
rsq(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.409
rmse(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        12.8
# (D) Coefficients:
## We can extract the model's coefficients according to lambda:
# The coef of the chosen model:
coef(rigreg_fit$finalModel, s = bestlambda)   
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 36.26762601
## Apps         1.57751842
## Accept       0.78122451
## Enroll       0.25593133
## Top10perc    2.09175077
## Top25perc    2.23440870
## F.Undergrad -0.15629291
## P.Undergrad -1.95074512
## Outstate     2.26372251
## Room.Board   1.70501670
## Books       -0.33336233
## Personal    -1.91212488
## PhD          0.89935594
## Terminal     0.08611352
## S.F.Ratio    0.04435657
## perc.alumni  2.60654639
## Expend      -0.74491634
## Private_Yes  1.92023341
# or other lambdas...
# E.g. for lambda = 0.0000, (this result should be similar to OLS result)
coef(rigreg_fit$finalModel, s = 0)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 34.85709510
## Apps         2.54264863
## Accept       0.31582693
## Enroll       0.35851784
## Top10perc    2.34289740
## Top25perc    2.38213510
## F.Undergrad -0.06730372
## P.Undergrad -2.22067614
## Outstate     2.61115292
## Room.Board   1.92105792
## Books       -0.39785802
## Personal    -1.99793652
## PhD          1.28280488
## Terminal    -0.42695228
## S.F.Ratio    0.17585818
## perc.alumni  3.02144956
## Expend      -1.75113926
## Private_Yes  2.24699814

Ridge - Best Lambda R^2: 0.4085047

iii. Lasso

tg$alpha <- 1 # Adapt alpha for lasso, keep everything else same as ridge regression
tg
##    alpha       lambda
## 1      1 1024.0000000
## 2      1  864.1284923
## 3      1  729.2168469
## 4      1  615.3682172
## 5      1  519.2941501
## 6      1  438.2196006
## 7      1  369.8027761
## 8      1  312.0674955
## 9      1  263.3461078
## 10     1  222.2313233
## 11     1  187.5355648
## 12     1  158.2566649
## 13     1  133.5489192
## 14     1  112.6986584
## 15     1   95.1036345
## 16     1   80.2556253
## 17     1   67.7257544
## 18     1   57.1521035
## 19     1   48.2292588
## 20     1   40.6994890
## 21     1   34.3453008
## 22     1   28.9831573
## 23     1   24.4581758
## 24     1   20.6396548
## 25     1   17.4172986
## 26     1   14.6980313
## 27     1   12.4033082
## 28     1   10.4668477
## 29     1    8.8327161
## 30     1    7.4537126
## 31     1    6.2900053
## 32     1    5.3079812
## 33     1    4.4792752
## 34     1    3.7799505
## 35     1    3.1898076
## 36     1    2.6918004
## 37     1    2.2715443
## 38     1    1.9169006
## 39     1    1.6176254
## 40     1    1.3650744
## 41     1    1.1519528
## 42     1    0.9721047
## 43     1    0.8203354
## 44     1    0.6922609
## 45     1    0.5841820
## 46     1    0.4929769
## 47     1    0.4160111
## 48     1    0.3510616
## 49     1    0.2962522
## 50     1    0.2500000

Train Model:

set.seed(1)
lasso_fit <- train(
  x = rec,
  data = College.train,
  method = "glmnet",
  tuneGrid = tg,
  trControl =  tc
)
## Warning in train_rec(rec = x, dat = data, info = trainInfo, method = models, :
## There were missing values in resampled performance measures.
lasso_fit
## glmnet 
## 
## 543 samples
##  17 predictor
## 
## Recipe steps: dummy, scale 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 435, 436, 434, 433, 434 
## Resampling results across tuning parameters:
## 
##   lambda        RMSE      Rsquared   MAE      
##      0.2500000  13.14514  0.4333522   9.832743
##      0.2962522  13.14326  0.4333874   9.830639
##      0.3510616  13.13573  0.4340384   9.823504
##      0.4160111  13.12959  0.4346848   9.822579
##      0.4929769  13.12871  0.4350031   9.825902
##      0.5841820  13.13005  0.4352410   9.829782
##      0.6922609  13.14418  0.4346154   9.851283
##      0.8203354  13.17227  0.4331366   9.892778
##      0.9721047  13.21679  0.4307247   9.949169
##      1.1519528  13.27837  0.4274935  10.042581
##      1.3650744  13.36151  0.4230700  10.167165
##      1.6176254  13.45692  0.4185612  10.278071
##      1.9169006  13.57141  0.4136273  10.387158
##      2.2715443  13.71277  0.4075155  10.510589
##      2.6918004  13.88709  0.4004508  10.666984
##      3.1898076  14.10423  0.3922077  10.870892
##      3.7799505  14.35695  0.3847889  11.120667
##      4.4792752  14.66575  0.3781722  11.426976
##      5.3079812  15.05824  0.3714270  11.805156
##      6.2900053  15.56087  0.3632781  12.276243
##      7.4537126  16.21302  0.3408888  12.882574
##      8.8327161  16.95579  0.3138436  13.571147
##     10.4668477  17.34800        NaN  13.943065
##     12.4033082  17.34800        NaN  13.943065
##     14.6980313  17.34800        NaN  13.943065
##     17.4172986  17.34800        NaN  13.943065
##     20.6396548  17.34800        NaN  13.943065
##     24.4581758  17.34800        NaN  13.943065
##     28.9831573  17.34800        NaN  13.943065
##     34.3453008  17.34800        NaN  13.943065
##     40.6994890  17.34800        NaN  13.943065
##     48.2292588  17.34800        NaN  13.943065
##     57.1521035  17.34800        NaN  13.943065
##     67.7257544  17.34800        NaN  13.943065
##     80.2556253  17.34800        NaN  13.943065
##     95.1036345  17.34800        NaN  13.943065
##    112.6986584  17.34800        NaN  13.943065
##    133.5489192  17.34800        NaN  13.943065
##    158.2566649  17.34800        NaN  13.943065
##    187.5355648  17.34800        NaN  13.943065
##    222.2313233  17.34800        NaN  13.943065
##    263.3461078  17.34800        NaN  13.943065
##    312.0674955  17.34800        NaN  13.943065
##    369.8027761  17.34800        NaN  13.943065
##    438.2196006  17.34800        NaN  13.943065
##    519.2941501  17.34800        NaN  13.943065
##    615.3682172  17.34800        NaN  13.943065
##    729.2168469  17.34800        NaN  13.943065
##    864.1284923  17.34800        NaN  13.943065
##   1024.0000000  17.34800        NaN  13.943065
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.4929769.

Plot Tuning:

ggplot(lasso_fit) +
  scale_x_continuous(trans = scales::transform_log(),
                     breaks = scales::breaks_log())

(bestlambda <- lasso_fit$bestTune$lambda)
## [1] 0.4929769

Tuning Report:

RMSE was used to select the optimal model using the smallest value. The final values used for the model were alpha = 1 and lambda = 0.4929769.

Results:

College.test$lasso.pred <- predict(lasso_fit, newdata = College.test) 

rmse(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        12.8
rsq(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.412
(coef <- coef(lasso_fit$finalModel, s = bestlambda))
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 37.10539315
## Apps         2.19789039
## Accept       .         
## Enroll       .         
## Top10perc    1.22355059
## Top25perc    2.90252377
## F.Undergrad  .         
## P.Undergrad -1.88603074
## Outstate     2.47777640
## Room.Board   1.53760036
## Books        .         
## Personal    -1.96178849
## PhD          0.37167324
## Terminal     .         
## S.F.Ratio    .         
## perc.alumni  2.82845597
## Expend      -0.09928657
## Private_Yes  1.43315130
sum(coef==0) # some coefs are exactly 0!
## [1] 6
plot(coef); abline(0, 0)

Lasso - Best Lambda R^2 = 0.4115157

Lasso regression had a larger R^2 than Ridge regression (0.408 < 0.411) with less parameters: 11 vs. 17

iv. Elastic Net

Tuning:

Alpha: Test 15 values between 0 and 1 when 0 = ridge & 1 = lasso

Lambda: Test 50 values from -2^2 to 10^2

tg <- expand.grid(
  alpha = seq(0, 1, length.out = 15), # [0, 1]
  lambda = 2 ^ seq(10,-2, length = 50) # [range 0.25:1024]
) 
#tg

Train Model:

elastic_fit <- train(
  x = rec,
  data = College.train,
  method = "glmnet",
  tuneGrid = tg,
  trControl =  tc
)
## Warning in train_rec(rec = x, dat = data, info = trainInfo, method = models, :
## There were missing values in resampled performance measures.
elastic_fit$bestTune
##         alpha   lambda
## 64 0.07142857 2.271544
rownames(elastic_fit$bestTune)
## [1] "64"
elastic_fit$results[rownames(elastic_fit$bestTune),]
##         alpha   lambda     RMSE  Rsquared      MAE   RMSESD RsquaredSD
## 64 0.07142857 2.271544 12.96911 0.4521102 9.776352 1.370509 0.07098191
##        MAESD
## 64 0.7292158

Tuning Report:

Best Alpha Lambda Combination:

Alpha = 0.071, Lambda = 2.271

College.test$elastic.pred <- predict(elastic_fit, newdata = College.test) 

rmse(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        12.8
rsq(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.411

Compare Models:

# Ridge Performance
rsq(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.409
# Lasso Performance
rsq(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.412
# Elastic Net Performance
rsq(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.411

Compare Models: Report

Lasso regression had the best performance evaluated by R^2.

(R^2 = 0.412 compare to 0.409 of ridge regression & 0.411 of elastic net).

Higher R^2 of Lasso regression may indicate that parsimonious approach fit better to our specific data when predicating Graduation rate in college.