Machine Learning Ex3
Eldad Aviv: 206836165
library(dplyr)
library(ggplot2)
library(ISLR)
library(caret)
library(rsample)
library(yardstick)
library(recipes)
data("College", package = "ISLR")
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
Lets predict Grad.Rate (Graduation rate) from these 17 variables.
set.seed(123444)
splits <- initial_split(College, prop = 0.7)
College.train <- training(splits)
College.test <- testing(splits)
Show best regression model to predict Grad.Rate with 1-8 parameters
regfit.full <- leaps::regsubsets(Grad.Rate ~ ., data = College.train)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Grad.Rate ~ ., data = College.train)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Outstate FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " "*" " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " "
## P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " "*" " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " "*" " " " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" " " " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" " " " " " "
## perc.alumni Expend
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" " "
# Assuming reg.summary is the summary of regfit.full
reg.summary <- summary(regfit.full)
# Extract relevant components from reg.summary
rsq <- reg.summary$rsq
rss <- reg.summary$rss
adjr2 <- reg.summary$adjr2
cp <- reg.summary$cp
bic <- reg.summary$bic
# Combine the components into a data frame
reg.summary_df <- data.frame(model_size = 1:length(rsq), rsq = rsq, rss = rss, adjr2 = adjr2, cp = cp, bic = bic)
# Pivot the data frame to long format
reg.summary_df <- tidyr::pivot_longer(reg.summary_df, cols = -model_size, names_to = "Index", values_to = "value")
# Print the resulting data frame to verify
print(reg.summary_df)
## # A tibble: 40 × 3
## model_size Index value
## <int> <chr> <dbl>
## 1 1 rsq 0.303
## 2 1 rss 114200.
## 3 1 adjr2 0.302
## 4 1 cp 152.
## 5 1 bic -184.
## 6 2 rsq 0.374
## 7 2 rss 102620.
## 8 2 adjr2 0.372
## 9 2 cp 84.1
## 10 2 bic -236.
## # ℹ 30 more rows
# Plotting
ggplot(reg.summary_df, aes(x = model_size, y = value)) +
facet_wrap(~Index, scales = "free", ncol = 3) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:length(rsq), minor_breaks = NULL) +
theme_minimal() +
labs(x = "Model Size", y = "Value")
Report:
Model Size Impact: The performance metrics suggest that increasing the number of predictors generally improves model fit, but there is a point of diminishing returns.
Optimal Model Size: Based on BIC and Cp, an optimal model size likely lies around 5-6 predictors, balancing model fit and complexity.
Pre Process:
# Dummy Coding:
rec <- recipe(Grad.Rate ~ ., data = College.train) |>
step_dummy(all_factor_predictors())
# Scaling:
rec <- rec |>
step_scale(all_numeric_predictors())
Tuning: Test 50 optional Lambda values range from 0.25 to 1024
Train Control: Use 5-fold Cross Validation
tg <- expand.grid(
alpha = 0, # Alpha(0-1) # 0 = ridge, 1 = lasso
lambda = 2 ^ seq(10, -2, length = 50) # [0, Inf]
)
tc <- trainControl(method = "cv", number = 5)
Train Model:
set.seed(1)
rigreg_fit <- train(
x = rec,
data = College.train,
method = "glmnet",
tuneGrid = tg,
trControl = tc
)
## Loading required namespace: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-8
rigreg_fit
## glmnet
##
## 543 samples
## 17 predictor
##
## Recipe steps: dummy, scale
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 435, 436, 434, 433, 434
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.2500000 13.15738 0.4334813 9.840874
## 0.2962522 13.15738 0.4334813 9.840874
## 0.3510616 13.15738 0.4334813 9.840874
## 0.4160111 13.15738 0.4334813 9.840874
## 0.4929769 13.15738 0.4334813 9.840874
## 0.5841820 13.15738 0.4334813 9.840874
## 0.6922609 13.15738 0.4334813 9.840874
## 0.8203354 13.15738 0.4334813 9.840874
## 0.9721047 13.15744 0.4334948 9.841268
## 1.1519528 13.14490 0.4343909 9.835751
## 1.3650744 13.13235 0.4353021 9.830603
## 1.6176254 13.12025 0.4361997 9.826409
## 1.9169006 13.10880 0.4370766 9.822453
## 2.2715443 13.09839 0.4379158 9.819318
## 2.6918004 13.08945 0.4386964 9.817676
## 3.1898076 13.08253 0.4393939 9.817515
## 3.7799505 13.07822 0.4399835 9.820988
## 4.4792752 13.07694 0.4404590 9.828086
## 5.3079812 13.07955 0.4407862 9.839702
## 6.2900053 13.08694 0.4409360 9.854772
## 7.4537126 13.09996 0.4408859 9.876659
## 8.8327161 13.11952 0.4406165 9.906774
## 10.4668477 13.14645 0.4401265 9.944062
## 12.4033082 13.18192 0.4393941 9.988502
## 14.6980313 13.22689 0.4384314 10.042480
## 17.4172986 13.28247 0.4372499 10.106641
## 20.6396548 13.34962 0.4358814 10.179897
## 24.4581758 13.42972 0.4343441 10.264028
## 28.9831573 13.52345 0.4326923 10.371159
## 34.3453008 13.63190 0.4309561 10.487436
## 40.6994890 13.75598 0.4291753 10.614097
## 48.2292588 13.89594 0.4273962 10.758432
## 57.1521035 14.05127 0.4256617 10.912526
## 67.7257544 14.22132 0.4239964 11.079077
## 80.2556253 14.40419 0.4224254 11.255463
## 95.1036345 14.59781 0.4209577 11.437809
## 112.6986584 14.79947 0.4196030 11.619849
## 133.5489192 15.00514 0.4183785 11.805880
## 158.2566649 15.21108 0.4172804 11.993246
## 187.5355648 15.41415 0.4162993 12.177008
## 222.2313233 15.61067 0.4154312 12.357330
## 263.3461078 15.79844 0.4146644 12.531203
## 312.0674955 15.97536 0.4139911 12.695010
## 369.8027761 16.13877 0.4134129 12.845386
## 438.2196006 16.28862 0.4129106 12.982200
## 519.2941501 16.42444 0.4124770 13.106092
## 615.3682172 16.54625 0.4121036 13.216374
## 729.2168469 16.65497 0.4117815 13.315462
## 864.1284923 16.75110 0.4115058 13.402926
## 1024.0000000 16.83507 0.4112732 13.479363
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 4.479275.
rigreg_fit$bestTune # gives the row number for best lambda
## alpha lambda
## 18 0 4.479275
ggplot(rigreg_fit) +
scale_x_continuous(trans = scales::transform_log(),
breaks = scales::breaks_log())
(bestlambda <- rigreg_fit$bestTune$lambda)
## [1] 4.479275
Tuning Report:
RMSE was used to select the optimal model using the smallest value. The final values used for the model were alpha = 0 and lambda = 4.479275.
Results:
#Predict:
College.test$ridge.pred <- predict(rigreg_fit, newdata = College.test)
# Uses the best lambda
# EVALUATE the RMSE on the TEST set, associated with this value of lambda:
rsq(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.409
rmse(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 12.8
# (D) Coefficients:
## We can extract the model's coefficients according to lambda:
# The coef of the chosen model:
coef(rigreg_fit$finalModel, s = bestlambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 36.26762601
## Apps 1.57751842
## Accept 0.78122451
## Enroll 0.25593133
## Top10perc 2.09175077
## Top25perc 2.23440870
## F.Undergrad -0.15629291
## P.Undergrad -1.95074512
## Outstate 2.26372251
## Room.Board 1.70501670
## Books -0.33336233
## Personal -1.91212488
## PhD 0.89935594
## Terminal 0.08611352
## S.F.Ratio 0.04435657
## perc.alumni 2.60654639
## Expend -0.74491634
## Private_Yes 1.92023341
# or other lambdas...
# E.g. for lambda = 0.0000, (this result should be similar to OLS result)
coef(rigreg_fit$finalModel, s = 0)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 34.85709510
## Apps 2.54264863
## Accept 0.31582693
## Enroll 0.35851784
## Top10perc 2.34289740
## Top25perc 2.38213510
## F.Undergrad -0.06730372
## P.Undergrad -2.22067614
## Outstate 2.61115292
## Room.Board 1.92105792
## Books -0.39785802
## Personal -1.99793652
## PhD 1.28280488
## Terminal -0.42695228
## S.F.Ratio 0.17585818
## perc.alumni 3.02144956
## Expend -1.75113926
## Private_Yes 2.24699814
Ridge - Best Lambda R^2: 0.4085047
tg$alpha <- 1 # Adapt alpha for lasso, keep everything else same as ridge regression
tg
## alpha lambda
## 1 1 1024.0000000
## 2 1 864.1284923
## 3 1 729.2168469
## 4 1 615.3682172
## 5 1 519.2941501
## 6 1 438.2196006
## 7 1 369.8027761
## 8 1 312.0674955
## 9 1 263.3461078
## 10 1 222.2313233
## 11 1 187.5355648
## 12 1 158.2566649
## 13 1 133.5489192
## 14 1 112.6986584
## 15 1 95.1036345
## 16 1 80.2556253
## 17 1 67.7257544
## 18 1 57.1521035
## 19 1 48.2292588
## 20 1 40.6994890
## 21 1 34.3453008
## 22 1 28.9831573
## 23 1 24.4581758
## 24 1 20.6396548
## 25 1 17.4172986
## 26 1 14.6980313
## 27 1 12.4033082
## 28 1 10.4668477
## 29 1 8.8327161
## 30 1 7.4537126
## 31 1 6.2900053
## 32 1 5.3079812
## 33 1 4.4792752
## 34 1 3.7799505
## 35 1 3.1898076
## 36 1 2.6918004
## 37 1 2.2715443
## 38 1 1.9169006
## 39 1 1.6176254
## 40 1 1.3650744
## 41 1 1.1519528
## 42 1 0.9721047
## 43 1 0.8203354
## 44 1 0.6922609
## 45 1 0.5841820
## 46 1 0.4929769
## 47 1 0.4160111
## 48 1 0.3510616
## 49 1 0.2962522
## 50 1 0.2500000
Train Model:
set.seed(1)
lasso_fit <- train(
x = rec,
data = College.train,
method = "glmnet",
tuneGrid = tg,
trControl = tc
)
## Warning in train_rec(rec = x, dat = data, info = trainInfo, method = models, :
## There were missing values in resampled performance measures.
lasso_fit
## glmnet
##
## 543 samples
## 17 predictor
##
## Recipe steps: dummy, scale
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 435, 436, 434, 433, 434
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.2500000 13.14514 0.4333522 9.832743
## 0.2962522 13.14326 0.4333874 9.830639
## 0.3510616 13.13573 0.4340384 9.823504
## 0.4160111 13.12959 0.4346848 9.822579
## 0.4929769 13.12871 0.4350031 9.825902
## 0.5841820 13.13005 0.4352410 9.829782
## 0.6922609 13.14418 0.4346154 9.851283
## 0.8203354 13.17227 0.4331366 9.892778
## 0.9721047 13.21679 0.4307247 9.949169
## 1.1519528 13.27837 0.4274935 10.042581
## 1.3650744 13.36151 0.4230700 10.167165
## 1.6176254 13.45692 0.4185612 10.278071
## 1.9169006 13.57141 0.4136273 10.387158
## 2.2715443 13.71277 0.4075155 10.510589
## 2.6918004 13.88709 0.4004508 10.666984
## 3.1898076 14.10423 0.3922077 10.870892
## 3.7799505 14.35695 0.3847889 11.120667
## 4.4792752 14.66575 0.3781722 11.426976
## 5.3079812 15.05824 0.3714270 11.805156
## 6.2900053 15.56087 0.3632781 12.276243
## 7.4537126 16.21302 0.3408888 12.882574
## 8.8327161 16.95579 0.3138436 13.571147
## 10.4668477 17.34800 NaN 13.943065
## 12.4033082 17.34800 NaN 13.943065
## 14.6980313 17.34800 NaN 13.943065
## 17.4172986 17.34800 NaN 13.943065
## 20.6396548 17.34800 NaN 13.943065
## 24.4581758 17.34800 NaN 13.943065
## 28.9831573 17.34800 NaN 13.943065
## 34.3453008 17.34800 NaN 13.943065
## 40.6994890 17.34800 NaN 13.943065
## 48.2292588 17.34800 NaN 13.943065
## 57.1521035 17.34800 NaN 13.943065
## 67.7257544 17.34800 NaN 13.943065
## 80.2556253 17.34800 NaN 13.943065
## 95.1036345 17.34800 NaN 13.943065
## 112.6986584 17.34800 NaN 13.943065
## 133.5489192 17.34800 NaN 13.943065
## 158.2566649 17.34800 NaN 13.943065
## 187.5355648 17.34800 NaN 13.943065
## 222.2313233 17.34800 NaN 13.943065
## 263.3461078 17.34800 NaN 13.943065
## 312.0674955 17.34800 NaN 13.943065
## 369.8027761 17.34800 NaN 13.943065
## 438.2196006 17.34800 NaN 13.943065
## 519.2941501 17.34800 NaN 13.943065
## 615.3682172 17.34800 NaN 13.943065
## 729.2168469 17.34800 NaN 13.943065
## 864.1284923 17.34800 NaN 13.943065
## 1024.0000000 17.34800 NaN 13.943065
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.4929769.
Plot Tuning:
ggplot(lasso_fit) +
scale_x_continuous(trans = scales::transform_log(),
breaks = scales::breaks_log())
(bestlambda <- lasso_fit$bestTune$lambda)
## [1] 0.4929769
Tuning Report:
RMSE was used to select the optimal model using the smallest value. The final values used for the model were alpha = 1 and lambda = 0.4929769.
Results:
College.test$lasso.pred <- predict(lasso_fit, newdata = College.test)
rmse(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 12.8
rsq(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.412
(coef <- coef(lasso_fit$finalModel, s = bestlambda))
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 37.10539315
## Apps 2.19789039
## Accept .
## Enroll .
## Top10perc 1.22355059
## Top25perc 2.90252377
## F.Undergrad .
## P.Undergrad -1.88603074
## Outstate 2.47777640
## Room.Board 1.53760036
## Books .
## Personal -1.96178849
## PhD 0.37167324
## Terminal .
## S.F.Ratio .
## perc.alumni 2.82845597
## Expend -0.09928657
## Private_Yes 1.43315130
sum(coef==0) # some coefs are exactly 0!
## [1] 6
plot(coef); abline(0, 0)
Lasso - Best Lambda R^2 = 0.4115157
Lasso regression had a larger R^2 than Ridge regression (0.408 < 0.411) with less parameters: 11 vs. 17
Tuning:
Alpha: Test 15 values between 0 and 1 when 0 = ridge & 1 = lasso
Lambda: Test 50 values from -2^2 to 10^2
tg <- expand.grid(
alpha = seq(0, 1, length.out = 15), # [0, 1]
lambda = 2 ^ seq(10,-2, length = 50) # [range 0.25:1024]
)
#tg
Train Model:
elastic_fit <- train(
x = rec,
data = College.train,
method = "glmnet",
tuneGrid = tg,
trControl = tc
)
## Warning in train_rec(rec = x, dat = data, info = trainInfo, method = models, :
## There were missing values in resampled performance measures.
elastic_fit$bestTune
## alpha lambda
## 64 0.07142857 2.271544
rownames(elastic_fit$bestTune)
## [1] "64"
elastic_fit$results[rownames(elastic_fit$bestTune),]
## alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD
## 64 0.07142857 2.271544 12.96911 0.4521102 9.776352 1.370509 0.07098191
## MAESD
## 64 0.7292158
Tuning Report:
Best Alpha Lambda Combination:
Alpha = 0.071, Lambda = 2.271
College.test$elastic.pred <- predict(elastic_fit, newdata = College.test)
rmse(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 12.8
rsq(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.411
# Ridge Performance
rsq(College.test, truth = Grad.Rate, estimate = ridge.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.409
# Lasso Performance
rsq(College.test, truth = Grad.Rate, estimate = lasso.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.412
# Elastic Net Performance
rsq(College.test, truth = Grad.Rate, estimate = elastic.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.411
Lasso regression had the best performance evaluated by R^2.
(R^2 = 0.412 compare to 0.409 of ridge regression & 0.411 of elastic net).
Higher R^2 of Lasso regression may indicate that parsimonious approach fit better to our specific data when predicating Graduation rate in college.