# Use an HTTPS CRAN mirror so packages are not downloaded over plain HTTP.
options(repos = c(CRAN = "https://cran.rstudio.com"))
# Install the tidymodels package only when it is missing: reinstalling on
# every run is slow and cannot replace a package that is already loaded.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}
## Installing package into 'C:/Users/ridhi/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidymodels' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ridhi\AppData\Local\Temp\RtmpSE7Y1D\downloaded_packages
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.2
library(vip)
## Warning: package 'vip' was built under R version 4.3.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.2
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(dplyr)
library(purrr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.8
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1
## Warning: package 'dials' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ magrittr::extract() masks tidyr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
## • Search for functions across packages at https://www.tidymodels.org/find/
# PART 1: Tuning Our Regularized Regression Model
# Load the Boston housing data
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 1. split our data (70/30, stratified on the outcome cmedv)
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
# Step 2. create our feature engineering recipe
# Transform and scale the predictors only. Leaving the outcome (cmedv)
# untouched keeps RMSE in the outcome's own units and lets the fitted workflow
# preprocess new data that has no cmedv column.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Step 3. create resampling object (5-fold CV, stratified on cmedv)
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
# Step 4. create our model object (glmnet linear regression, tunable penalty)
reg_mod <- linear_reg(penalty = tune()) %>%
  set_engine("glmnet")
# Step 5. create our hyperparameter search grid (10 penalty levels)
reg_grid <- grid_regular(penalty(), levels = 10)
# Step 6. create our workflow object (built once; recipe + model)
boston_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod)
# Step 7. perform hyperparameter search
tuning_results <- boston_wf %>%
  tune_grid(resamples = kfolds, grid = reg_grid)
# NOTE: the console output kept below (here and under Step 8) was recorded
# from an earlier run whose recipe also normalized the outcome; re-run the
# script to refresh it. It is kept as comments so the file still parses.
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
##
## There were issues with some computations A: x1
## There were issues with some computations A: x2
## There were issues with some computations A: x3
## There were issues with some computations A: x4
## There were issues with some computations A: x5
## There were issues with some computations A: x5
# Step 8. assess results: cross-validated RMSE per penalty, best (lowest) first
rmse_results <- tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  arrange(mean)
rmse_results
# Show the penalty in fixed notation so the tiny values are readable
formatted_results <- rmse_results %>%
  mutate(penalty = sprintf("%.10f", penalty))
# Print the formatted results
print(formatted_results)
## # A tibble: 10 × 7
## penalty .metric .estimator mean n std_err .config
## <chr> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 rmse standard 0.486 5 0.0243 Preprocessor1_Model01
## 2 0.0000000013 rmse standard 0.486 5 0.0243 Preprocessor1_Model02
## 3 0.0000000167 rmse standard 0.486 5 0.0243 Preprocessor1_Model03
## 4 0.0000002154 rmse standard 0.486 5 0.0243 Preprocessor1_Model04
## 5 0.0000027826 rmse standard 0.486 5 0.0243 Preprocessor1_Model05
## 6 0.0000359381 rmse standard 0.486 5 0.0243 Preprocessor1_Model06
## 7 0.0004641589 rmse standard 0.486 5 0.0243 Preprocessor1_Model07
## 8 0.0059948425 rmse standard 0.487 5 0.0253 Preprocessor1_Model08
## 9 0.0774263683 rmse standard 0.540 5 0.0338 Preprocessor1_Model09
## 10 1.0000000000 rmse standard 0.996 5 0.0665 Preprocessor1_Model10
# Step 1. pick the penalty with the best cross-validated RMSE and lock it
# into a freshly assembled recipe + model workflow
best_hyperparameters <- select_best(tuning_results, metric = "rmse")
final_wf <- finalize_workflow(
  add_model(add_recipe(workflow(), boston_recipe), reg_mod),
  best_hyperparameters
)
# Step 2. refit the finalized workflow on the whole training set
final_fit <- fit(final_wf, data = boston_train)
# Step 3. variable-importance plot of the 10 most influential features,
# drawn from the underlying parsnip fit
vip(extract_fit_parsnip(final_fit), num_features = 10) # Adjust num_features as needed

#PART 2: Tuning a Regularized Classification Model
# Load necessary libraries
library(earth)
## Warning: package 'earth' was built under R version 4.3.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.3.2
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
##
## rescale
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 4.3.2
library(tidymodels)
library(tidyverse)
library(vip)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:scales':
##
## alpha
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
# Load the spam data (presumably the dataset shipped with kernlab, which is
# attached just above -- confirm if another attached package provides `spam`)
data(spam)
# Step 1: 70/30 train/test split, stratified on the outcome `type`
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
# Step 2: preprocessing recipe -- Yeo-Johnson transform followed by
# normalization, both restricted to the numeric predictors
spam_recipe <- recipe(type ~ ., data = spam_train)
spam_recipe <- step_YeoJohnson(spam_recipe, all_numeric_predictors())
spam_recipe <- step_normalize(spam_recipe, all_numeric_predictors())
# Step 3: resampling object -- 5-fold cross-validation stratified on `type`
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
# Step 4: glmnet logistic regression with both mixture and penalty left to tune
logit_mod <- set_mode(
  set_engine(logistic_reg(mixture = tune(), penalty = tune()), "glmnet"),
  "classification"
)
# Step 5: regular search grid, 10 levels for each of mixture and penalty
logit_grid <- grid_regular(mixture(), penalty(), levels = 10)
# Step 6: workflow bundling the recipe with the model specification
spam_wf <- add_model(add_recipe(workflow(), spam_recipe), logit_mod)
# Step 7: evaluate every grid candidate across the folds
tuning_results <- tune_grid(spam_wf, resamples = kfolds, grid = logit_grid)
# Step 8. assess results: ROC AUC per grid candidate, highest first
arrange(
  filter(collect_metrics(tuning_results), .metric == "roc_auc"),
  desc(mean)
)
# Average the cross-validated ROC AUC over *all* grid candidates.
# Note this is a grid-wide average, not the AUC of the single best model.
mean_auc <- summarize(
  filter(collect_metrics(tuning_results), .metric == "roc_auc"),
  mean_auc = mean(mean)
)
# Print the mean AUC
print(mean_auc)
## # A tibble: 1 × 1
## mean_auc
## <dbl>
## 1 0.942
# Step 1: finalize our workflow object with the optimal hyperparameter values
# `metric` is passed by name: it matches the Part 1 call and keeps working on
# tune releases where `metric` comes after `...` in select_best().
# select_best() already returns a single row, so no extra slice() is needed.
best_hyperparameters <- tuning_results %>%
  select_best(metric = "roc_auc") %>%
  select(mixture, penalty)
final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(logit_mod) %>%
  finalize_workflow(best_hyperparameters)
# Step 2: fit our final workflow object across the full training set data
final_fit <- final_wf %>%
  fit(data = spam_train)
# Step 3: plot the top 10 most influential features
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)

# PART 3: Tuning a MARS Classification Model
# Load necessary libraries
# (No install.packages() call here: installing a package that is already
# loaded in the session is skipped with a warning, and scripts should not
# install packages on every run.)
library(tidymodels)
library(tidyverse)
library(vip)
library(kernlab)
library(earth)
library(dials)
# Load the spam data
data(spam)
# Step 1: create train and test splits (70/30, stratified on `type`)
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
# Step 2: create preprocessing recipe (Yeo-Johnson + normalization applied to
# the numeric predictors only)
spam_recipe <- recipe(type ~ ., data = spam_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Step 3: create resampling object (5-fold CV, stratified on `type`)
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
# Step 4: create MARS model object (engine stated explicitly, matching the
# other model specifications in this script)
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("classification")
# Step 5: create our hyperparameter search grid
# prod_degree must be a real value: expand.grid() with a NULL (length-0)
# column yields a zero-row grid, leaving nothing to tune over. It is held at 1
# here, so only num_terms actually varies across the 30 candidates.
num_terms_values <- seq_len(30)
prod_degree_values <- 1
mars_grid <- expand.grid(
  num_terms = num_terms_values,
  prod_degree = prod_degree_values
)
# Step 6: create workflow object to combine the recipe & model
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod)
# Step 7: perform hyperparameter search
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = mars_grid)
# Step 8. assess results: ROC AUC per grid candidate, highest first
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
# Print best hyperparameter values and mean cross-validated ROC AUC.
# The metrics must be collected first: ranking has to happen on the
# per-candidate summary (one row per grid point, `mean` = CV-averaged ROC AUC),
# not on the raw tuning object.
best_results <- tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean)) %>%
  slice(1)
print(best_results)
# Step 1: finalize our workflow object with the optimal hyperparameter values
# `metric` is passed by name: it matches the Part 1 call and keeps working on
# tune releases where `metric` comes after `...` in select_best().
# select_best() already returns a single row, so no extra slice() is needed.
best_hyperparameters <- tuning_results %>%
  select_best(metric = "roc_auc") %>%
  select(prod_degree, num_terms)
final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod) %>%
  finalize_workflow(best_hyperparameters)
# Step 2: fit our final workflow object across the full training set data
final_fit <- final_wf %>%
  fit(data = spam_train)
# Step 3: plot the top 10 most influential features
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)
