# Module 12 Lab

# Point package installs at the RStudio CRAN mirror, over HTTPS so downloads
# are not fetched in clear text.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install the tidymodels package only when it is missing, so re-running the
# script does not download and reinstall it every time.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}

## Installing package into 'C:/Users/ridhi/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'tidymodels' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\ridhi\AppData\Local\Temp\RtmpSE7Y1D\downloaded_packages

library(rsample)

## Warning: package 'rsample' was built under R version 4.3.2

library(vip)

## Warning: package 'vip' was built under R version 4.3.2

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## Warning: package 'readr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(glmnet)

## Warning: package 'glmnet' was built under R version 4.3.2

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8

library(yardstick)

## Warning: package 'yardstick' was built under R version 4.3.2

## 
## Attaching package: 'yardstick'
## 
## The following object is masked from 'package:readr':
## 
##     spec

library(dplyr)
library(purrr)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1

## Warning: package 'dials' was built under R version 4.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()     masks purrr::discard()
## ✖ Matrix::expand()      masks tidyr::expand()
## ✖ magrittr::extract()   masks tidyr::extract()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ recipes::fixed()      masks stringr::fixed()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ Matrix::pack()        masks tidyr::pack()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec()     masks readr::spec()
## ✖ recipes::step()       masks stats::step()
## ✖ Matrix::unpack()      masks tidyr::unpack()
## ✖ recipes::update()     masks Matrix::update(), stats::update()
## • Search for functions across packages at https://www.tidymodels.org/find/

#PART 1: Tuning Our Regularized Regression Model

# Load the Boston housing data
# The file is read from "boston.csv" relative to the current working
# directory. The `cmedv` column is the outcome modelled in the steps below.

boston <- read_csv("boston.csv")

## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Step 1. split our data
# 70/30 train/test split stratified on the outcome `cmedv`; the seed keeps the
# split reproducible.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

# Step 2. create our feature engineering recipe
# Predictors are selected with all_numeric_predictors(), the same selector the
# spam recipes in Parts 2 and 3 use.
# NOTE(review): step_normalize(all_numeric()) also standardizes the outcome, so
# the RMSE values reported by tuning appear to be in standardized units of
# `cmedv` rather than its original units -- confirm this is intended.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric())

# Step 3. create resampling object
# 5-fold cross-validation on the training set, stratified on `cmedv`.
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

# Step 4. create our model object
# glmnet linear regression whose penalty is left as a tuning parameter.
reg_mod <- linear_reg(penalty = tune()) %>%
  set_engine("glmnet")

# Step 5. create our hyperparameter search grid
# 10 regularly spaced candidate penalty values over the default penalty() range.
reg_grid <- grid_regular(penalty(), levels = 10)

# Step 6. create our workflow object
# Bundles the preprocessing recipe with the model specification.
boston_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod)

library(recipes)



# Step 7. perform hyperparameter search
# `boston_wf` was already assembled from `boston_recipe` and `reg_mod` in
# Step 6, so it is used as-is here instead of being rebuilt a second time.
# Each candidate penalty in `reg_grid` is evaluated on the `kfolds` resamples.
tuning_results <- boston_wf %>%
  tune_grid(resamples = kfolds, grid = reg_grid)

## → A | warning: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.

## 
## There were issues with some computations   A: x1

## There were issues with some computations   A: x2

## There were issues with some computations   A: x3

## There were issues with some computations   A: x4

## There were issues with some computations   A: x5

## There were issues with some computations   A: x5

# Step 8. assess results
# Cross-validated RMSE for every candidate penalty, lowest RMSE first.
rmse_by_penalty <- collect_metrics(tuning_results) %>%
  filter(.metric == "rmse") %>%
  arrange(mean)
rmse_by_penalty

# Same table, with the penalty rendered as fixed-point text (10 decimals).
formatted_results <- rmse_by_penalty %>%
  mutate(penalty = sprintf("%.10f", penalty))

# Print the formatted results
print(formatted_results)

## # A tibble: 10 × 7
##    penalty      .metric .estimator  mean     n std_err .config              
##    <chr>        <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1 0.0000000001 rmse    standard   0.486     5  0.0243 Preprocessor1_Model01
##  2 0.0000000013 rmse    standard   0.486     5  0.0243 Preprocessor1_Model02
##  3 0.0000000167 rmse    standard   0.486     5  0.0243 Preprocessor1_Model03
##  4 0.0000002154 rmse    standard   0.486     5  0.0243 Preprocessor1_Model04
##  5 0.0000027826 rmse    standard   0.486     5  0.0243 Preprocessor1_Model05
##  6 0.0000359381 rmse    standard   0.486     5  0.0243 Preprocessor1_Model06
##  7 0.0004641589 rmse    standard   0.486     5  0.0243 Preprocessor1_Model07
##  8 0.0059948425 rmse    standard   0.487     5  0.0253 Preprocessor1_Model08
##  9 0.0774263683 rmse    standard   0.540     5  0.0338 Preprocessor1_Model09
## 10 1.0000000000 rmse    standard   0.996     5  0.0665 Preprocessor1_Model10

# Step 1. finalize our workflow object with the optimal hyperparameter values
# Take the penalty with the lowest cross-validated RMSE and plug it into the
# recipe + model workflow (`boston_wf` is that same recipe/model pairing).
best_hyperparameters <- tuning_results %>%
  select_best(metric = "rmse")
final_wf <- finalize_workflow(boston_wf, best_hyperparameters)

# Step 2. fit our final workflow object across the full training set data
final_fit <- fit(final_wf, data = boston_train)

# Step 3. plot the top 10 most influential features
# Adjust num_features as needed
vip(extract_fit_parsnip(final_fit), num_features = 10)

#PART 2: Tuning a Regularized Classification Model

# Load necessary libraries

library(earth)

## Warning: package 'earth' was built under R version 4.3.2

## Loading required package: Formula

## Loading required package: plotmo

## Warning: package 'plotmo' was built under R version 4.3.2

## Loading required package: plotrix

## Warning: package 'plotrix' was built under R version 4.3.2

## 
## Attaching package: 'plotrix'

## The following object is masked from 'package:scales':
## 
##     rescale

## Loading required package: TeachingDemos

## Warning: package 'TeachingDemos' was built under R version 4.3.2

library(tidymodels)
library(tidyverse)
library(vip)
library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:scales':
## 
##     alpha

## The following object is masked from 'package:purrr':
## 
##     cross

## The following object is masked from 'package:ggplot2':
## 
##     alpha

# Load the spam data
# NOTE(review): presumably the `spam` dataset shipped with kernlab, which is
# attached just above -- confirm.
data(spam)

# Step 1: create train and test splits
# 70/30 split stratified on the response `type`.
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)

# Step 2: create preprocessing recipe
# Yeo-Johnson transform, then normalize, applied to numeric predictors only.
spam_recipe <- recipe(type ~ ., data = spam_train)
spam_recipe <- step_YeoJohnson(spam_recipe, all_numeric_predictors())
spam_recipe <- step_normalize(spam_recipe, all_numeric_predictors())

# Step 3: create resampling object (5-fold CV stratified on `type`)
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)

# Step 4: create regularized logistic regression model object
# Both the penalty and the mixture are left as tuning parameters, so this is
# not restricted to a pure ridge fit.
logit_spec <- logistic_reg(penalty = tune(), mixture = tune())
logit_mod <- logit_spec %>%
  set_engine("glmnet") %>%
  set_mode("classification")

# Step 5: create our hyperparameter search grid
# Regular grid with 10 levels for each of mixture and penalty.
logit_grid <- grid_regular(mixture(), penalty(), levels = 10)

# Step 6: create workflow object to combine the recipe & model
spam_wf <- add_model(add_recipe(workflow(), spam_recipe), logit_mod)

# Step 7. perform hyperparameter search
tuning_results <- tune_grid(spam_wf, resamples = kfolds, grid = logit_grid)

# Step 8. assess results
# Cross-validated ROC AUC for every grid combination, best first.
roc_auc_results <- tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc")
arrange(roc_auc_results, desc(mean))

# Average of the per-combination mean ROC AUC values.
# NOTE(review): this averages over every mixture/penalty combination in the
# grid, not only the best one -- confirm that is the intended summary.
mean_auc <- summarize(roc_auc_results, mean_auc = mean(mean))

# Print the mean AUC
print(mean_auc)

## # A tibble: 1 × 1
##   mean_auc
##      <dbl>
## 1    0.942

# Step 1: finalize our workflow object with the optimal hyperparameter values
# `metric` is passed by name, matching the Part 1 call; newer tune releases
# place `metric` after `...`, so a positional "roc_auc" is rejected there.
# select_best() already returns a single row, so no slice()/select() is needed
# before handing it to finalize_workflow().
best_hyperparameters <- tuning_results %>%
  select_best(metric = "roc_auc")

final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(logit_mod) %>%
  finalize_workflow(best_hyperparameters)

# Step 2: fit our final workflow object across the full training set data
final_fit <- final_wf %>%
  fit(data = spam_train)

# Step 3: plot the top 10 most influential features
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)

#PART 3: Tuning a MARS Classification Model

# Load necessary libraries
library(tidymodels)
library(tidyverse)
library(vip)
library(kernlab)
library(earth)

# dials is already attached by library(tidymodels), and calling
# install.packages() on a package that is in use only yields an
# "is in use and will not be installed" warning. Install it only when it is
# genuinely missing.
if (!requireNamespace("dials", quietly = TRUE)) {
  install.packages("dials")
}

library(dials)

# Load the spam data
# NOTE(review): presumably the `spam` dataset shipped with kernlab, which is
# attached above -- confirm.
data(spam)

# Step 1: create train and test splits
# 70/30 split stratified on the response `type`.
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)

# Step 2: create preprocessing recipe
# Yeo-Johnson transform, then normalize, applied to numeric predictors only.
spam_recipe <- recipe(type ~ ., data = spam_train)
spam_recipe <- step_YeoJohnson(spam_recipe, all_numeric_predictors())
spam_recipe <- step_normalize(spam_recipe, all_numeric_predictors())

# Step 3: create resampling object (5-fold CV stratified on `type`)
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)

# Step 4: create MARS model object
# Number of retained terms and interaction degree are both marked for tuning.
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_mode("classification")

# Step 5: create our hyperparameter search grid
# The grid is built once: num_terms runs over 1..30 while prod_degree is held
# at 1. (An earlier grid built with prod_degree = NULL was overwritten before
# it was ever used, so it has been dropped.)
num_terms_values <- seq(1, 30)
prod_degree_values <- 1

mars_grid <- expand.grid(
  num_terms = num_terms_values,
  prod_degree = prod_degree_values
)

# Step 6: create workflow object to combine the recipe & model
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod)

# Step 7: perform hyperparameter search
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = mars_grid)

# Step 8. assess results
# Cross-validated ROC AUC for every grid row, best first.
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))

# Print best hyperparameter values and mean cross-validated ROC AUC
# show_best() ranks the tuned candidates by the named metric; n = 1 keeps only
# the top one, together with its mean ROC AUC across the folds. (The previous
# slice_min() call ordered the per-fold rows by string constants and never
# looked at any metric.)
best_results <- tuning_results %>%
  show_best(metric = "roc_auc", n = 1)
print(best_results)

# Step 1: finalize our workflow object with the optimal hyperparameter values
# `metric` is passed by name, matching the Part 1 call; newer tune releases
# place `metric` after `...`, so a positional "roc_auc" is rejected there.
# select_best() already returns a single row, so no slice()/select() is needed
# before handing it to finalize_workflow().
best_hyperparameters <- tuning_results %>%
  select_best(metric = "roc_auc")

final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod) %>%
  finalize_workflow(best_hyperparameters)

# Step 2: fit our final workflow object across the full training set data
final_fit <- final_wf %>%
  fit(data = spam_train)

# Step 3: plot the top 10 most influential features
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)