library(here)
## here() starts at /Users/nandini
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.3     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.3     ✔ tidyr        1.3.0
## ✔ infer        1.0.5     ✔ tune         1.1.2
## ✔ modeldata    1.2.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

Part 1: Tuning our Regularized Regression Model

QUESTION 1:

boston <- read_csv("~/Desktop/BANA 4080 R/data_bana4080/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
boston_split <- initial_split(boston, 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)

QUESTION 2:

boston_recipe <- recipe(cmedv ~ ., boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

QUESTION 3:

kfold <- vfold_cv(boston_train, v = 5, strata = cmedv)

QUESTION 4:

reg_model <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine('glmnet')

QUESTION 5:

reg_grid <- grid_regular(penalty(range = c(-10, 5)), mixture(), levels = 10)

QUESTION 6:

boston_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(reg_model)

QUESTION 7:

tuning_results <- boston_wf %>%
tune_grid(resamples = kfold, grid = reg_grid)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x2

There were issues with some computations   A: x3

There were issues with some computations   A: x4

There were issues with some computations   A: x5

There were issues with some computations   A: x5

QUESTION 8:

tuning_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
arrange(mean)
## # A tibble: 100 × 8
##          penalty mixture .metric .estimator  mean     n std_err .config         
##            <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
##  1 0.0215          1     rmse    standard    4.49     5   0.296 Preprocessor1_M…
##  2 0.0215          0.889 rmse    standard    4.49     5   0.295 Preprocessor1_M…
##  3 0.0215          0.778 rmse    standard    4.49     5   0.294 Preprocessor1_M…
##  4 0.0215          0.667 rmse    standard    4.49     5   0.293 Preprocessor1_M…
##  5 0.0215          0.556 rmse    standard    4.49     5   0.292 Preprocessor1_M…
##  6 0.0215          0.444 rmse    standard    4.49     5   0.291 Preprocessor1_M…
##  7 0.0215          0.333 rmse    standard    4.49     5   0.290 Preprocessor1_M…
##  8 0.0000000001    0.444 rmse    standard    4.49     5   0.290 Preprocessor1_M…
##  9 0.00000000464   0.444 rmse    standard    4.49     5   0.290 Preprocessor1_M…
## 10 0.000000215     0.444 rmse    standard    4.49     5   0.290 Preprocessor1_M…
## # ℹ 90 more rows
best_hyperparameters <- tuning_results %>%
  select_best(metric = 'rmse')

final_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(reg_model) %>%
finalize_workflow(best_hyperparameters)

# Step 2. fit our final workflow object across the full training set data
final_fit <- final_wf %>%
fit(data = boston_train)
# Step 3. plot the top 10 most influential features
final_fit %>%
extract_fit_parsnip() %>%
vip()

Part 2: Tuning a Regularized Classificatin Model

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:scales':
## 
##     alpha
data(spam)

QUESTION 1:

set.seed(123) 
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)

QUESTION 2:

spam_recipe <- recipe(type ~ ., data = spam_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())

QUESTION 3:

set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)

QUESTION 4:

logit_mod <- logistic_reg(penalty = tune(), mixture = tune()) %>%
set_engine('glmnet') %>%
set_mode('classification')

QUESTION 5:

logit_grid <- grid_regular(penalty(range = c(-10, 5)), mixture(), levels = 10)

QUESTION 6:

spam_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(logit_mod)

QUESTION 7:

tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = logit_grid)

QUESTION 8:

tuning_results %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean))
## # A tibble: 100 × 8
##         penalty mixture .metric .estimator  mean     n std_err .config          
##           <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
##  1 0.000464       1     roc_auc binary     0.979     5 0.00352 Preprocessor1_Mo…
##  2 0.000464       0.889 roc_auc binary     0.979     5 0.00349 Preprocessor1_Mo…
##  3 0.000464       0.778 roc_auc binary     0.979     5 0.00347 Preprocessor1_Mo…
##  4 0.000464       0.667 roc_auc binary     0.979     5 0.00344 Preprocessor1_Mo…
##  5 0.000464       0.222 roc_auc binary     0.979     5 0.00329 Preprocessor1_Mo…
##  6 0.000464       0.333 roc_auc binary     0.979     5 0.00333 Preprocessor1_Mo…
##  7 0.000464       0.556 roc_auc binary     0.979     5 0.00342 Preprocessor1_Mo…
##  8 0.000464       0.111 roc_auc binary     0.979     5 0.00328 Preprocessor1_Mo…
##  9 0.000464       0.444 roc_auc binary     0.979     5 0.00339 Preprocessor1_Mo…
## 10 0.0000000001   0.111 roc_auc binary     0.979     5 0.00335 Preprocessor1_Mo…
## # ℹ 90 more rows
autoplot(tuning_results)

best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(logit_mod) %>%
finalize_workflow(best_hyperparameters)

final_fit <- final_wf %>%
fit(data = spam_train)

final_fit %>%
extract_fit_parsnip() %>%
vip()

Part 3: Tuning a MARS Classification Mode

packages required

#library(kernlab) #data(spam) QUESTION 1:

set.seed(123) 
split <- initial_split(spam_train, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)

QUESTION 2:

spam_recipe <- recipe(type ~ ., data = spam_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())

QUESTION 3:

set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)

QUESTION 4:

mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
set_mode("classification")

QUESTION 5:

mars_grid <- grid_regular(num_terms(range = c(1, 30)), prod_degree(), levels = 25)

QUESTION 6:

spam_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(mars_mod)

QUESTION 7:

tuning_results <- spam_wf %>%
tune_grid(resamples = kfolds, grid = mars_grid)
## → A | warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x2

There were issues with some computations   A: x3

There were issues with some computations   A: x4

There were issues with some computations   A: x4

QUESTION 8:

tuning_results %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean))
## # A tibble: 50 × 8
##    num_terms prod_degree .metric .estimator  mean     n std_err .config         
##        <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
##  1        13           2 roc_auc binary     0.977     5 0.00365 Preprocessor1_M…
##  2        14           2 roc_auc binary     0.977     5 0.00352 Preprocessor1_M…
##  3        15           2 roc_auc binary     0.977     5 0.00320 Preprocessor1_M…
##  4        16           2 roc_auc binary     0.977     5 0.00315 Preprocessor1_M…
##  5        22           2 roc_auc binary     0.977     5 0.00325 Preprocessor1_M…
##  6        27           2 roc_auc binary     0.977     5 0.00337 Preprocessor1_M…
##  7        28           2 roc_auc binary     0.977     5 0.00337 Preprocessor1_M…
##  8        30           2 roc_auc binary     0.977     5 0.00337 Preprocessor1_M…
##  9        25           2 roc_auc binary     0.977     5 0.00336 Preprocessor1_M…
## 10        26           2 roc_auc binary     0.977     5 0.00331 Preprocessor1_M…
## # ℹ 40 more rows
autoplot(tuning_results)

best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(mars_mod) %>%
finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
fit(data = spam_train)
final_fit %>%
extract_fit_parsnip() %>%
vip()