Prerequisites
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.0 ✔ recipes 1.0.1
## ✔ dials 1.0.0 ✔ rsample 1.1.0
## ✔ dplyr 1.0.9 ✔ tibble 3.1.8
## ✔ ggplot2 3.3.6 ✔ tidyr 1.2.0
## ✔ infer 1.0.3 ✔ tune 1.0.1
## ✔ modeldata 1.0.1 ✔ workflows 1.1.0
## ✔ parsnip 1.0.2 ✔ workflowsets 1.0.0
## ✔ purrr 0.3.4 ✔ yardstick 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ stringr 1.4.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
library(here)
## here() starts at C:/Users/danya/OneDrive - University of Cincinnati/R/DataMining
Part 1: Feature Engineering
#1
path <- here('data', 'boston.csv')
boston <- read_csv(path)
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
boston_split <- initial_split(boston, 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)
lml <- linear_reg() %>%
  fit(cmedv ~ ., boston_train)
lml %>%
  predict(boston_test) %>%
  bind_cols(select(boston_test, cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
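As a quick sanity check (not required by the prompt), the fitted coefficients can be pulled out with tidy(); sorting by absolute size is purely illustrative.
# Sketch: inspect the baseline model's coefficients (lml is the parsnip fit
# created above; tidy() comes from broom, loaded with tidymodels)
lml %>%
  tidy() %>%
  arrange(desc(abs(estimate)))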
#2
boston_recipe <- recipe(cmedv ~ ., boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_YeoJohnson(all_numeric_predictors())
boston_workflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(boston_recipe)
lml <- boston_workflow %>%
  fit(boston_train)
lml %>%
  predict(boston_test) %>%
  bind_cols(select(boston_test, cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.56
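To see what the normalize and Yeo-Johnson steps actually produce, the recipe can be prepped and baked on the training set; this is an optional inspection, not part of the graded answer.
# Sketch: preview the preprocessed training data (assumes boston_recipe and
# boston_train from above are still in the environment)
boston_recipe %>%
  prep(training = boston_train) %>%
  bake(new_data = NULL) %>%
  glimpse()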
#3
ames <- AmesHousing::make_ames()
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = Sale_Price)
ames_train <- training(split)
ames_test <- testing(split)
mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal_predictors(), threshold = 0.01, other = "other")
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)
mlr_fit <- mlr_wflow %>%
  fit(data = ames_train)
mlr_fit %>%
  predict(ames_test) %>%
  bind_cols(ames_test %>% select(Sale_Price)) %>%
  rmse(truth = Sale_Price, estimate = .pred)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 25917.
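If you want to confirm what step_other() is doing, one option is to bake the recipe and count the levels of a high-cardinality predictor; Neighborhood is used below purely as an illustrative column.
# Sketch: check how rare levels were lumped into "other" (column choice is
# illustrative; any nominal predictor in ames would work)
mlr_recipe %>%
  prep(training = ames_train) %>%
  bake(new_data = NULL) %>%
  count(Neighborhood, sort = TRUE)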
rm(list = ls())
Part 2: Resampling
#1
path <- here('data', 'Advertising.csv')
adv <- read_csv(path)
## Rows: 200 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): TV, radio, newspaper, sales
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(adv, 0.7, strata = sales)
train <- training(split)
test <- testing(split)
#2
set.seed(123)
kfold <- vfold_cv(train, v = 10, strata = sales)
mlr_recipe <- recipe(sales ~ TV + radio + newspaper, data = train)
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)
mlr_fit <- mlr_wflow %>%
  fit_resamples(kfold)
#3
collect_metrics(mlr_fit)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.65 10 0.154 Preprocessor1_Model1
## 2 rsq standard 0.913 10 0.0166 Preprocessor1_Model1
#4
collect_metrics(mlr_fit, summarize = FALSE) %>%
  filter(.metric == 'rmse') %>%
  summarise(rng = range(.estimate))
## # A tibble: 2 × 1
## rng
## <dbl>
## 1 0.975
## 2 2.59
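The same per-fold estimates can also be plotted to see how the RMSE varies across folds; the histogram below is an optional sketch, not part of the assignment output.
# Sketch: distribution of the 10 per-fold RMSE values (bin count is arbitrary)
collect_metrics(mlr_fit, summarize = FALSE) %>%
  filter(.metric == 'rmse') %>%
  ggplot(aes(.estimate)) +
  geom_histogram(bins = 10)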
#5
set.seed(123)
bs_sample <- bootstraps(train, times = 10, strata = sales)
bs_fit <- mlr_wflow %>%
  fit_resamples(bs_sample)
#6
collect_metrics(bs_fit)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.83 10 0.0973 Preprocessor1_Model1
## 2 rsq standard 0.885 10 0.0102 Preprocessor1_Model1
#7
collect_metrics(bs_fit, summarize = FALSE) %>%
  filter(.metric == 'rmse') %>%
  summarise(rng = range(.estimate))
## # A tibble: 2 × 1
## rng
## <dbl>
## 1 1.42
## 2 2.51
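To put the two resampling schemes side by side, the unsummarized metrics from both results can be stacked; the method labels below are just for readability.
# Sketch: compare per-resample RMSE from 10-fold CV vs. the bootstrap
bind_rows(
  collect_metrics(mlr_fit, summarize = FALSE) %>% mutate(method = '10-fold CV'),
  collect_metrics(bs_fit, summarize = FALSE) %>% mutate(method = 'bootstrap')
) %>%
  filter(.metric == 'rmse') %>%
  group_by(method) %>%
  summarise(mean = mean(.estimate), min = min(.estimate), max = max(.estimate))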