Prereqs

Packages

library(tidymodels)
library(tidyverse)

a. import the boston.csv data

b. create a 70-30 train/test dataset split

c. fit a multiple linear regression model using all predictors in their current state (without any feature engineering)

d. and compute the RMSE on the test data

library(readr)

boston <- read_csv("~/R Studio/BANA 4080/data/boston.csv")


set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

boston_lm1 <- linear_reg() %>%
fit(cmedv ~ ., data = boston_train)

boston_lm1 %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83
  1. a. Create a recipe

    b. Create a workflow object that contains the model and recipe

    c. Train the model

    d. and compute the RMSE on the test data


mlr_recipe <- recipe(cmedv ~., data = boston_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())

mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
fit(data = boston_train)

mlr_fit %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.43
  1. Import Ames data and split into train/test

    Remove trouble variables

    Train the model without the above trouble variables

    Compute test data generalization RMSE

ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility

split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

trbl_vars <- c("MS_SubClass", "Condition_2", "Exterior_1st",
"Exterior_2nd", "Misc_Feature")
ames_train_subset <- ames_train %>%
select(-trbl_vars)

ames_lm1 <- linear_reg() %>%
fit(Sale_Price ~ ., data = ames_train_subset)

ames_lm1 %>%
predict(ames_test) %>%
bind_cols(ames_test) %>%
rmse(truth = Sale_Price, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      22959.

Rather than remove the trouble variables, fill in the blanks to apply a feature engineering step to all nominal predictor variables to lump novel levels together so that you can use those variables as predictors. Use a threshold of 1% so that all categorical levels with an observation rate of less than 1% will be pooled together into an “other” level.

mlr_recipe <- recipe(Sale_Price ~ . , data = ames_train) %>%
step_other(all_nominal_predictors(), threshold = 0.01, other = "other")

mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
fit(data = ames_train)

mlr_fit %>%
predict(ames_test) %>%
bind_cols(ames_test %>% select(Sale_Price)) %>%
rmse(truth = Sale_Price, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      25917.

Part 2: Resampling Use the Advertising.csv data to complete the following tasks in this section. The Advertising.csv data contains three predictor variables - TV, radio, and newspaper, which represents a companies advertising budget for these respective mediums across 200 metropolitan markets. It also contains the response variable - sales, which represents the total sales in thousands of units for a given product. 1. Fill in the blanks to split the data into 70-30 training-test sets. # a. import the Advertising.csv data # b. create a 70-30 train/test dataset split


Advertising <- read_csv("~/R Studio/BANA 4080/data/Advertising.csv")

set.seed(123)
split <- initial_split(Advertising, prop = .7, strata = sales)
advertising_train <- training(split)
advertising_test <- testing(split)
  1. Fill in the blanks to apply 10-fold cross-validation procedure that models Sales as a function of all threepredictors (without any feature engineering).

create 10-fold cross-validation object

create recipe with no feature engineering steps

create workflow object

fit our model across the 10-fold CV


set.seed(123)
kfolds <- vfold_cv(advertising_train, v = 10, strata = sales)

mlr_recipe <- recipe(sales ~ ., data = advertising_train)


mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)

mlr_fit_cv <- mlr_wflow %>%
fit_resamples(kfolds)
  1. What is the average cross-validation RMSE?
collect_metrics(mlr_fit_cv)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.65     10  0.154  Preprocessor1_Model1
## 2 rsq     standard   0.913    10  0.0166 Preprocessor1_Model1
  1. What is the range of cross-validation RMSE values across all ten folds?
collect_metrics(mlr_fit_cv, summarize = FALSE) %>%
filter(.metric == 'rmse')
## # A tibble: 10 × 5
##    id     .metric .estimator .estimate .config             
##    <chr>  <chr>   <chr>          <dbl> <chr>               
##  1 Fold01 rmse    standard       2.59  Preprocessor1_Model1
##  2 Fold02 rmse    standard       1.20  Preprocessor1_Model1
##  3 Fold03 rmse    standard       1.73  Preprocessor1_Model1
##  4 Fold04 rmse    standard       1.44  Preprocessor1_Model1
##  5 Fold05 rmse    standard       2.23  Preprocessor1_Model1
##  6 Fold06 rmse    standard       0.975 Preprocessor1_Model1
##  7 Fold07 rmse    standard       1.73  Preprocessor1_Model1
##  8 Fold08 rmse    standard       1.58  Preprocessor1_Model1
##  9 Fold09 rmse    standard       1.75  Preprocessor1_Model1
## 10 Fold10 rmse    standard       1.23  Preprocessor1_Model1
  1. Fill in the blanks to apply a boostrap resampling procedure that models Sales as a function of all threepredictors (without any feature engineering). Use 10 bootstrap samples. # create bootstrap sample object # fit our model across the bootstrapped samples

set.seed(123)
bs_samples <- bootstraps(advertising_train, times = 10, strata = sales)

mlr_fit_bs <- mlr_wflow %>%
fit_resamples(bs_samples)
  1. What is the average bootstrap RMSE?
collect_metrics(mlr_fit_bs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.83     10  0.0973 Preprocessor1_Model1
## 2 rsq     standard   0.885    10  0.0102 Preprocessor1_Model1
  1. What is the range of bootstrap RMSE values across all ten bootstrap samples?
collect_metrics(mlr_fit_bs, summarize = FALSE) %>%
filter(.metric == 'rmse')
## # A tibble: 10 × 5
##    id          .metric .estimator .estimate .config             
##    <chr>       <chr>   <chr>          <dbl> <chr>               
##  1 Bootstrap01 rmse    standard        2.02 Preprocessor1_Model1
##  2 Bootstrap02 rmse    standard        2.02 Preprocessor1_Model1
##  3 Bootstrap03 rmse    standard        1.42 Preprocessor1_Model1
##  4 Bootstrap04 rmse    standard        1.61 Preprocessor1_Model1
##  5 Bootstrap05 rmse    standard        1.57 Preprocessor1_Model1
##  6 Bootstrap06 rmse    standard        1.93 Preprocessor1_Model1
##  7 Bootstrap07 rmse    standard        1.75 Preprocessor1_Model1
##  8 Bootstrap08 rmse    standard        2.51 Preprocessor1_Model1
##  9 Bootstrap09 rmse    standard        1.76 Preprocessor1_Model1
## 10 Bootstrap10 rmse    standard        1.75 Preprocessor1_Model1