library(tidymodels)
library(tidyverse)
What is the generalization RMSE?
# a. import the data
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# b. create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
# c. fit a multiple linear regression model using all predictors in their current state
boston_lm1 <- linear_reg() %>%
fit(cmedv ~ ., data = boston_train)
# d. compute the RMSE on the test data
boston_lm1 %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
The generalization error is: 4.829261
Does this model improve performance?
# a. create a recipe
mlr_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
# b. Create a workflow object that contains the model and recipe
mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)
# c. train the model
mlr_fit <- mlr_wflow %>%
fit(data = boston_train)
# d. compute the RMSE on the test data
mlr_fit %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.43
Yes this does improve model performance as the generalization RMSE is less than before at 4.428523.
# Import Ames data and split into train/test
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Remove trouble variables
trbl_vars <- c("MS_SubClass", "Condition_2", "Exterior_1st",
"Exterior_2nd", "Misc_Feature")
ames_train_subset <- ames_train %>%
select(-trbl_vars)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(trbl_vars)
##
## # Now:
## data %>% select(all_of(trbl_vars))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Train the model without the above trouble variables
ames_lm1 <- linear_reg() %>%
fit(Sale_Price ~ ., data = ames_train_subset)
# Compute test data generalization RMSE
ames_lm1 %>%
predict(ames_test) %>%
bind_cols(ames_test) %>%
rmse(truth = Sale_Price, estimate = .pred)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 22959.
Apply a feature engineering step to all nominal predictor variables to lump novel levels together so that you can use those variables as predictors. Use a threshold of 1% so that all categorical levels with an observation rate of less than 1% will be pooled together into an “other” level
Does this model improve performance?
mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_other(all_nominal_predictors(), threshold = 0.01, other = "other")
mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)
mlr_fit <- mlr_wflow %>%
fit(data = ames_train)
mlr_fit %>%
predict(ames_test) %>%
bind_cols(ames_test %>% select(Sale_Price)) %>%
rmse(truth = Sale_Price, estimate = .pred)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 25917.
This model does not improve the performance as the RMSE is 25917.42 which is greater than the previous RMSE of 22959.41 and the goal is to minimize the RMSE.
Use the Advertising.csv data to complete the following tasks in this section. The Advertising.csv data contains three predictor variables - TV, radio, and newspaper, which represents a companies advertising budget for these respective mediums across 200 metropolitan markets. It also contains the response variable - sales, which represents the total sales in thousands of units for a given product.
# a. import the Advertising.csv data
advertising <- read_csv("Advertising.csv")
# b. create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(advertising, prop = 0.7, strata = sales)
advertising_train <- training(split)
advertising_test <- testing(split)
# create a 10-fold cross validation object
set.seed(123)
kfolds <- vfold_cv(advertising_train, v = 10, strata = sales)
# Create recipe with no feature engineering steps
mlr_recipe <- recipe(sales ~ ., data = advertising_train)
# Create workflow object
mlr_wflow <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(mlr_recipe)
# Fit our model across the 10-fold CV
mlr_fit_cv <- mlr_wflow %>%
fit_resamples(kfolds)
collect_metrics(mlr_fit_cv)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.65 10 0.154 Preprocessor1_Model1
## 2 rsq standard 0.913 10 0.0166 Preprocessor1_Model1
The average cross-validation RMSE is 1.6469071.
collect_metrics(mlr_fit_cv, summarize = FALSE) %>%
filter(.metric == 'rmse')
## # A tibble: 10 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <chr>
## 1 Fold01 rmse standard 2.59 Preprocessor1_Model1
## 2 Fold02 rmse standard 1.20 Preprocessor1_Model1
## 3 Fold03 rmse standard 1.73 Preprocessor1_Model1
## 4 Fold04 rmse standard 1.44 Preprocessor1_Model1
## 5 Fold05 rmse standard 2.23 Preprocessor1_Model1
## 6 Fold06 rmse standard 0.975 Preprocessor1_Model1
## 7 Fold07 rmse standard 1.73 Preprocessor1_Model1
## 8 Fold08 rmse standard 1.58 Preprocessor1_Model1
## 9 Fold09 rmse standard 1.75 Preprocessor1_Model1
## 10 Fold10 rmse standard 1.23 Preprocessor1_Model1
The range of RMSE is 0.9746033 to 2.5889985.
# create bootstrap sample object
set.seed(123)
bs_samples <- bootstraps(advertising_train, times = 10, strata = sales)
# fit our model across the bootstrapped samples
mlr_fit_bs <- mlr_wflow %>%
fit_resamples(bs_samples)
collect_metrics(mlr_fit_bs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.83 10 0.0973 Preprocessor1_Model1
## 2 rsq standard 0.885 10 0.0102 Preprocessor1_Model1
The average bootstrap RMSE is 1.833085.
collect_metrics(mlr_fit_bs, summarize = FALSE) %>%
filter(.metric == 'rmse')
## # A tibble: 10 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <chr>
## 1 Bootstrap01 rmse standard 2.02 Preprocessor1_Model1
## 2 Bootstrap02 rmse standard 2.02 Preprocessor1_Model1
## 3 Bootstrap03 rmse standard 1.42 Preprocessor1_Model1
## 4 Bootstrap04 rmse standard 1.61 Preprocessor1_Model1
## 5 Bootstrap05 rmse standard 1.57 Preprocessor1_Model1
## 6 Bootstrap06 rmse standard 1.93 Preprocessor1_Model1
## 7 Bootstrap07 rmse standard 1.75 Preprocessor1_Model1
## 8 Bootstrap08 rmse standard 2.51 Preprocessor1_Model1
## 9 Bootstrap09 rmse standard 1.76 Preprocessor1_Model1
## 10 Bootstrap10 rmse standard 1.75 Preprocessor1_Model1
The range of RMSE is 1.418276 to 2.512187.