Packages
library(tidymodels)  # modeling framework: rsample, recipes, parsnip, workflows, yardstick
library(tidyverse)   # data wrangling and plotting (includes readr)
library(readr)       # loaded explicitly for read_csv()

boston <- read_csv("~/R Studio/BANA 4080/data/boston.csv")
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
# baseline model: all predictors, default "lm" engine
boston_lm1 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83
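RMSE is not the only regression metric available; since the prediction tibble is already assembled, a yardstick metric set can report several metrics at once. A minimal sketch, not part of the original exercise (the object name reg_metrics is just illustrative):

# hypothetical add-on: report RMSE, MAE, and R-squared from the same predictions
reg_metrics <- metric_set(rmse, mae, rsq)

boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  reg_metrics(truth = cmedv, estimate = .pred)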
mlr_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%  # reduce skew in the numeric predictors
  step_normalize(all_numeric_predictors())       # center and scale
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit(data = boston_train)
mlr_fit %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.43
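To see what the recipe actually estimated, the prepped recipe can be pulled out of the fitted workflow and tidied; for step_YeoJohnson (step number 1 here) this returns the estimated lambda for each numeric predictor. A small optional inspection, not part of the original exercise:

# optional inspection: estimated Yeo-Johnson lambdas from step 1 of the recipe
mlr_fit %>%
  extract_recipe() %>%
  tidy(number = 1)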
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# variables whose categorical levels cause problems when predicting on the test set
trbl_vars <- c("MS_SubClass", "Condition_2", "Exterior_1st",
               "Exterior_2nd", "Misc_Feature")

ames_train_subset <- ames_train %>%
  select(-all_of(trbl_vars))  # drop the troublesome variables

ames_lm1 <- linear_reg() %>%
  fit(Sale_Price ~ ., data = ames_train_subset)
ames_lm1 %>%
  predict(ames_test) %>%
  bind_cols(ames_test) %>%
  rmse(truth = Sale_Price, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       22959.
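The dropped columns are the "trouble variables" referred to below, presumably because their test-set observations include factor levels that are rare or absent in the training data. If you wanted to verify that yourself, a quick hypothetical check (not part of the assignment) is to look for factor columns whose observed test-set levels do not all appear in the training set:

# hypothetical check: which factor columns have test-set levels never seen in training?
ames_test %>%
  select(where(is.factor)) %>%
  names() %>%
  keep(~ !all(unique(ames_test[[.x]]) %in% unique(ames_train[[.x]])))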
Rather than remove the trouble variables, fill in the blanks to apply a feature engineering step to all nominal predictors that lumps infrequent and novel levels together, so those variables can still be used as predictors. Use a threshold of 1% so that all categorical levels with an observation rate below 1% are pooled into an "other" level.
mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal_predictors(), threshold = 0.01, other = "other")
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit(data = ames_train)
mlr_fit %>%
  predict(ames_test) %>%
  bind_cols(ames_test %>% select(Sale_Price)) %>%
  rmse(truth = Sale_Price, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       25917.
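To confirm what step_other did, the prepped recipe inside the fitted workflow can be tidied; for step_other (the only step, so step number 1) this lists the levels each nominal predictor retained, with everything else collapsed into "other". A minimal optional sketch, not required by the exercise:

# optional inspection: levels retained by step_other (all remaining levels were pooled)
mlr_fit %>%
  extract_recipe() %>%
  tidy(number = 1)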
Part 2: Resampling

Use the Advertising.csv data to complete the tasks in this section. The Advertising.csv data contains three predictor variables, TV, radio, and newspaper, which represent a company's advertising budget for each of these media across 200 metropolitan markets. It also contains the response variable, sales, which represents total sales in thousands of units for a given product.

1. Fill in the blanks to split the data into 70-30 training-test sets.

# a. import the Advertising.csv data
Advertising <- read_csv("~/R Studio/BANA 4080/data/Advertising.csv")

# b. create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(Advertising, prop = 0.7, strata = sales)
advertising_train <- training(split)
advertising_test <- testing(split)
set.seed(123)
kfolds <- vfold_cv(advertising_train, v = 10, strata = sales)  # 10-fold CV, stratified on sales

mlr_recipe <- recipe(sales ~ ., data = advertising_train)  # model formula only; no preprocessing steps
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit_cv <- mlr_wflow %>%
  fit_resamples(kfolds)
collect_metrics(mlr_fit_cv)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
## 1 rmse    standard   1.65     10  0.154  Preprocessor1_Model1
## 2 rsq     standard   0.913    10  0.0166 Preprocessor1_Model1
collect_metrics(mlr_fit_cv, summarize = FALSE) %>%
  filter(.metric == 'rmse')
## # A tibble: 10 × 5
##    id     .metric .estimator .estimate .config
##    <chr>  <chr>   <chr>          <dbl> <chr>
##  1 Fold01 rmse    standard       2.59  Preprocessor1_Model1
##  2 Fold02 rmse    standard       1.20  Preprocessor1_Model1
##  3 Fold03 rmse    standard       1.73  Preprocessor1_Model1
##  4 Fold04 rmse    standard       1.44  Preprocessor1_Model1
##  5 Fold05 rmse    standard       2.23  Preprocessor1_Model1
##  6 Fold06 rmse    standard       0.975 Preprocessor1_Model1
##  7 Fold07 rmse    standard       1.73  Preprocessor1_Model1
##  8 Fold08 rmse    standard       1.58  Preprocessor1_Model1
##  9 Fold09 rmse    standard       1.75  Preprocessor1_Model1
## 10 Fold10 rmse    standard       1.23  Preprocessor1_Model1
set.seed(123)
bs_samples <- bootstraps(advertising_train, times = 10, strata = sales)  # 10 bootstrap resamples

mlr_fit_bs <- mlr_wflow %>%
  fit_resamples(bs_samples)
collect_metrics(mlr_fit_bs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
## 1 rmse    standard   1.83     10  0.0973 Preprocessor1_Model1
## 2 rsq     standard   0.885    10  0.0102 Preprocessor1_Model1
collect_metrics(mlr_fit_bs, summarize = FALSE) %>%
  filter(.metric == 'rmse')
## # A tibble: 10 × 5
##    id          .metric .estimator .estimate .config
##    <chr>       <chr>   <chr>          <dbl> <chr>
##  1 Bootstrap01 rmse    standard        2.02 Preprocessor1_Model1
##  2 Bootstrap02 rmse    standard        2.02 Preprocessor1_Model1
##  3 Bootstrap03 rmse    standard        1.42 Preprocessor1_Model1
##  4 Bootstrap04 rmse    standard        1.61 Preprocessor1_Model1
##  5 Bootstrap05 rmse    standard        1.57 Preprocessor1_Model1
##  6 Bootstrap06 rmse    standard        1.93 Preprocessor1_Model1
##  7 Bootstrap07 rmse    standard        1.75 Preprocessor1_Model1
##  8 Bootstrap08 rmse    standard        2.51 Preprocessor1_Model1
##  9 Bootstrap09 rmse    standard        1.76 Preprocessor1_Model1
## 10 Bootstrap10 rmse    standard        1.75 Preprocessor1_Model1
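If the exercise continued, a natural follow-up (not shown in the original) would be to fit the workflow one final time on the full training set and score it on the held-out test set; last_fit() from tune does both in one call. A hedged sketch reusing the split object created above (the name final_fit is illustrative):

# hypothetical final step: fit on the full training set, evaluate on the test set
final_fit <- last_fit(mlr_wflow, split)
collect_metrics(final_fit)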