Packages required:

library(tidymodels)
library(tidyverse)
library(AmesHousing)

Part 1

Import and Generalization RMSE

# a. Import the boston.csv data
boston <- read_csv("~/Library/CloudStorage/OneDrive-UniversityofCincinnati/UC Courses/Fall 2024/Data Mining/lab10_data/boston.csv")
Rows: 506 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# b. Create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

# c. Fit a multiple linear regression model using all predictors
boston_lm1 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)

# d. Compute the RMSE on the test data
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
# b. Create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

# c. Fit a multiple linear regression model using all predictors
boston_lm1 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)

# d. Compute the RMSE on the test data
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
NA

Question 2:

# a. Create a recipe
mlr_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# b. Create a workflow object that contains the model and recipe
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

# c. Train the model
mlr_fit <- mlr_wflow %>%
  fit(data = boston_train)

# d. Compute the RMSE on the test data
mlr_fit %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
NA

Question 3:

ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Remove trouble variables
trbl_vars <- c("MS_SubClass", "Condition_2", "Exterior_1st",
"Exterior_2nd", "Misc_Feature")
ames_train_subset <- ames_train %>%
select(-trbl_vars)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(trbl_vars)

# Now:
data %>% select(all_of(trbl_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Train the model without the above trouble variables
ames_lm1 <- linear_reg() %>%
fit(Sale_Price ~ ., data = ames_train_subset)
# Compute test data generalization RMSE
ames_lm1 %>%
predict(ames_test) %>%
bind_cols(ames_test) %>%
rmse(truth = Sale_Price, estimate = .pred)
Warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

# a. Create a recipe with novel level lumping
mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal_predictors(), threshold = 0.01, other = "other")

# b. Create a workflow and train model
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit(data = ames_train)

# c. Compute the RMSE on the test data
mlr_fit %>%
  predict(ames_test) %>%
  bind_cols(ames_test %>% select(Sale_Price)) %>%
  rmse(truth = Sale_Price, estimate =.pred)
Warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")

Part 2

Question 1:

# a. import the Advertising.csv data
advertising <- read_csv("~/Library/CloudStorage/OneDrive-UniversityofCincinnati/UC Courses/Fall 2024/Data Mining/lab10_data/Advertising.csv")
Rows: 200 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (4): TV, radio, newspaper, sales
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# b. create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(advertising, prop = 0.7, strata = sales)
advertising_train <- training(split)
advertising_test <- testing(split)

Question 2:

# a. Create 10-fold cross-validation object
set.seed(123)
kfolds <- vfold_cv(advertising_train, v = 10, strata = sales)

# b. Create recipe with no feature engineering steps
mlr_recipe <- recipe(sales ~ ., data = advertising_train)

# c. Create workflow object with model and recipe
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

# d. Fit our model across the 10-fold cross-validation
mlr_fit_cv <- mlr_wflow %>%
  fit_resamples(kfolds)

Question 3:

# Calculate the average cross-validation RMSE
collect_metrics(mlr_fit_cv) %>%
  filter(.metric == "rmse")
NA

Question 4:

# Range of cross-validation RMSE values across all folds
cv_rmse_range <- collect_metrics(mlr_fit_cv, summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(range_rmse = max(.estimate) - min(.estimate))

cv_rmse_range
NA

Question 5:

# Create bootstrap sample object
set.seed(123)
bs_samples <- bootstraps(advertising_train, times = 10, strata = sales)

# Fit our model across the bootstrapped samples
mlr_fit_bs <- mlr_wflow %>%
  fit_resamples(bs_samples)

Question 6:

# Calculate average bootstrap RMSE
mlr_fit_bs %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(avg_rmse = mean(.estimate))
NA

Question 7:

collect_metrics(mlr_fit_bs, summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(min_rmse = min(.estimate), max_rmse = max(.estimate))
NA
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyBQYWNrYWdlcyByZXF1aXJlZDogCmBgYHtyIGVjaG89VFJVRSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeSh0aWR5bW9kZWxzKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShBbWVzSG91c2luZykKYGBgCiMjIyBQYXJ0IDEgCiMjIyMgSW1wb3J0IGFuZCBHZW5lcmFsaXphdGlvbiBSTVNFCmBgYHtyfQojIGEuIEltcG9ydCB0aGUgYm9zdG9uLmNzdiBkYXRhCmJvc3RvbiA8LSByZWFkX2Nzdigifi9MaWJyYXJ5L0Nsb3VkU3RvcmFnZS9PbmVEcml2ZS1Vbml2ZXJzaXR5b2ZDaW5jaW5uYXRpL1VDIENvdXJzZXMvRmFsbCAyMDI0L0RhdGEgTWluaW5nL2xhYjEwX2RhdGEvYm9zdG9uLmNzdiIpCgojIGIuIENyZWF0ZSBhIDcwLTMwIHRyYWluL3Rlc3QgZGF0YXNldCBzcGxpdApzZXQuc2VlZCgxMjMpCnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYm9zdG9uLCBwcm9wID0gMC43LCBzdHJhdGEgPSBjbWVkdikKYm9zdG9uX3RyYWluIDwtIHRyYWluaW5nKHNwbGl0KQpib3N0b25fdGVzdCA8LSB0ZXN0aW5nKHNwbGl0KQoKIyBjLiBGaXQgYSBtdWx0aXBsZSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCB1c2luZyBhbGwgcHJlZGljdG9ycwpib3N0b25fbG0xIDwtIGxpbmVhcl9yZWcoKSAlPiUKICBmaXQoY21lZHYgfiAuLCBkYXRhID0gYm9zdG9uX3RyYWluKQoKIyBkLiBDb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKYm9zdG9uX2xtMSAlPiUKICBwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUKICBiaW5kX2NvbHMoYm9zdG9uX3Rlc3QgJT4lIHNlbGVjdChjbWVkdikpICU+JQogIHJtc2UodHJ1dGggPSBjbWVkdiwgZXN0aW1hdGUgPSAucHJlZCkKYGBgCgpgYGB7cn0KIyBiLiBDcmVhdGUgYSA3MC0zMCB0cmFpbi90ZXN0IGRhdGFzZXQgc3BsaXQKc2V0LnNlZWQoMTIzKQpzcGxpdCA8LSBpbml0aWFsX3NwbGl0KGJvc3RvbiwgcHJvcCA9IDAuNywgc3RyYXRhID0gY21lZHYpCmJvc3Rvbl90cmFpbiA8LSB0cmFpbmluZyhzcGxpdCkKYm9zdG9uX3Rlc3QgPC0gdGVzdGluZyhzcGxpdCkKCiMgYy4gRml0IGEgbXVsdGlwbGUgbGluZWFyIHJlZ3Jlc3Npb24gbW9kZWwgdXNpbmcgYWxsIHByZWRpY3RvcnMKYm9zdG9uX2xtMSA8LSBsaW5lYXJfcmVnKCkgJT4lCiAgZml0KGNtZWR2IH4gLiwgZGF0YSA9IGJvc3Rvbl90cmFpbikKCiMgZC4gQ29tcHV0ZSB0aGUgUk1TRSBvbiB0aGUgdGVzdCBkYXRhCmJvc3Rvbl9sbTEgJT4lCiAgcHJlZGljdChib3N0b25fdGVzdCkgJT4lCiAgYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpCgpgYGAKIyMjIyBRdWVzdGlvbiAyOiAKYGBge3J9CiMgYS4gQ3JlYXRlIGEgcmVjaXBlCm1scl9yZWNpcGUgPC0gcmVjaXBlKGNtZWR2IH4gLiwgZGF0YSA9IGJvc3Rvbl90cmFpbikgJT4lCiAgc3RlcF9ZZW9Kb2huc29uKGFsbF9udW1lcmljX3ByZWRpY3RvcnMoKSkgJT4lCiAgc3RlcF9ub3JtYWxpemUoYWxsX251bWVyaWNfcHJlZGljdG9ycygpKQoKIyBiLiBDcmVhdGUgYSB3b3JrZmxvdyBvYmplY3QgdGhhdCBjb250YWlucyB0aGUgbW9kZWwgYW5kIHJlY2lwZQptbHJfd2Zsb3cgPC0gd29ya2Zsb3coKSAlPiUKICBhZGRfbW9kZWwobGluZWFyX3JlZygpKSAlPiUKICBhZGRfcmVjaXBlKG1scl9yZWNpcGUpCgojIGMuIFRyYWluIHRoZSBtb2RlbAptbHJfZml0IDwtIG1scl93ZmxvdyAlPiUKICBmaXQoZGF0YSA9IGJvc3Rvbl90cmFpbikKCiMgZC4gQ29tcHV0ZSB0aGUgUk1TRSBvbiB0aGUgdGVzdCBkYXRhCm1scl9maXQgJT4lCiAgcHJlZGljdChib3N0b25fdGVzdCkgJT4lCiAgYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpCgpgYGAKIyMjIyBRdWVzdGlvbiAzOiAKYGBge3J9CmFtZXMgPC0gQW1lc0hvdXNpbmc6Om1ha2VfYW1lcygpCnNldC5zZWVkKDEyMykgIyBmb3IgcmVwcm9kdWNpYmlsaXR5CnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYW1lcywgcHJvcCA9IDAuNywgc3RyYXRhID0gIlNhbGVfUHJpY2UiKQphbWVzX3RyYWluIDwtIHRyYWluaW5nKHNwbGl0KQphbWVzX3Rlc3QgPC0gdGVzdGluZyhzcGxpdCkKIyBSZW1vdmUgdHJvdWJsZSB2YXJpYWJsZXMKdHJibF92YXJzIDwtIGMoIk1TX1N1YkNsYXNzIiwgIkNvbmRpdGlvbl8yIiwgIkV4dGVyaW9yXzFzdCIsCiJFeHRlcmlvcl8ybmQiLCAiTWlzY19GZWF0dXJlIikKYW1lc190cmFpbl9zdWJzZXQgPC0gYW1lc190cmFpbiAlPiUKc2VsZWN0KC10cmJsX3ZhcnMpCiMgVHJhaW4gdGhlIG1vZGVsIHdpdGhvdXQgdGhlIGFib3ZlIHRyb3VibGUgdmFyaWFibGVzCmFtZXNfbG0xIDwtIGxpbmVhcl9yZWcoKSAlPiUKZml0KFNhbGVfUHJpY2UgfiAuLCBkYXRhID0gYW1lc190cmFpbl9zdWJzZXQpCiMgQ29tcHV0ZSB0ZXN0IGRhdGEgZ2VuZXJhbGl6YXRpb24gUk1TRQphbWVzX2xtMSAlPiUKcHJlZGljdChhbWVzX3Rlc3QpICU+JQpiaW5kX2NvbHMoYW1lc190ZXN0KSAlPiUKcm1zZSh0cnV0aCA9IFNhbGVfUHJpY2UsIGVzdGltYXRlID0gLnByZWQpCgphbWVzIDwtIEFtZXNIb3VzaW5nOjptYWtlX2FtZXMoKQpzZXQuc2VlZCgxMjMpICMgZm9yIHJlcHJvZHVjaWJpbGl0eQpzcGxpdCA8LSBpbml0aWFsX3NwbGl0KGFtZXMsIHByb3AgPSAwLjcsIHN0cmF0YSA9ICJTYWxlX1ByaWNlIikKYW1lc190cmFpbiA8LSB0cmFpbmluZyhzcGxpdCkKYW1lc190ZXN0IDwtIHRlc3Rpbmcoc3BsaXQpCgojIGEuIENyZWF0ZSBhIHJlY2lwZSB3aXRoIG5vdmVsIGxldmVsIGx1bXBpbmcKbWxyX3JlY2lwZSA8LSByZWNpcGUoU2FsZV9QcmljZSB+IC4sIGRhdGEgPSBhbWVzX3RyYWluKSAlPiUKICBzdGVwX290aGVyKGFsbF9ub21pbmFsX3ByZWRpY3RvcnMoKSwgdGhyZXNob2xkID0gMC4wMSwgb3RoZXIgPSAib3RoZXIiKQoKIyBiLiBDcmVhdGUgYSB3b3JrZmxvdyBhbmQgdHJhaW4gbW9kZWwKbWxyX3dmbG93IDwtIHdvcmtmbG93KCkgJT4lCiAgYWRkX21vZGVsKGxpbmVhcl9yZWcoKSkgJT4lCiAgYWRkX3JlY2lwZShtbHJfcmVjaXBlKQoKbWxyX2ZpdCA8LSBtbHJfd2Zsb3cgJT4lCiAgZml0KGRhdGEgPSBhbWVzX3RyYWluKQoKIyBjLiBDb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKbWxyX2ZpdCAlPiUKICBwcmVkaWN0KGFtZXNfdGVzdCkgJT4lCiAgYmluZF9jb2xzKGFtZXNfdGVzdCAlPiUgc2VsZWN0KFNhbGVfUHJpY2UpKSAlPiUKICBybXNlKHRydXRoID0gU2FsZV9QcmljZSwgZXN0aW1hdGUgPS5wcmVkKQpgYGAKCiMjIyBQYXJ0IDIKIyMjIyBRdWVzdGlvbiAxOiAKYGBge3J9CiMgYS4gaW1wb3J0IHRoZSBBZHZlcnRpc2luZy5jc3YgZGF0YQphZHZlcnRpc2luZyA8LSByZWFkX2Nzdigifi9MaWJyYXJ5L0Nsb3VkU3RvcmFnZS9PbmVEcml2ZS1Vbml2ZXJzaXR5b2ZDaW5jaW5uYXRpL1VDIENvdXJzZXMvRmFsbCAyMDI0L0RhdGEgTWluaW5nL2xhYjEwX2RhdGEvQWR2ZXJ0aXNpbmcuY3N2IikKIyBiLiBjcmVhdGUgYSA3MC0zMCB0cmFpbi90ZXN0IGRhdGFzZXQgc3BsaXQKc2V0LnNlZWQoMTIzKQpzcGxpdCA8LSBpbml0aWFsX3NwbGl0KGFkdmVydGlzaW5nLCBwcm9wID0gMC43LCBzdHJhdGEgPSBzYWxlcykKYWR2ZXJ0aXNpbmdfdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpCmFkdmVydGlzaW5nX3Rlc3QgPC0gdGVzdGluZyhzcGxpdCkKYGBgCiMjIyMgUXVlc3Rpb24gMjogCmBgYHtyfQojIGEuIENyZWF0ZSAxMC1mb2xkIGNyb3NzLXZhbGlkYXRpb24gb2JqZWN0CnNldC5zZWVkKDEyMykKa2ZvbGRzIDwtIHZmb2xkX2N2KGFkdmVydGlzaW5nX3RyYWluLCB2ID0gMTAsIHN0cmF0YSA9IHNhbGVzKQoKIyBiLiBDcmVhdGUgcmVjaXBlIHdpdGggbm8gZmVhdHVyZSBlbmdpbmVlcmluZyBzdGVwcwptbHJfcmVjaXBlIDwtIHJlY2lwZShzYWxlcyB+IC4sIGRhdGEgPSBhZHZlcnRpc2luZ190cmFpbikKCiMgYy4gQ3JlYXRlIHdvcmtmbG93IG9iamVjdCB3aXRoIG1vZGVsIGFuZCByZWNpcGUKbWxyX3dmbG93IDwtIHdvcmtmbG93KCkgJT4lCiAgYWRkX21vZGVsKGxpbmVhcl9yZWcoKSkgJT4lCiAgYWRkX3JlY2lwZShtbHJfcmVjaXBlKQoKIyBkLiBGaXQgb3VyIG1vZGVsIGFjcm9zcyB0aGUgMTAtZm9sZCBjcm9zcy12YWxpZGF0aW9uCm1scl9maXRfY3YgPC0gbWxyX3dmbG93ICU+JQogIGZpdF9yZXNhbXBsZXMoa2ZvbGRzKQoKYGBgCiMjIyMgUXVlc3Rpb24gMzogCmBgYHtyfQojIENhbGN1bGF0ZSB0aGUgYXZlcmFnZSBjcm9zcy12YWxpZGF0aW9uIFJNU0UKY29sbGVjdF9tZXRyaWNzKG1scl9maXRfY3YpICU+JQogIGZpbHRlcigubWV0cmljID09ICJybXNlIikKCmBgYAojIyMjIFF1ZXN0aW9uIDQ6IApgYGB7cn0KIyBSYW5nZSBvZiBjcm9zcy12YWxpZGF0aW9uIFJNU0UgdmFsdWVzIGFjcm9zcyBhbGwgZm9sZHMKY3Zfcm1zZV9yYW5nZSA8LSBjb2xsZWN0X21ldHJpY3MobWxyX2ZpdF9jdiwgc3VtbWFyaXplID0gRkFMU0UpICU+JQogIGZpbHRlcigubWV0cmljID09ICJybXNlIikgJT4lCiAgc3VtbWFyaXplKHJhbmdlX3Jtc2UgPSBtYXgoLmVzdGltYXRlKSAtIG1pbiguZXN0aW1hdGUpKQoKY3Zfcm1zZV9yYW5nZQoKYGBgCiMjIyMgUXVlc3Rpb24gNTogCmBgYHtyfQojIENyZWF0ZSBib290c3RyYXAgc2FtcGxlIG9iamVjdApzZXQuc2VlZCgxMjMpCmJzX3NhbXBsZXMgPC0gYm9vdHN0cmFwcyhhZHZlcnRpc2luZ190cmFpbiwgdGltZXMgPSAxMCwgc3RyYXRhID0gc2FsZXMpCgojIEZpdCBvdXIgbW9kZWwgYWNyb3NzIHRoZSBib290c3RyYXBwZWQgc2FtcGxlcwptbHJfZml0X2JzIDwtIG1scl93ZmxvdyAlPiUKICBmaXRfcmVzYW1wbGVzKGJzX3NhbXBsZXMpCgpgYGAKIyMjIyBRdWVzdGlvbiA2OiAKYGBge3J9CiMgQ2FsY3VsYXRlIGF2ZXJhZ2UgYm9vdHN0cmFwIFJNU0UKbWxyX2ZpdF9icyAlPiUKICBjb2xsZWN0X21ldHJpY3Moc3VtbWFyaXplID0gRkFMU0UpICU+JQogIGZpbHRlcigubWV0cmljID09ICJybXNlIikgJT4lCiAgc3VtbWFyaXplKGF2Z19ybXNlID0gbWVhbiguZXN0aW1hdGUpKQoKYGBgCiMjIyMgUXVlc3Rpb24gNzogCmBgYHtyfQpjb2xsZWN0X21ldHJpY3MobWxyX2ZpdF9icywgc3VtbWFyaXplID0gRkFMU0UpICU+JQogIGZpbHRlcigubWV0cmljID09ICJybXNlIikgJT4lCiAgc3VtbWFyaXplKG1pbl9ybXNlID0gbWluKC5lc3RpbWF0ZSksIG1heF9ybXNlID0gbWF4KC5lc3RpbWF0ZSkpCgpgYGAKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCg==