Packages required:

library(tidymodels)
library(tidyverse)
library(AmesHousing)

Part 1

Import and Generalization RMSE

# a. Import the boston.csv data
boston <- read_csv("C:/Users/thanh/OneDrive - University of Cincinnati/Nie/F24_Data mining/boston_Lab 10.csv")
Rows: 506 Columns: 16── Column specification ────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptrat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# b. Create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

# c. Fit a multiple linear regression model using all predictors
boston_lm1 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)

# d. Compute the RMSE on the test data
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
# b. Create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

# c. Fit a multiple linear regression model using all predictors
boston_lm1 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)

# d. Compute the RMSE on the test data
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
NA

Question 2:

# a. Create a recipe
mlr_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# b. Create a workflow object that contains the model and recipe
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

# c. Train the model
mlr_fit <- mlr_wflow %>%
  fit(data = boston_train)

# d. Compute the RMSE on the test data
mlr_fit %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
NA

Question 3:

ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Remove trouble variables
trbl_vars <- c("MS_SubClass", "Condition_2", "Exterior_1st",
"Exterior_2nd", "Misc_Feature")
ames_train_subset <- ames_train %>%
select(-trbl_vars)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(trbl_vars)

# Now:
data %>% select(all_of(trbl_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Train the model without the above trouble variables
ames_lm1 <- linear_reg() %>%
fit(Sale_Price ~ ., data = ames_train_subset)
# Compute test data generalization RMSE
ames_lm1 %>%
predict(ames_test) %>%
bind_cols(ames_test) %>%
rmse(truth = Sale_Price, estimate = .pred)
Warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

# a. Create a recipe with novel level lumping
mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal_predictors(), threshold = 0.01, other = "other")

# b. Create a workflow and train model
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit(data = ames_train)

# c. Compute the RMSE on the test data
mlr_fit %>%
  predict(ames_test) %>%
  bind_cols(ames_test %>% select(Sale_Price)) %>%
  rmse(truth = Sale_Price, estimate =.pred)
Warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")

Part 2

Question 1:

# a. import the Advertising.csv data
advertising <- read_csv("C:/Users/thanh/OneDrive - University of Cincinnati/Nie/F24_Data mining/Advertising_Lab 10.csv")
Rows: 200 Columns: 4── Column specification ────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (4): TV, radio, newspaper, sales
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# b. create a 70-30 train/test dataset split
set.seed(123)
split <- initial_split(advertising, prop = 0.7, strata = sales)
advertising_train <- training(split)
advertising_test <- testing(split)

Question 2:

# a. Create 10-fold cross-validation object
set.seed(123)
kfolds <- vfold_cv(advertising_train, v = 10, strata = sales)

# b. Create recipe with no feature engineering steps
mlr_recipe <- recipe(sales ~ ., data = advertising_train)

# c. Create workflow object with model and recipe
mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

# d. Fit our model across the 10-fold cross-validation
mlr_fit_cv <- mlr_wflow %>%
  fit_resamples(kfolds)

Question 3:

# Calculate the average cross-validation RMSE
collect_metrics(mlr_fit_cv) %>%
  filter(.metric == "rmse")
NA

Question 4:

# Range of cross-validation RMSE values across all folds
cv_rmse_range <- collect_metrics(mlr_fit_cv, summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(range_rmse = max(.estimate) - min(.estimate))

cv_rmse_range
NA

Question 5:

# Create bootstrap sample object
set.seed(123)
bs_samples <- bootstraps(advertising_train, times = 10, strata = sales)

# Fit our model across the bootstrapped samples
mlr_fit_bs <- mlr_wflow %>%
  fit_resamples(bs_samples)

Question 6:

# Calculate average bootstrap RMSE
mlr_fit_bs %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(avg_rmse = mean(.estimate))
NA

Question 7:

collect_metrics(mlr_fit_bs, summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarize(min_rmse = min(.estimate), max_rmse = max(.estimate))
NA
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQojIyMjIFBhY2thZ2VzIHJlcXVpcmVkOiANCmBgYHtyIGVjaG89VFJVRSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCmxpYnJhcnkodGlkeW1vZGVscykNCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShBbWVzSG91c2luZykNCmBgYA0KIyMjIFBhcnQgMSANCiMjIyMgSW1wb3J0IGFuZCBHZW5lcmFsaXphdGlvbiBSTVNFDQpgYGB7cn0NCiMgYS4gSW1wb3J0IHRoZSBib3N0b24uY3N2IGRhdGENCmJvc3RvbiA8LSByZWFkX2NzdigiQzovVXNlcnMvdGhhbmgvT25lRHJpdmUgLSBVbml2ZXJzaXR5IG9mIENpbmNpbm5hdGkvTmllL0YyNF9EYXRhIG1pbmluZy9ib3N0b25fTGFiIDEwLmNzdiIpDQoNCiMgYi4gQ3JlYXRlIGEgNzAtMzAgdHJhaW4vdGVzdCBkYXRhc2V0IHNwbGl0DQpzZXQuc2VlZCgxMjMpDQpzcGxpdCA8LSBpbml0aWFsX3NwbGl0KGJvc3RvbiwgcHJvcCA9IDAuNywgc3RyYXRhID0gY21lZHYpDQpib3N0b25fdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQpib3N0b25fdGVzdCA8LSB0ZXN0aW5nKHNwbGl0KQ0KDQojIGMuIEZpdCBhIG11bHRpcGxlIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsIHVzaW5nIGFsbCBwcmVkaWN0b3JzDQpib3N0b25fbG0xIDwtIGxpbmVhcl9yZWcoKSAlPiUNCiAgZml0KGNtZWR2IH4gLiwgZGF0YSA9IGJvc3Rvbl90cmFpbikNCg0KIyBkLiBDb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGENCmJvc3Rvbl9sbTEgJT4lDQogIHByZWRpY3QoYm9zdG9uX3Rlc3QpICU+JQ0KICBiaW5kX2NvbHMoYm9zdG9uX3Rlc3QgJT4lIHNlbGVjdChjbWVkdikpICU+JQ0KICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpDQpgYGANCg0KYGBge3J9DQojIGIuIENyZWF0ZSBhIDcwLTMwIHRyYWluL3Rlc3QgZGF0YXNldCBzcGxpdA0Kc2V0LnNlZWQoMTIzKQ0Kc3BsaXQgPC0gaW5pdGlhbF9zcGxpdChib3N0b24sIHByb3AgPSAwLjcsIHN0cmF0YSA9IGNtZWR2KQ0KYm9zdG9uX3RyYWluIDwtIHRyYWluaW5nKHNwbGl0KQ0KYm9zdG9uX3Rlc3QgPC0gdGVzdGluZyhzcGxpdCkNCg0KIyBjLiBGaXQgYSBtdWx0aXBsZSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCB1c2luZyBhbGwgcHJlZGljdG9ycw0KYm9zdG9uX2xtMSA8LSBsaW5lYXJfcmVnKCkgJT4lDQogIGZpdChjbWVkdiB+IC4sIGRhdGEgPSBib3N0b25fdHJhaW4pDQoNCiMgZC4gQ29tcHV0ZSB0aGUgUk1TRSBvbiB0aGUgdGVzdCBkYXRhDQpib3N0b25fbG0xICU+JQ0KICBwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUNCiAgYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUNCiAgcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQ0KDQpgYGANCiMjIyMgUXVlc3Rpb24gMjogDQpgYGB7cn0NCiMgYS4gQ3JlYXRlIGEgcmVjaXBlDQptbHJfcmVjaXBlIDwtIHJlY2lwZShjbWVkdiB+IC4sIGRhdGEgPSBib3N0b25fdHJhaW4pICU+JQ0KICBzdGVwX1llb0pvaG5zb24oYWxsX251bWVyaWNfcHJlZGljdG9ycygpKSAlPiUNCiAgc3RlcF9ub3JtYWxpemUoYWxsX251bWVyaWNfcHJlZGljdG9ycygpKQ0KDQojIGIuIENyZWF0ZSBhIHdvcmtmbG93IG9iamVjdCB0aGF0IGNvbnRhaW5zIHRoZSBtb2RlbCBhbmQgcmVjaXBlDQptbHJfd2Zsb3cgPC0gd29ya2Zsb3coKSAlPiUNCiAgYWRkX21vZGVsKGxpbmVhcl9yZWcoKSkgJT4lDQogIGFkZF9yZWNpcGUobWxyX3JlY2lwZSkNCg0KIyBjLiBUcmFpbiB0aGUgbW9kZWwNCm1scl9maXQgPC0gbWxyX3dmbG93ICU+JQ0KICBmaXQoZGF0YSA9IGJvc3Rvbl90cmFpbikNCg0KIyBkLiBDb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGENCm1scl9maXQgJT4lDQogIHByZWRpY3QoYm9zdG9uX3Rlc3QpICU+JQ0KICBiaW5kX2NvbHMoYm9zdG9uX3Rlc3QgJT4lIHNlbGVjdChjbWVkdikpICU+JQ0KICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiAzOiANCmBgYHtyfQ0KYW1lcyA8LSBBbWVzSG91c2luZzo6bWFrZV9hbWVzKCkNCnNldC5zZWVkKDEyMykgIyBmb3IgcmVwcm9kdWNpYmlsaXR5DQpzcGxpdCA8LSBpbml0aWFsX3NwbGl0KGFtZXMsIHByb3AgPSAwLjcsIHN0cmF0YSA9ICJTYWxlX1ByaWNlIikNCmFtZXNfdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQphbWVzX3Rlc3QgPC0gdGVzdGluZyhzcGxpdCkNCiMgUmVtb3ZlIHRyb3VibGUgdmFyaWFibGVzDQp0cmJsX3ZhcnMgPC0gYygiTVNfU3ViQ2xhc3MiLCAiQ29uZGl0aW9uXzIiLCAiRXh0ZXJpb3JfMXN0IiwNCiJFeHRlcmlvcl8ybmQiLCAiTWlzY19GZWF0dXJlIikNCmFtZXNfdHJhaW5fc3Vic2V0IDwtIGFtZXNfdHJhaW4gJT4lDQpzZWxlY3QoLXRyYmxfdmFycykNCiMgVHJhaW4gdGhlIG1vZGVsIHdpdGhvdXQgdGhlIGFib3ZlIHRyb3VibGUgdmFyaWFibGVzDQphbWVzX2xtMSA8LSBsaW5lYXJfcmVnKCkgJT4lDQpmaXQoU2FsZV9QcmljZSB+IC4sIGRhdGEgPSBhbWVzX3RyYWluX3N1YnNldCkNCiMgQ29tcHV0ZSB0ZXN0IGRhdGEgZ2VuZXJhbGl6YXRpb24gUk1TRQ0KYW1lc19sbTEgJT4lDQpwcmVkaWN0KGFtZXNfdGVzdCkgJT4lDQpiaW5kX2NvbHMoYW1lc190ZXN0KSAlPiUNCnJtc2UodHJ1dGggPSBTYWxlX1ByaWNlLCBlc3RpbWF0ZSA9IC5wcmVkKQ0KDQphbWVzIDwtIEFtZXNIb3VzaW5nOjptYWtlX2FtZXMoKQ0Kc2V0LnNlZWQoMTIzKSAjIGZvciByZXByb2R1Y2liaWxpdHkNCnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYW1lcywgcHJvcCA9IDAuNywgc3RyYXRhID0gIlNhbGVfUHJpY2UiKQ0KYW1lc190cmFpbiA8LSB0cmFpbmluZyhzcGxpdCkNCmFtZXNfdGVzdCA8LSB0ZXN0aW5nKHNwbGl0KQ0KDQojIGEuIENyZWF0ZSBhIHJlY2lwZSB3aXRoIG5vdmVsIGxldmVsIGx1bXBpbmcNCm1scl9yZWNpcGUgPC0gcmVjaXBlKFNhbGVfUHJpY2UgfiAuLCBkYXRhID0gYW1lc190cmFpbikgJT4lDQogIHN0ZXBfb3RoZXIoYWxsX25vbWluYWxfcHJlZGljdG9ycygpLCB0aHJlc2hvbGQgPSAwLjAxLCBvdGhlciA9ICJvdGhlciIpDQoNCiMgYi4gQ3JlYXRlIGEgd29ya2Zsb3cgYW5kIHRyYWluIG1vZGVsDQptbHJfd2Zsb3cgPC0gd29ya2Zsb3coKSAlPiUNCiAgYWRkX21vZGVsKGxpbmVhcl9yZWcoKSkgJT4lDQogIGFkZF9yZWNpcGUobWxyX3JlY2lwZSkNCg0KbWxyX2ZpdCA8LSBtbHJfd2Zsb3cgJT4lDQogIGZpdChkYXRhID0gYW1lc190cmFpbikNCg0KIyBjLiBDb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGENCm1scl9maXQgJT4lDQogIHByZWRpY3QoYW1lc190ZXN0KSAlPiUNCiAgYmluZF9jb2xzKGFtZXNfdGVzdCAlPiUgc2VsZWN0KFNhbGVfUHJpY2UpKSAlPiUNCiAgcm1zZSh0cnV0aCA9IFNhbGVfUHJpY2UsIGVzdGltYXRlID0ucHJlZCkNCmBgYA0KDQojIyMgUGFydCAyDQojIyMjIFF1ZXN0aW9uIDE6IA0KYGBge3J9DQojIGEuIGltcG9ydCB0aGUgQWR2ZXJ0aXNpbmcuY3N2IGRhdGENCmFkdmVydGlzaW5nIDwtIHJlYWRfY3N2KCJDOi9Vc2Vycy90aGFuaC9PbmVEcml2ZSAtIFVuaXZlcnNpdHkgb2YgQ2luY2lubmF0aS9OaWUvRjI0X0RhdGEgbWluaW5nL0FkdmVydGlzaW5nX0xhYiAxMC5jc3YiKQ0KIyBiLiBjcmVhdGUgYSA3MC0zMCB0cmFpbi90ZXN0IGRhdGFzZXQgc3BsaXQNCnNldC5zZWVkKDEyMykNCnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYWR2ZXJ0aXNpbmcsIHByb3AgPSAwLjcsIHN0cmF0YSA9IHNhbGVzKQ0KYWR2ZXJ0aXNpbmdfdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQphZHZlcnRpc2luZ190ZXN0IDwtIHRlc3Rpbmcoc3BsaXQpDQpgYGANCiMjIyMgUXVlc3Rpb24gMjogDQpgYGB7cn0NCiMgYS4gQ3JlYXRlIDEwLWZvbGQgY3Jvc3MtdmFsaWRhdGlvbiBvYmplY3QNCnNldC5zZWVkKDEyMykNCmtmb2xkcyA8LSB2Zm9sZF9jdihhZHZlcnRpc2luZ190cmFpbiwgdiA9IDEwLCBzdHJhdGEgPSBzYWxlcykNCg0KIyBiLiBDcmVhdGUgcmVjaXBlIHdpdGggbm8gZmVhdHVyZSBlbmdpbmVlcmluZyBzdGVwcw0KbWxyX3JlY2lwZSA8LSByZWNpcGUoc2FsZXMgfiAuLCBkYXRhID0gYWR2ZXJ0aXNpbmdfdHJhaW4pDQoNCiMgYy4gQ3JlYXRlIHdvcmtmbG93IG9iamVjdCB3aXRoIG1vZGVsIGFuZCByZWNpcGUNCm1scl93ZmxvdyA8LSB3b3JrZmxvdygpICU+JQ0KICBhZGRfbW9kZWwobGluZWFyX3JlZygpKSAlPiUNCiAgYWRkX3JlY2lwZShtbHJfcmVjaXBlKQ0KDQojIGQuIEZpdCBvdXIgbW9kZWwgYWNyb3NzIHRoZSAxMC1mb2xkIGNyb3NzLXZhbGlkYXRpb24NCm1scl9maXRfY3YgPC0gbWxyX3dmbG93ICU+JQ0KICBmaXRfcmVzYW1wbGVzKGtmb2xkcykNCg0KYGBgDQojIyMjIFF1ZXN0aW9uIDM6IA0KYGBge3J9DQojIENhbGN1bGF0ZSB0aGUgYXZlcmFnZSBjcm9zcy12YWxpZGF0aW9uIFJNU0UNCmNvbGxlY3RfbWV0cmljcyhtbHJfZml0X2N2KSAlPiUNCiAgZmlsdGVyKC5tZXRyaWMgPT0gInJtc2UiKQ0KDQpgYGANCiMjIyMgUXVlc3Rpb24gNDogDQpgYGB7cn0NCiMgUmFuZ2Ugb2YgY3Jvc3MtdmFsaWRhdGlvbiBSTVNFIHZhbHVlcyBhY3Jvc3MgYWxsIGZvbGRzDQpjdl9ybXNlX3JhbmdlIDwtIGNvbGxlY3RfbWV0cmljcyhtbHJfZml0X2N2LCBzdW1tYXJpemUgPSBGQUxTRSkgJT4lDQogIGZpbHRlcigubWV0cmljID09ICJybXNlIikgJT4lDQogIHN1bW1hcml6ZShyYW5nZV9ybXNlID0gbWF4KC5lc3RpbWF0ZSkgLSBtaW4oLmVzdGltYXRlKSkNCg0KY3Zfcm1zZV9yYW5nZQ0KDQpgYGANCiMjIyMgUXVlc3Rpb24gNTogDQpgYGB7cn0NCiMgQ3JlYXRlIGJvb3RzdHJhcCBzYW1wbGUgb2JqZWN0DQpzZXQuc2VlZCgxMjMpDQpic19zYW1wbGVzIDwtIGJvb3RzdHJhcHMoYWR2ZXJ0aXNpbmdfdHJhaW4sIHRpbWVzID0gMTAsIHN0cmF0YSA9IHNhbGVzKQ0KDQojIEZpdCBvdXIgbW9kZWwgYWNyb3NzIHRoZSBib290c3RyYXBwZWQgc2FtcGxlcw0KbWxyX2ZpdF9icyA8LSBtbHJfd2Zsb3cgJT4lDQogIGZpdF9yZXNhbXBsZXMoYnNfc2FtcGxlcykNCg0KYGBgDQojIyMjIFF1ZXN0aW9uIDY6IA0KYGBge3J9DQojIENhbGN1bGF0ZSBhdmVyYWdlIGJvb3RzdHJhcCBSTVNFDQptbHJfZml0X2JzICU+JQ0KICBjb2xsZWN0X21ldHJpY3Moc3VtbWFyaXplID0gRkFMU0UpICU+JQ0KICBmaWx0ZXIoLm1ldHJpYyA9PSAicm1zZSIpICU+JQ0KICBzdW1tYXJpemUoYXZnX3Jtc2UgPSBtZWFuKC5lc3RpbWF0ZSkpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA3OiANCmBgYHtyfQ0KY29sbGVjdF9tZXRyaWNzKG1scl9maXRfYnMsIHN1bW1hcml6ZSA9IEZBTFNFKSAlPiUNCiAgZmlsdGVyKC5tZXRyaWMgPT0gInJtc2UiKSAlPiUNCiAgc3VtbWFyaXplKG1pbl9ybXNlID0gbWluKC5lc3RpbWF0ZSksIG1heF9ybXNlID0gbWF4KC5lc3RpbWF0ZSkpDQoNCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQo=