Import Data
ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Define Recipe
xgboost_recipe <- recipe(price ~ ., data = data) %>%
update_role(item_id, name, new_role = "id") %>%
step_rm(any_of(c("category", "old_price", "sellable_online", "designer", "depth", "height", "width"))) %>%
step_impute_median(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
Split Data into Training and Testing Sets
set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)
Define XGBoost Model Specification
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
Create Workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
Create Cross-Validation Folds
set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)
Tune Hyperparameters
set.seed(344)
xgboost_tune <- tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 5,
metrics = metric_set(rmse, mae, rsq)
)
Select Best Model
best_params <- select_best(xgboost_tune, metric = "rmse")
Finalize Model with Best Parameters
final_xgboost <- finalize_workflow(xgboost_workflow, best_params)
Fit Final Model on Training Data
final_model <- fit(final_xgboost, data = train_data)
Evaluate on Test Data
predictions <- predict(final_model, new_data = test_data) %>%
bind_cols(test_data)
## New names:
## • `...1` -> `...2`
final_metrics <- predictions %>%
metrics(truth = price, estimate = .pred)
Scatter Plot of Predicted vs Actual Prices
predictions %>%
ggplot(aes(x = price, y = .pred)) +
geom_point(alpha = 0.3, color = "blue") +
geom_abline(lty = 2, color = "gray50") +
labs(title = "Predicted vs Actual Prices", x = "Actual Price", y = "Predicted Price") +
theme_minimal()

Make Predictions on Test Dataset
test_predictions <- predict(final_model, new_data = test_data)
test_predictions
## # A tibble: 543 × 1
## .pred
## <dbl>
## 1 5.00
## 2 6.14
## 3 6.26
## 4 6.22
## 5 6.18
## 6 6.66
## 7 6.66
## 8 6.66
## 9 6.63
## 10 6.63
## # ℹ 533 more rows