Import Data

ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Data Cleaning & Transformation

data <- ikea %>%
    select(-link, -other_colors, -short_description) %>%
    na.omit() %>%
    mutate(across(where(is.character), as.factor)) %>%
    separate_rows(designer, sep = "/") %>%
    mutate(price = log(price))

Define Recipe

xgboost_recipe <- recipe(price ~ ., data = data) %>%
    update_role(item_id, name, new_role = "id") %>%
    step_rm(any_of(c("category", "old_price", "sellable_online", "designer", "depth", "height", "width"))) %>%
    step_impute_median(all_numeric_predictors()) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_zv(all_predictors())

Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost") 

Create Workflow

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec) 

Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

Tune Hyperparameters

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Fit Final Model on Training Data

final_model <- fit(final_xgboost, data = train_data)

Evaluate on Test Data

predictions <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data)
## New names:
## • `...1` -> `...2`
final_metrics <- predictions %>%
    metrics(truth = price, estimate = .pred)

Scatter Plot of Predicted vs Actual Prices

predictions %>%
    ggplot(aes(x = price, y = .pred)) +
    geom_point(alpha = 0.3, color = "blue") +
    geom_abline(lty = 2, color = "gray50") +
    labs(title = "Predicted vs Actual Prices", x = "Actual Price", y = "Predicted Price") +
    theme_minimal()

Make Predictions on Test Dataset

test_predictions <- predict(final_model, new_data = test_data)
test_predictions
## # A tibble: 543 × 1
##    .pred
##    <dbl>
##  1  5.00
##  2  6.14
##  3  6.26
##  4  6.22
##  5  6.18
##  6  6.66
##  7  6.66
##  8  6.66
##  9  6.63
## 10  6.63
## # ℹ 533 more rows