Import Data

ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')

## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Data Cleaning & Transformation

data <- ikea %>%
    select(-link, -other_colors, -short_description) %>%
    na.omit() %>%
    mutate(across(where(is.character), as.factor)) %>%
    separate_rows(designer, sep = "/") %>%
    mutate(price = log(price))

Define Recipe

xgboost_recipe <- recipe(price ~ ., data = data) %>%
    update_role(item_id, name, new_role = "id") %>%
    step_rm(any_of(c("category", "old_price", "sellable_online", "designer", "depth", "height", "width"))) %>%
    step_impute_median(all_numeric_predictors()) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_zv(all_predictors())

Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost")

Create Workflow

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec)

Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

Tune Hyperparameters

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Fit Final Model on Training Data

final_model <- fit(final_xgboost, data = train_data)

Evaluate on Test Data

predictions <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data)

## New names:
## • `...1` -> `...2`

final_metrics <- predictions %>%
    metrics(truth = price, estimate = .pred)

Print Final Model Performance

print(final_metrics)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.939
## 2 rsq     standard       0.416
## 3 mae     standard       0.697

Scatter Plot of Predicted vs Actual Prices

predictions %>%
    ggplot(aes(x = price, y = .pred)) +
    geom_point(alpha = 0.3, color = "blue") +
    geom_abline(lty = 2, color = "gray50") +
    labs(title = "Predicted vs Actual Prices", x = "Actual Price", y = "Predicted Price") +
    theme_minimal()

Make Predictions on Test Dataset

test_predictions <- predict(final_model, new_data = test_data)
test_predictions

## # A tibble: 543 × 1
##    .pred
##    <dbl>
##  1  5.00
##  2  6.14
##  3  6.26
##  4  6.22
##  5  6.18
##  6  6.66
##  7  6.66
##  8  6.66
##  9  6.63
## 10  6.63
## # ℹ 533 more rows

Apply 3: IKEA Price Prediction

Adam Cilley

2025-02-18

Import Data

Data Cleaning & Transformation

Define Recipe

Split Data into Training and Testing Sets

Define XGBoost Model Specification

Create Workflow

Create Cross-Validation Folds

Tune Hyperparameters

Select Best Model

Finalize Model with Best Parameters

Fit Final Model on Training Data

Evaluate on Test Data

Print Final Model Performance

Scatter Plot of Predicted vs Actual Prices

Make Predictions on Test Dataset