Apply 4: IKEA Price Prediction

#Import Data

ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')

## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

#Data Cleaning & Transformation (Including Missing Values Handling)

data <- ikea %>%
    select(-link, -other_colors, -short_description) %>%
    mutate(across(where(is.character), as.factor)) %>%
    separate_rows(designer, sep = "/") %>%
    mutate(price = log(price)) %>%
    mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))

#Define Recipe (Including All Predictors)

recipe_spec <- recipe(price ~ ., data = data) %>%
    update_role(item_id, name, new_role = "id") %>%
    step_rm(any_of(c("category", "old_price", "sellable_online", "designer", "depth", "height", "width"))) %>%
    step_impute_median(all_numeric_predictors()) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_zv(all_predictors())

#Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

#Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost")

#Define Ranger Model Specification

ranger_spec <- 
    rand_forest(trees = 500) %>% 
    set_mode("regression") %>% 
    set_engine("ranger")

#Define SVM Model Specification

svm_spec <- 
    svm_linear() %>%
    set_mode("regression") %>% 
    set_engine("kernlab")

#Create Workflows for Each Model

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(recipe_spec) %>% 
    add_model(xgboost_spec)

ranger_workflow <- 
    workflow() %>% 
    add_recipe(recipe_spec) %>% 
    add_model(ranger_spec)

svm_workflow <- 
    workflow() %>% 
    add_recipe(recipe_spec) %>% 
    add_model(svm_spec)

#Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

#Tune Hyperparameters for Each Model

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

ranger_tune <- tune_grid(
    ranger_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?

svm_tune <- tune_grid(
    svm_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?

#Select Best Models

best_xgboost_params <- select_best(xgboost_tune, metric = "rmse")
best_ranger_params <- select_best(ranger_tune, metric = "rmse")
best_svm_params <- select_best(svm_tune, metric = "rmse")

#Finalize Models with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_xgboost_params)
final_ranger <- finalize_workflow(ranger_workflow, best_ranger_params)
final_svm <- finalize_workflow(svm_workflow, best_svm_params)

#Fit Final Models on Training Data

final_xgboost_model <- fit(final_xgboost, data = train_data)
final_ranger_model <- fit(final_ranger, data = train_data)
final_svm_model <- fit(final_svm, data = train_data)

##  Setting default kernel parameters

#Evaluate Models on Test Data

xgboost_predictions <- predict(final_xgboost_model, new_data = test_data) %>%
    bind_cols(test_data)

## New names:
## • `...1` -> `...2`

ranger_predictions <- predict(final_ranger_model, new_data = test_data) %>%
    bind_cols(test_data)

## New names:
## • `...1` -> `...2`

svm_predictions <- predict(final_svm_model, new_data = test_data) %>%
    bind_cols(test_data)

## New names:
## • `...1` -> `...2`

#Calculate Metrics for Each Model

xgboost_metrics <- xgboost_predictions %>%
    metrics(truth = price, estimate = .pred)

ranger_metrics <- ranger_predictions %>%
    metrics(truth = price, estimate = .pred)

svm_metrics <- svm_predictions %>%
    metrics(truth = price, estimate = .pred)

#Print Final Model Performance

print(xgboost_metrics)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       1.11 
## 2 rsq     standard       0.447
## 3 mae     standard       0.841

print(ranger_metrics)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       1.07 
## 2 rsq     standard       0.504
## 3 mae     standard       0.742

print(svm_metrics)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      1.49  
## 2 rsq     standard      0.0351
## 3 mae     standard      1.18

#Summary Report

cat("### Summary Report\n")

## ### Summary Report

cat("**Model Comparison and Improvements:**\n")

## **Model Comparison and Improvements:**

cat("- Included more predictors and refined missing value handling.\n")

## - Included more predictors and refined missing value handling.

cat("- Implemented hyperparameter tuning to optimize model performance.\n")

## - Implemented hyperparameter tuning to optimize model performance.

cat("- Used cross-validation to reduce overfitting risks.\n")

## - Used cross-validation to reduce overfitting risks.

cat("**Performance Metrics:**\n")

## **Performance Metrics:**

cat("- XGBoost: RMSE =", xgboost_metrics %>% filter(.metric == "rmse") %>% pull(.estimate), ", R-squared =", xgboost_metrics %>% filter(.metric == "rsq") %>% pull(.estimate), "\n")

## - XGBoost: RMSE = 1.10808 , R-squared = 0.4471793

cat("- Ranger: RMSE =", ranger_metrics %>% filter(.metric == "rmse") %>% pull(.estimate), ", R-squared =", ranger_metrics %>% filter(.metric == "rsq") %>% pull(.estimate), "\n")

## - Ranger: RMSE = 1.066143 , R-squared = 0.5040618

cat("- SVM: RMSE =", svm_metrics %>% filter(.metric == "rmse") %>% pull(.estimate), ", R-squared =", svm_metrics %>% filter(.metric == "rsq") %>% pull(.estimate), "\n")

## - SVM: RMSE = 1.485275 , R-squared = 0.03511973

Apply 4: IKEA Price Prediction

Adam Cilley

2025-02-24