# Load Libraries
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.2
## ✔ recipes 1.1.0
## Warning: package 'dials' was built under R version 4.4.2
## Warning: package 'infer' was built under R version 4.4.2
## Warning: package 'modeldata' was built under R version 4.4.2
## Warning: package 'parsnip' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.2
## Warning: package 'workflows' was built under R version 4.4.2
## Warning: package 'workflowsets' was built under R version 4.4.2
## Warning: package 'yardstick' was built under R version 4.4.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.2
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.4.2
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(skimr)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.2
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
# Import Data
ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Examine Data
skimr::skim(ikea)
Data summary
Name |
ikea |
Number of rows |
3694 |
Number of columns |
14 |
_______________________ |
|
Column type frequency: |
|
character |
7 |
logical |
1 |
numeric |
6 |
________________________ |
|
Group variables |
None |
Variable type: character
name |
0 |
1 |
3 |
27 |
0 |
607 |
0 |
category |
0 |
1 |
4 |
36 |
0 |
17 |
0 |
old_price |
0 |
1 |
4 |
13 |
0 |
365 |
0 |
link |
0 |
1 |
52 |
163 |
0 |
2962 |
0 |
other_colors |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
short_description |
0 |
1 |
3 |
63 |
0 |
1706 |
0 |
designer |
0 |
1 |
3 |
1261 |
0 |
381 |
0 |
Variable type: logical
sellable_online |
0 |
1 |
0.99 |
TRU: 3666, FAL: 28 |
Variable type: numeric
…1 |
0 |
1.00 |
1846.50 |
1066.51 |
0 |
923.25 |
1846.5 |
2769.75 |
3693 |
▇▇▇▇▇ |
item_id |
0 |
1.00 |
48632396.79 |
28887094.10 |
58487 |
20390574.00 |
49288078.0 |
70403572.75 |
99932615 |
▇▇▇▇▇ |
price |
0 |
1.00 |
1078.21 |
1374.65 |
3 |
180.90 |
544.7 |
1429.50 |
9585 |
▇▁▁▁▁ |
depth |
1463 |
0.60 |
54.38 |
29.96 |
1 |
38.00 |
47.0 |
60.00 |
257 |
▇▃▁▁▁ |
height |
988 |
0.73 |
101.68 |
61.10 |
1 |
67.00 |
83.0 |
124.00 |
700 |
▇▂▁▁▁ |
width |
589 |
0.84 |
104.47 |
71.13 |
1 |
60.00 |
80.0 |
140.00 |
420 |
▇▅▂▁▁ |
# Data Cleaning & Transformation
data <- ikea %>%
select(-link, -other_colors, -short_description) %>% # Remove unnecessary columns
na.omit() %>%
mutate(across(where(is.character), as.factor)) %>% # Convert character variables to factors
separate_rows(designer, sep = "/") %>% # Transform designer column
mutate(price = log(price)) # Log-transform price for normalization
# Check column names
colnames(data)
## [1] "...1" "item_id" "name" "category"
## [5] "price" "old_price" "sellable_online" "designer"
## [9] "depth" "height" "width"
Define Recipe
xgboost_recipe <- recipe(price ~ ., data = data) %>%
update_role(item_id, name, new_role = "id") %>% # Set ID columns
step_rm(any_of(c("category", "old_price", "sellable_online", "designer", "depth", "height", "width"))) %>% # Remove unnecessary columns
step_impute_median(all_numeric_predictors()) %>% # Handle missing numeric values
step_dummy(all_nominal_predictors()) %>% # Convert categorical variables to dummies
step_zv(all_predictors()) # Remove zero-variance predictors
Prepare Recipe
prep_xgboost_recipe <- prep(xgboost_recipe, training = data)
baked_data <- bake(prep_xgboost_recipe, new_data = NULL)
# Split Data into Training and Testing Sets
set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)
# Define XGBoost Model Specification
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Create Workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
Create Cross-Validation Folds
set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)
# Tune Hyperparameters
set.seed(344)
xgboost_tune <- tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 5,
metrics = metric_set(rmse, mae, rsq)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
Select Best Model
best_params <- select_best(xgboost_tune, metric = "rmse")
Finalize Model with Best Parameters
final_xgboost <- finalize_workflow(xgboost_workflow, best_params)
Fit Final Model on Training Data
final_model <- fit(final_xgboost, data = train_data)
Evaluate on Test Data
final_results <- predict(final_model, new_data = test_data) %>%
bind_cols(test_data) %>%
metrics(truth = price, estimate = .pred)
## New names:
## • `...1` -> `...2`