Click here to read the data manually.
ikea <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv')
skimr::skim(ikea)
| Name | ikea |
| Number of rows | 3694 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| logical | 1 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name | 0 | 1 | 3 | 27 | 0 | 607 | 0 |
| category | 0 | 1 | 4 | 36 | 0 | 17 | 0 |
| old_price | 0 | 1 | 4 | 13 | 0 | 365 | 0 |
| link | 0 | 1 | 52 | 163 | 0 | 2962 | 0 |
| other_colors | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| short_description | 0 | 1 | 3 | 63 | 0 | 1706 | 0 |
| designer | 0 | 1 | 3 | 1261 | 0 | 381 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| sellable_online | 0 | 1 | 0.99 | TRU: 3666, FAL: 28 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| …1 | 0 | 1.00 | 1846.50 | 1066.51 | 0 | 923.25 | 1846.5 | 2769.75 | 3693 | ▇▇▇▇▇ |
| item_id | 0 | 1.00 | 48632396.79 | 28887094.10 | 58487 | 20390574.00 | 49288078.0 | 70403572.75 | 99932615 | ▇▇▇▇▇ |
| price | 0 | 1.00 | 1078.21 | 1374.65 | 3 | 180.90 | 544.7 | 1429.50 | 9585 | ▇▁▁▁▁ |
| depth | 1463 | 0.60 | 54.38 | 29.96 | 1 | 38.00 | 47.0 | 60.00 | 257 | ▇▃▁▁▁ |
| height | 988 | 0.73 | 101.68 | 61.10 | 1 | 67.00 | 83.0 | 124.00 | 700 | ▇▂▁▁▁ |
| width | 589 | 0.84 | 104.47 | 71.13 | 1 | 60.00 | 80.0 | 140.00 | 420 | ▇▅▂▁▁ |
data <- ikea %>%
# Treat missing values
filter(!is.na(height), !is.na(width), !is.na(depth)) %>%
mutate(across(is.logical, as.factor)) %>%
# Remove unwanted columns
select(-...1, -link, -old_price, -designer) %>%
# log transform variables with positively skewed distributions
mutate(price = log(price))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.logical, as.factor)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
## # Was:
## data %>% select(is.logical)
##
## # Now:
## data %>% select(where(is.logical))
Products with larger dimensions, especially those with widths over 161.5 cm and depths over 60 cm, show the strongest positive correlations with higher price ranges. The width variable (161.5 cm and above) shows a correlation of 0.579, while the depth variable (over 60 cm) has a correlation of 0.447, indicating that larger-sized furniture tends to be more expensive.
The category ‘Sofas & Armchairs’ stands out with a correlation of 0.379 with higher prices, suggesting that items in this category are generally priced higher. Similarly, ‘Wardrobes’ also show a positive association with price (0.279). Conversely, categories like ‘Children’s Furniture’ (-0.119) and ‘TV & Media Furniture’ (0.0069) show weaker or negative associations with price, suggesting that products in these categories tend to be less expensive.
Products taller than 171 cm show a somewhat positive correlation with price (0.232), while shorter products (less than 71 cm in height) are negatively correlated with price (-0.277). This suggests that taller products are more likely to be associated with higher prices, reflecting an emphasis on height as a cost factor.
The presence of other colors slightly impacts the price, with products available in multiple colors positively correlated (0.136), indicating that these items might cater to broader consumer preferences and therefore command higher prices.
Products that are sellable online show a slight positive correlation with price (0.048), reinforcing that items with online accessibility might be positioned at higher price points, possibly due to broader market reach and convenience factors.
# Step 1: Prepare data by binarising data
binarized_table <- data %>%
select(price, height, width, depth, category, other_colors, sellable_online) %>%
binarize()
binarized_table %>% glimpse()
## Rows: 1,899
## Columns: 35
## $ `price__-Inf_5.68697535633982` <dbl> 1, 1, 0, 1, 1, 1, 0, …
## $ price__5.68697535633982_6.52209279817015 <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ price__6.52209279817015_7.37085996851068 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ price__7.37085996851068_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `height__-Inf_71` <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ height__71_92 <dbl> 0, 0, 1, 0, 0, 0, 0, …
## $ height__92_171 <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ height__171_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `width__-Inf_60` <dbl> 1, 0, 1, 1, 1, 1, 1, …
## $ width__60_93 <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ width__93_161.5 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ width__161.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `depth__-Inf_40` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ depth__40_47 <dbl> 0, 0, 1, 1, 1, 1, 1, …
## $ depth__47_60 <dbl> 1, 1, 0, 0, 0, 0, 0, …
## $ depth__60_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Bar_furniture <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Beds <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Bookcases_&_shelving_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Cabinets_&_cupboards` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Chairs <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Chests_of_drawers_&_drawer_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Children's_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Nursery_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Outdoor_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sideboards,_buffets_&_console_tables` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sofas_&_armchairs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Tables_&_desks` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__TV_&_media_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Wardrobes <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ other_colors__No <dbl> 0, 1, 1, 1, 1, 1, 1, …
## $ other_colors__Yes <dbl> 1, 0, 0, 0, 0, 0, 0, …
## $ sellable_online__TRUE <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `sellable_online__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate the data
corr_tbl <- binarized_table %>%
correlate(price__7.37085996851068_Inf)
corr_tbl
## # A tibble: 35 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 price 7.37085996851068_Inf 1
## 2 width 161.5_Inf 0.579
## 3 depth 60_Inf 0.447
## 4 category Sofas_&_armchairs 0.379
## 5 width -Inf_60 -0.374
## 6 price -Inf_5.68697535633982 -0.336
## 7 price 6.52209279817015_7.37085996851068 -0.333
## 8 price 5.68697535633982_6.52209279817015 -0.331
## 9 category Wardrobes 0.279
## 10 height -Inf_71 -0.277
## # ℹ 25 more rows
# Step 3: Plot
corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Based on the boxplot and histogram below, we can see that the category ‘wardrobes’ is on average the most expensive, while Children’s furniture is on average the least expensive. When saying this, we must state that prices greatly fluctuate in some categories, as we can see in the TV & media furnitures category.
data %>%
ggplot(aes(price, as.factor(category))) +
geom_boxplot()
category <- ikea %>%
select(price, category) %>%
mutate(price = log(price))
# Calculate average price per category
category_avg_price <- category %>%
group_by(category) %>%
summarise(avg_price = mean(price, na.rm = TRUE)) %>%
ungroup()
# Plot bar plot of average prices per category
ggplot(category_avg_price, aes(x = reorder(category, avg_price), y = avg_price)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "Average Price per Product Category",
x = "Product Category",
y = "Average Price (log-transformed)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The next boxplots tells us how the price of a product is correlated to the fact if that product is also sold in different colors. We can see that when a product is also available in different colors, the price on average is slightly more than if not.
data %>%
ggplot(aes(price, as.factor(other_colors))) +
geom_boxplot()
Similar to previous predictor, we can see that when a product is also sold online, and thus more available to consumers, the average price of that product is more expensive. We can also see, in this case, products that are also sold online vary less in price than the products that are not sold online.
data %>%
ggplot(aes(price, sellable_online)) +
geom_boxplot()
Finally, we look at the short description of these products. The diagram shows us that when the word ‘sliding’ is included in the short description of a product, it is likely to be more expensive than when the word ‘conference’ is included.
data %>%
# tokenize title
unnest_tokens(output = word, input = short_description) %>%
# calculate average price per word
group_by(word) %>%
summarise(price = mean(price),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = price, n = 20) %>%
# Plot
ggplot(aes(price, fct_reorder(word, price))) +
geom_point() +
labs(y = "Words in Short Description",
x = "Average Price when Word is Included")
Split Data
# data <- sample_n(data, 100)
set.seed(1234)
ikea_split <- rsample::initial_split(data, prop = 0.75)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)
Preprocess Data
# Specify recipe
ikea_recipe <-
recipe(price ~ ., data = ikea_train) %>%
update_role(item_id, new_role = "id variable") %>%
step_tokenize(short_description) %>%
step_tokenfilter(short_description, max_tokens = 100) %>%
step_tfidf(short_description) %>%
step_other(category, name) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_YeoJohnson(width, height, depth) %>%
step_impute_knn(all_predictors())
ikea_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 1,424
## Columns: 121
## $ item_id <dbl> 19282962, 29320911, 49276549…
## $ depth <dbl> 6.921938, 5.331305, 4.942159…
## $ height <dbl> 12.95187, 21.33670, 14.11837…
## $ width <dbl> 9.034855, 7.442056, 5.426853…
## $ price <dbl> 7.595890, 7.833996, 6.429719…
## $ tfidf_short_description_1 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x40x64 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_140x200 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_147x147 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_150x44x236 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_150x60x236 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_150x66x236 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_2 <dbl> 0.8840660, 0.0000000, 0.5304…
## $ tfidf_short_description_200x66x236 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_25x51x70 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_3 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_4 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_41x101 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_41x61 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_5 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_50x51x70 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x50x128 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_61x101 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_74 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_75 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_80x30x202 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_99x44x56 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_and <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_armchair <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_armrest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_armrests <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_backrest <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_bar <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_baskets <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_bed <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_bench <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_bookcase <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_box <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_cabinet <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_cabinets <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_castors <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_chair <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_chaise <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_changing <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_chest <dbl> 0.0000000, 0.0000000, 0.5867…
## $ `tfidf_short_description_children's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_clothes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_cm <dbl> 0.0000000, 0.2905183, 0.1743…
## $ tfidf_short_description_combination <dbl> 0.0000000, 0.6909039, 0.0000…
## $ tfidf_short_description_corner <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_cover <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_desk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_door <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_doors <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_drawer <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_drawers <dbl> 0.0000000, 0.0000000, 0.5004…
## $ tfidf_short_description_feet <dbl> 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_short_description_foldable <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_for <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_frame <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_glass <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_highchair <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_in <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_inserts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_junior <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_leg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_legs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_lock <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_longue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_mesh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_modular <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_module <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_mounted <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_of <dbl> 0.0000000, 0.0000000, 0.5753…
## $ tfidf_short_description_on <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_outdoor <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_panel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_plinth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_rail <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_seat <dbl> 0.7549716, 0.0000000, 0.0000…
## $ tfidf_short_description_section <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_sections <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_shelf <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_shelves <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_shelving <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_sliding <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_smart <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_sofa <dbl> 0.7660576, 0.0000000, 0.0000…
## $ tfidf_short_description_step <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_stool <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_storage <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_table <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_top <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_tv <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_underframe <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_unit <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_upright <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_w <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_wall <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_wardrobe <dbl> 0.0000000, 0.7815769, 0.0000…
## $ tfidf_short_description_wire <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_with <dbl> 0.0000000, 0.0000000, 0.0000…
## $ name_BESTÅ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PAX <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ category_Bookcases...shelving.units <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ category_Cabinets...cupboards <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0…
## $ category_Chairs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Chests.of.drawers...drawer.units <dbl> 0, 0, 1, 0, 1, 0, 1, 0, 1, 0…
## $ category_Sofas...armchairs <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Tables...desks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_TV...media.furniture <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Wardrobes <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_other <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ sellable_online_FALSE. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sellable_online_TRUE. <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ other_colors_No <dbl> 1, 1, 0, 1, 1, 0, 0, 0, 1, 0…
## $ other_colors_Yes <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 0, 1…
# Specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(ikea_recipe) %>%
add_model(xgboost_spec)
# Create cross-validation folds
set.seed(2345)
ikea_cv <- vfold_cv(ikea_train, v = 5)
# Perform hyperparameter tuning
set.seed(3456)
xgboost_tune <-
tune_grid(
xgboost_workflow,
resamples = ikea_cv,
grid = 5
)
## Warning: package 'xgboost' was built under R version 4.3.3
The model performed well in predicting IKEA product prices. With an RMSE of 0.364, prediction errors are minimal, and an R-squared of 0.914 shows the model explains 91.4% of price variance. Cross-validation results confirm consistent performance, and the geom_point plot shows predictions align closely with actual values. Overall, the model is reliable for price prediction.
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 88 1637 28 0.0878 rmse standard 0.419 5 0.0163 Preproces…
## 2 40 418 22 0.138 rmse standard 0.426 5 0.0192 Preproces…
## 3 114 848 11 0.00770 rmse standard 0.448 5 0.0188 Preproces…
## 4 58 1543 37 0.0105 rmse standard 0.453 5 0.0172 Preproces…
## 5 13 70 3 0.00225 rmse standard 5.20 5 0.0265 Preproces…
# Update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
ikea_fit <- tune::last_fit(xgboost_fw, ikea_split)
tune::collect_metrics(ikea_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.364 Preprocessor1_Model1
## 2 rsq standard 0.914 Preprocessor1_Model1
tune::collect_predictions(ikea_fit) %>%
ggplot(aes(x = price, y = .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed() +
labs(
title = "Predicted vs Actual Prices",
x = "Actual Price (log-transformed)",
y = "Predicted Price"
)
# Predict values on the test dataset
ikea_test_predictions <- predict(ikea_fit$.workflow[[1]], new_data = ikea_test)
# Combine predicted and actual prices from the test set
ikea_results <- ikea_test %>%
select(price) %>%
bind_cols(ikea_test_predictions)
# Scatter plot of predicted vs actual prices for test data
ikea_results %>%
ggplot(aes(x = price, y = .pred)) +
geom_point(alpha = 0.3, color = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed() +
labs(
title = "Predicted vs Actual Prices (Test Data)",
x = "Actual Price (log-transformed)",
y = "Predicted Price"
)