library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Import data
ikea <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv")
## New names:
## • `` -> `...1`
## Rows: 3694 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, category, old_price, link, other_colors, short_description, d...
## dbl (6): ...1, item_id, price, depth, height, width
## lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ikea %>%
    select(...1, price, depth:width) %>%
    pivot_longer(depth:width, names_to = "dim") %>%
    ggplot(aes(value, price, color = dim)) +
    geom_point(alpha = 0.4, show.legend = FALSE) +
    scale_y_log10() +
    facet_wrap(~ dim, scales = "free_x") +
    labs(x = NULL)
## Warning: Removed 3040 rows containing missing values (`geom_point()`).

ikea_df <- ikea %>%
    select(price, name, category, depth, height, width) %>%
    mutate(price = log10(price)) %>%
    mutate_if(is.character, factor)

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
ikea_split <- initial_split(ikea_df, strata = price)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)

set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price)
ikea_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits              id         
##    <list>              <chr>      
##  1 <split [2770/994]>  Bootstrap01
##  2 <split [2770/1003]> Bootstrap02
##  3 <split [2770/1037]> Bootstrap03
##  4 <split [2770/1010]> Bootstrap04
##  5 <split [2770/1014]> Bootstrap05
##  6 <split [2770/1007]> Bootstrap06
##  7 <split [2770/1036]> Bootstrap07
##  8 <split [2770/1016]> Bootstrap08
##  9 <split [2770/1021]> Bootstrap09
## 10 <split [2770/1043]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <- 
##   recipe(formula = price ~ ., data = ikea_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("regression") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(67013)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <-
  recipe(formula = price ~ ., data = ikea_train) %>%
  # Pool rare levels of name and category into an "other" level
  step_other(name, category, threshold = 0.01) %>%
  # Clean up the factor level names
  step_clean_levels(name, category) %>%
  # Impute missing dimension values with k-nearest neighbors
  step_impute_knn(depth, height, width)

ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_mode("regression") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
  tune_grid(ranger_workflow,
    resamples = ikea_folds,
    grid = 11
  )
## i Creating pre-processing data to finalize unknown parameter: mtry

Explore results

show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rmse    standard   0.340    25 0.00204 Preprocessor1_Model10
## 2     4    10 rmse    standard   0.348    25 0.00225 Preprocessor1_Model05
## 3     5     6 rmse    standard   0.349    25 0.00234 Preprocessor1_Model06
## 4     3    18 rmse    standard   0.350    25 0.00219 Preprocessor1_Model01
## 5     2    21 rmse    standard   0.352    25 0.00200 Preprocessor1_Model08
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rsq     standard   0.726    25 0.00336 Preprocessor1_Model10
## 2     4    10 rsq     standard   0.713    25 0.00371 Preprocessor1_Model05
## 3     5     6 rsq     standard   0.711    25 0.00381 Preprocessor1_Model06
## 4     3    18 rsq     standard   0.709    25 0.00370 Preprocessor1_Model01
## 5     2    21 rsq     standard   0.708    25 0.00348 Preprocessor1_Model08
autoplot(ranger_tune)

final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_other()
## • step_clean_levels()
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 4
## 
## Computational engine: ranger
ikea_fit <- last_fit(final_rf, ikea_split)
ikea_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [2770/924]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(ikea_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.318 Preprocessor1_Model1
## 2 rsq     standard       0.752 Preprocessor1_Model1

Graph test data

collect_predictions(ikea_fit) %>%
  ggplot(aes(price, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

For future predictions

predict(ikea_fit$.workflow[[1]], ikea_test[15, ])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  2.42
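
The model was trained on log10(price), so this prediction is on the log scale; 10^2.42 is roughly 263 in the original price units. A minimal sketch of back-transforming, reusing the fitted workflow from above:

predict(ikea_fit$.workflow[[1]], ikea_test[15, ]) %>%
  # predictions are log10(price); undo the transform
  mutate(.pred_price = 10^.pred)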

Feature Importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune, metric = "rmse")) %>%
  set_engine("ranger", importance = "permutation")
workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(ikea_train) %>%
  extract_fit_parsnip() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

1. Question and Data:

- What is the research question? Clearly state the research question you aim to address using the new dataset.
- Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions.
- What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics.

1. Answer

The question is whether we can use the IKEA dataset to train a model that predicts the price of a piece of furniture from parameters such as height, width, category, and a few others.

The dataset consists of 3694 observations of 14 variables. Based on the feature importance test above, the five most important variables for predicting price are width, depth, height, category, and name.

The top three variables together describe the size of a furniture piece, so it makes sense that they predict price well: larger pieces require more material, and shipping them to an IKEA warehouse costs more, both of which push the sale price up.

2. Data Exploration and Transformation:

- Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial.

2. Answer

The transformed data contains no NA values because we imputed them. This was chosen over excluding observations with one or more NAs, since dropping them would have produced a much smaller dataset and a loss of valuable data. We also kept only the variables that seemed most relevant for training and testing, cutting the original 14 variables down to 6 after transforming the data for modeling. This makes the whole process more efficient, and variables such as the product link or whether an item is sellable online are unlikely to add anything useful to our predictions.
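
As a quick check (a sketch using the raw data imported above), counting the missing values in the dimension columns shows why dropping incomplete rows would have been costly:

ikea %>%
  # count NAs in each raw dimension column
  summarise(across(c(depth, height, width), ~ sum(is.na(.x))))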

3. Data Preparation and Modeling:

- What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset.
- What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question.

3. Answer

Data Preparation Steps

Initial Data Splitting: I split the data into a training set and a test set using the initial_split() function, setting the strata argument to ‘price’.

Relevance: It’s essential to have separate training and test datasets to properly evaluate model performance.
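
A quick verification (sketch): printing the rsample split object shows the training/testing/total row counts, consistent with the 2770/924 split that appears in the last_fit() results above.

ikea_split
## <Training/Testing/Total>
## <2770/924/3694>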

Feature Selection: I selected relevant features like ‘price’, ‘name’, ‘category’, ‘depth’, ‘height’, and ‘width’.

Relevance: Unnecessary features can reduce model performance and interpretability.

Data Transformation: I applied logarithmic transformation to the ‘price’ variable using log10.

Relevance: Log transformations can help linearize relationships and manage skewed distributions.
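
A quick way to see the skew (a sketch with the raw data): plotting price on a log10 axis spreads out the long right tail:

ikea %>%
  ggplot(aes(price)) +
  geom_histogram(bins = 30) +
  # a log10 axis mirrors the log10(price) transform used for modeling
  scale_x_log10()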

Converting Characters to Factors: I used mutate_if to convert character columns to factors.

Relevance: Categorical data in character format needs to be converted into a factor for many machine learning algorithms to process it effectively.
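
mutate_if() has since been superseded in dplyr; an equivalent sketch with the newer across() idiom gives the same result:

ikea_df <- ikea %>%
  select(price, name, category, depth, height, width) %>%
  mutate(price = log10(price)) %>%
  # convert every character column to a factor
  mutate(across(where(is.character), factor))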

Data Imputation: I used step_impute_knn to impute missing values for ‘depth’, ‘height’, and ‘width’.

Relevance: Missing values can cause errors or reduce the performance of some machine learning algorithms.
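
To confirm that the imputation leaves no missing values, the recipe can be prepped and baked (a sketch using ranger_recipe as defined above):

ranger_recipe %>%
  prep() %>%
  # new_data = NULL returns the processed training set
  bake(new_data = NULL) %>%
  summarise(across(c(depth, height, width), ~ sum(is.na(.x))))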

Level Aggregation: I applied step_other to aggregate rare levels in ‘name’ and ‘category’ into a common “other” level.

Relevance: Rare levels can lead to overfitting.

Level Cleaning: I used step_clean_levels to clean up factor levels.

Relevance: Cleaning up unused levels can make the dataset cleaner and more efficient to process.
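
The effect of the last two steps can be inspected the same way (a sketch): after prepping, rare categories have been pooled into an "other" level and the remaining level names cleaned:

ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  # levels kept by step_other(), plus the pooled "other" level
  count(category, sort = TRUE)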

4. Model Evaluation:

- What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question.

4. Answer

Metrics Used for Model Evaluation:

The metrics used for evaluating the machine learning model in my analysis are RMSE (Root Mean Square Error) and R-squared.

RMSE (Root Mean Square Error): This is a measure of the differences between the actual and predicted values; the goal is to minimize it. In my analysis, the RMSE for the test set was approximately 0.318, indicating the average difference between observed outcomes and the model’s predictions. Note that because price was modeled on the log10 scale, this RMSE is in log10 units.
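
Because the RMSE is in log10 units, it can also be read as a multiplicative error factor (a quick back-of-the-envelope check):

# an RMSE of 0.318 on the log10 scale means a typical prediction is
# off by a factor of about 10^0.318, roughly 2.08
10^0.318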

R-squared: This metric represents the proportion of the variance for the dependent variable that’s explained by the independent variables in the model. In this case, R-squared was approximately 0.752 for the test set, indicating that about 75.2% of the variance in the dependent variable is accounted for, which is relatively high.

Significance of Metrics in Context of the research question:

RMSE: A lower RMSE suggests a better fit of the model to the data, which means the model will be more reliable for making future predictions. A lower RMSE might indicate a successful model in predicting the price based on the dimensions and other features of the IKEA products.

R-squared: A higher value typically indicates that the model explains a large portion of the variance, which could be beneficial when trying to understand which features are most influential in predicting the price. A higher R-squared can suggest that I’ve accounted for many of the factors that contribute to the variation in price, although it doesn’t necessarily imply causation.

Both metrics together give a comprehensive view of the model’s performance, allowing me to assess both the accuracy and the goodness of fit of the model to the data. This is crucial for making reliable predictions and interpretations aligned with the research question.
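
Both metrics can also be recomputed directly from the test-set predictions with yardstick (a sketch; for a numeric outcome, metrics() returns rmse, rsq, and mae):

collect_predictions(ikea_fit) %>%
  # truth is the observed log10 price, estimate is the model's prediction
  metrics(truth = price, estimate = .pred)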

5. Conclusion:

- What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment.

5. Answer

Major Findings:

Feature Importance: Among the product dimensions and other features considered, the product dimensions had the most significant impact on price, especially width. These variables ranked highest in feature importance.

Model Performance: The predictive model achieved an R-squared value of 0.752 and an RMSE of 0.318 on the test set, which suggests it’s relatively reliable in estimating the product prices based on the features considered. However, it seemed to perform better on the items with higher price tags and did have a tendency to overestimate the price on some items with lower price tags.