Explore data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the TidyTuesday IKEA furniture data (2020-11-03) directly from GitHub.
ikea <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv"
)
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Scatter price against each dimension (depth, height, width), one panel per
# dimension, with price drawn on a log10 axis.
ikea %>%
  select(...1, price, depth:width) %>%
  pivot_longer(depth:width, names_to = "dim") %>%
  ggplot(aes(x = value, y = price, color = dim)) +
  geom_point(alpha = 0.4, show.legend = FALSE) +
  scale_y_log10() +
  facet_wrap(vars(dim), scales = "free_x") +
  labs(x = NULL)
## Warning: Removed 3040 rows containing missing values (`geom_point()`).

# Modeling data set: keep the outcome and the candidate predictors, move
# price to the log10 scale, and turn every character column into a factor.
ikea_df <- ikea %>%
  select(price, name, category, depth, height, width) %>%
  mutate(
    price = log10(price),
    # across() + where() is the current replacement for the superseded
    # mutate_if(); it applies factor() to each character column.
    across(where(is.character), factor)
  )

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
# Split into training and test sets, stratifying on (log10) price.
set.seed(123)
ikea_split <- ikea_df %>% initial_split(strata = price)
ikea_train <- ikea_split %>% training()
ikea_test <- ikea_split %>% testing()

# Stratified bootstrap resamples of the training set, used for tuning.
set.seed(234)
ikea_folds <- ikea_train %>% bootstraps(strata = price)
ikea_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits              id         
##    <list>              <chr>      
##  1 <split [2770/994]>  Bootstrap01
##  2 <split [2770/1003]> Bootstrap02
##  3 <split [2770/1037]> Bootstrap03
##  4 <split [2770/1010]> Bootstrap04
##  5 <split [2770/1014]> Bootstrap05
##  6 <split [2770/1007]> Bootstrap06
##  7 <split [2770/1036]> Bootstrap07
##  8 <split [2770/1016]> Bootstrap08
##  9 <split [2770/1021]> Bootstrap09
## 10 <split [2770/1043]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)

# Print a ranger recipe/spec/workflow template to adapt for this data.
use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <- 
##   recipe(formula = price ~ ., data = ikea_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("regression") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(67013)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)

# Preprocessing for the random forest:
#   1. pool name/category levels below the 0.01 frequency threshold,
#   2. clean the remaining factor level names,
#   3. impute missing depth/height/width values with k-nearest neighbors.
ranger_recipe <- recipe(price ~ ., data = ikea_train) %>%
  step_other(name, category, threshold = 0.01) %>%
  step_clean_levels(name, category) %>%
  step_impute_knn(depth, height, width)

# Regression random forest with 1000 trees; mtry and min_n are left as
# tune() placeholders to be chosen by resampling.
ranger_spec <- rand_forest(
  mode = "regression",
  mtry = tune(),
  min_n = tune(),
  trees = 1000
) %>%
  set_engine("ranger")

# Bundle the preprocessing recipe and the model specification together.
ranger_workflow <- workflow(
  preprocessor = ranger_recipe,
  spec = ranger_spec
)

# Tune mtry and min_n over 11 candidate points, evaluating each candidate on
# the bootstrap resamples with a registered parallel backend.
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <- ranger_workflow %>%
  tune_grid(resamples = ikea_folds, grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry

Explore results

# Top candidates ranked by resampled RMSE.
ranger_tune %>% show_best(metric = "rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rmse    standard   0.340    25 0.00202 Preprocessor1_Model10
## 2     4    10 rmse    standard   0.348    25 0.00229 Preprocessor1_Model05
## 3     5     6 rmse    standard   0.349    25 0.00233 Preprocessor1_Model06
## 4     3    18 rmse    standard   0.350    25 0.00219 Preprocessor1_Model01
## 5     2    21 rmse    standard   0.352    25 0.00198 Preprocessor1_Model08
# Top candidates ranked by resampled R-squared.
ranger_tune %>% show_best(metric = "rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rsq     standard   0.726    25 0.00333 Preprocessor1_Model10
## 2     4    10 rsq     standard   0.713    25 0.00379 Preprocessor1_Model05
## 3     5     6 rsq     standard   0.711    25 0.00385 Preprocessor1_Model06
## 4     3    18 rsq     standard   0.709    25 0.00369 Preprocessor1_Model01
## 5     2    21 rsq     standard   0.707    25 0.00349 Preprocessor1_Model08
# Visualize how the metrics vary across the tuned parameters.
ranger_tune %>% autoplot()

# Finalize the workflow with the candidate that has the lowest resampled
# RMSE. The metric is named explicitly so select_best() does not have to
# guess (and warn about) which metric to optimize.
final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_other()
## • step_clean_levels()
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 4
## 
## Computational engine: ranger
# Fit the finalized workflow on the training set and evaluate it once on the
# held-out test set.
ikea_fit <- final_rf %>% last_fit(split = ikea_split)
ikea_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [2770/924]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Test-set performance (RMSE and R-squared on the log10 price scale).
ikea_fit %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.318 Preprocessor1_Model1
## 2 rsq     standard       0.752 Preprocessor1_Model1
# Observed vs. predicted log10 price on the test set; points on the dashed
# identity line are perfect predictions.
ikea_fit %>%
  collect_predictions() %>%
  ggplot(aes(x = price, y = .pred)) +
  geom_abline(linetype = "dashed", color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

# Predict log10 price for a single test-set row. extract_workflow() is the
# supported accessor for the fitted workflow stored in a last_fit() result,
# replacing direct indexing into the `.workflow` list-column.
predict(extract_workflow(ikea_fit), new_data = ikea_test[15, ])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  2.41
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Model spec for variable importance: same tuned hyperparameters (lowest
# resampled RMSE, named explicitly to avoid the missing-`metric` warning),
# with the ranger engine asked to compute permutation importance.
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune, metric = "rmse")) %>%
  set_engine("ranger", importance = "permutation")
# Refit on the training set with the importance-enabled spec and plot the
# variable importance scores from the underlying parsnip fit.
workflow(preprocessor = ranger_recipe, spec = imp_spec) %>%
  fit(data = ikea_train) %>%
  extract_fit_parsnip() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

Questions

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is: How do the different measurements (depth, width, height) and other variables of IKEA products influence their price?

    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The dataset has 14 columns and 3,694 entries. The primary characteristics that I used in the code were the product dimensions (depth, height, and width). The dataset has information regarding IKEA products, which includes their names, dimensions, prices, and the category they fall into.

    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The key variables used in the analysis are the depth, width, and height. Those variables were used in the analysis to predict prices of the products.

  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original dataset was called ikea and had variables such as the names of products, product prices, the product categories, the dimensions of the products, etc. The analysis was done to find a potential relationship between these characteristics. The transformed dataset was called ikea_df and had some transformations. One transformation was using the log10(price) function to help deal with the skewed distribution of price in the original dataset. This improved the model’s ability to predict the price of the product. Factor levels were also cleaned, which is a difference between the datasets. We used step_other to pool rare factor levels and step_clean_levels to clean up the factor level names, which helps avoid factor levels in the test set that were not seen in the training set. We also used step_impute_knn to fill in missing values in the dimension columns (depth, width, height) to make sure the dataset was complete. All of these changes were important to be sure that the dataset was strong for making an accurate predictive model.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The names of the data preparation steps mentioned in the video are handling missing values, transforming data, data splitting, resampling, and data cleaning.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model used in the analysis is random forest. Random forest can find relationships in the data and can also handle missing data. This machine learning model is helpful when trying to understand which factors are most important in predicting furniture prices.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used in the model evaluation are root mean square error and r-squared. Root mean square error measures the typical size of the prediction errors. The lower the root mean square error, the closer the model’s predictions are to the actual values. This means that the lower the root mean square error on the price of IKEA furniture, the better the model is at predicting the price. R-squared measures the proportion of variance in the dependent variable, which is price, that is predictable from the independent variables, which are characteristics like width. The range is 0-1. The closer to 0 the value is, the less the model explains about the variability. The significance of these metrics in the context of the research question is to know how accurately the model predicts the price based on characteristics and how much of the variance it explains. Root mean square error gives the accuracy of the prediction, and r-squared measures the model’s ability to explain the variance in furniture prices based on features. Used together, they give a clearer view of the model’s predictive performance.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset.

Of all of the product variables, the dimensions of the products, especially the width, seemed to have the biggest influence on the item’s price. Based on the root mean square error and r-squared values, the model is pretty accurate when predicting the products’ prices based on product dimensions. The root mean square error was 0.318, and we know that the lower the root mean square error, the better the model is at predicting. The r-squared value was 0.752, and we know that the closer the value is to 1, the more the model explains about variability.