library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ikea <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv")
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
ikea %>%
select(...1, price, depth:width) %>%
pivot_longer(depth:width, names_to = "dim") %>%
ggplot(aes(value, price, color = dim)) +
geom_point(alpha = 0.4, show.legend = FALSE) +
scale_y_log10() +
facet_wrap(~ dim, scales = "free_x") + labs(x = NULL)
## Warning: Removed 3040 rows containing missing values (`geom_point()`).
ikea_df <- ikea %>%
select(price, name, category, depth, height, width) %>%
mutate(price = log10(price)) %>%
mutate_if(is.character, factor)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
ikea_split <- initial_split(ikea_df, strata = price)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)
set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price)
ikea_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [2770/994]> Bootstrap01
## 2 <split [2770/1003]> Bootstrap02
## 3 <split [2770/1037]> Bootstrap03
## 4 <split [2770/1010]> Bootstrap04
## 5 <split [2770/1014]> Bootstrap05
## 6 <split [2770/1007]> Bootstrap06
## 7 <split [2770/1036]> Bootstrap07
## 8 <split [2770/1016]> Bootstrap08
## 9 <split [2770/1021]> Bootstrap09
## 10 <split [2770/1043]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_ranger(price ~., data = ikea_train)
## ranger_recipe <-
## recipe(formula = price ~ ., data = ikea_train)
##
## ranger_spec <-
## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
## set_mode("classification") %>%
## set_engine("ranger")
##
## ranger_workflow <-
## workflow() %>%
## add_recipe(ranger_recipe) %>%
## add_model(ranger_spec)
##
## set.seed(67013)
## ranger_tune <-
## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <-
recipe(formula = price ~ ., data = ikea_train) %>%
step_other(name, category, threshold = 0.01) %>%
step_clean_levels(name, category) %>%
step_impute_knn(depth, height, width)
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
tune_grid(ranger_workflow,
resamples = ikea_folds,
grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry
show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 4 rmse standard 0.340 25 0.00202 Preprocessor1_Model10
## 2 4 10 rmse standard 0.348 25 0.00229 Preprocessor1_Model05
## 3 5 6 rmse standard 0.349 25 0.00233 Preprocessor1_Model06
## 4 3 18 rmse standard 0.350 25 0.00219 Preprocessor1_Model01
## 5 2 21 rmse standard 0.352 25 0.00198 Preprocessor1_Model08
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 4 rsq standard 0.726 25 0.00333 Preprocessor1_Model10
## 2 4 10 rsq standard 0.713 25 0.00379 Preprocessor1_Model05
## 3 5 6 rsq standard 0.711 25 0.00385 Preprocessor1_Model06
## 4 3 18 rsq standard 0.709 25 0.00369 Preprocessor1_Model01
## 5 2 21 rsq standard 0.707 25 0.00349 Preprocessor1_Model08
autoplot(ranger_tune)
final_rf <- ranger_workflow %>%
finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_other()
## • step_clean_levels()
## • step_impute_knn()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 2
## trees = 1000
## min_n = 4
##
## Computational engine: ranger
ikea_fit <- last_fit(final_rf, ikea_split)
ikea_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [2770/924]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(ikea_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.318 Preprocessor1_Model1
## 2 rsq standard 0.752 Preprocessor1_Model1
collect_predictions(ikea_fit) %>%
ggplot(aes(price, .pred)) +
geom_abline(lty = 2, color = "gray50") +
geom_point(alpha = 0.5, color = "midnightblue") +
coord_fixed()
predict(ikea_fit$.workflow[[1]], ikea_test[15, ])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 2.41
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_spec <- ranger_spec %>%
finalize_model(select_best(ranger_tune)) %>%
set_engine("ranger", importance = "permutation")
## Warning: No value of `metric` was given; metric 'rmse' will be used.
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(imp_spec) %>%
fit(ikea_train) %>%
extract_fit_parsnip() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is: How do the different measurments (depth, width, height) and other variables of IKEA products influence their price?
Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The dataset as 14 columns, 3,694 entries. The primary characteristics that I used in the code were width. The dataset has information regarding IKEA products which includes their names, dimensions, prices, and the category they fall into.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The key variables used in the analysis are the depth, width, and height. Those variables were used in the analysis to predict prices of the products.
Of all of product variables the dimensions of the products, especially the width, seemed to have the biggest influence on the items price. Based off of the root mean square error and r-squared values the model is pretty accurate when predicting the products prices based off of product dimensions. The root mean square error was 0.318, and we know that the lower the root mean square error, the better the model is at predicting. The r-squared value was 0.752 and we know that the closer the value is to 1 the more the model explains about variability.