Inventory
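A quick look at how many listings each model has. The original plot is summarized here as a minimal sketch, assuming the `cars` data frame used throughout this post contains one row per listing:

cars %>%
  count(model, sort = TRUE) %>%
  mutate(model = fct_reorder(model, n)) %>%
  ggplot(aes(n, model)) +
  geom_col(alpha = 0.42, fill = "#FF6666", color = "black") +
  labs(x = "Number of Listings", y = NULL)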


Price Distribution
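A minimal sketch of the price histogram; the log scale is an assumption, suggested by the log transform applied before modeling below:

cars %>%
  ggplot(aes(price)) +
  geom_histogram(bins = 30, alpha = 0.42, fill = "#FF6666", color = "black") +
  scale_x_log10(labels = scales::dollar_format()) +
  labs(x = "Price", y = "Count")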


Price Variability by Model
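A sketch of the per-model price spread using boxplots; ordering the models by median price is an assumption:

cars %>%
  mutate(model = fct_reorder(model, price)) %>%
  ggplot(aes(price, model)) +
  geom_boxplot(alpha = 0.42) +
  scale_x_log10(labels = scales::dollar_format()) +
  labs(x = "Price", y = NULL)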

Price by Fuel Type and Year
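A sketch of median price per year, split by fuel type. The `fuelType` column is assumed to exist in `cars` (it is not one of the model predictors below):

cars %>%
  group_by(fuelType, year) %>%
  summarise(price = median(price), .groups = "drop") %>%
  ggplot(aes(year, price, color = fuelType)) +
  geom_line() +
  geom_point() +
  scale_y_log10(labels = scales::dollar_format()) +
  labs(x = "Year", y = "Median Price", color = "Fuel Type")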

MPG by Model
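A sketch of fuel economy by model; the `mpg` column is assumed:

cars %>%
  mutate(model = fct_reorder(model, mpg)) %>%
  ggplot(aes(mpg, model)) +
  geom_boxplot(alpha = 0.42) +
  labs(x = "MPG", y = NULL)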

Bagged Decision Tree Model
set.seed(123)
# Log-transform price, then create a stratified train/test split
split <- cars %>%
  mutate(price = log(price)) %>%
  initial_split(strata = price)
train <- training(split)
test <- testing(split)

set.seed(456)
# Five cross-validation folds, stratified on the outcome
folds <- vfold_cv(train, v = 5, strata = price)
# Tokenize the model name so individual words can serve as predictors
car_rec <- recipe(price ~ year + engineSize + mileage + model, data = train) %>%
  textrecipes::step_tokenize(model) %>%
  textrecipes::step_stopwords(model) %>%
  textrecipes::step_tokenfilter(model, max_tokens = 10) %>%
  textrecipes::step_tf(model)

prep(car_rec)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          4
## 
## Training data contained 5052 data points and no missing data.
## 
## Operations:
## 
## Tokenization for model [trained]
## Stop word removal for model [trained]
## Text filtering for model [trained]
## Term frequency with model [trained]
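As a quick sanity check, the prepped recipe can be baked on the training data to confirm the text steps produced tf_model_* term-frequency columns; a minimal sketch, not shown in the original output:

prep(car_rec) %>%
  bake(new_data = NULL) %>%
  glimpse()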
# Bagged CART: 24 bootstrap-resampled rpart trees, averaged
bag_spec <- bag_tree(mode = "regression", min_n = 10) %>%
  set_engine("rpart", times = 24)

work <- workflow() %>%
  add_recipe(car_rec) %>%
  add_model(bag_spec)

set.seed(777)
car_fit <- fit(work, train)

set.seed(456)
# Estimate performance across the resamples
frs <- fit_resamples(work, folds)
collect_metrics(frs)
## # A tibble: 2 x 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.125     5 0.00338 Preprocessor1_Model1
## 2 rsq     standard   0.930     5 0.00323 Preprocessor1_Model1
Both metrics are on the log(price) scale, since the outcome was log-transformed before splitting.

Predict on Test Set
pred <- augment(car_fit, test)

# Back-transform predictions and truth from log scale to dollars
pred %>%
  ggplot(aes(exp(price), exp(.pred))) +
  geom_point(alpha = 0.42) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_log10(labels = scales::dollar_format()) +
  scale_y_log10(labels = scales::dollar_format()) +
  labs(x = "Price", y = "Predicted Price")
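For a numeric summary to go with the plot, yardstick's metrics() can score the test predictions on the log scale the model was fit on; a minimal sketch:

pred %>%
  metrics(truth = price, estimate = .pred)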

Variable Importance
final_fit <- last_fit(work, split)

# Pull the fitted parsnip model out of the final workflow
important <- final_fit$.workflow[[1]] %>% extract_fit_parsnip()

# baguette stores per-term importance scores in fit$imp
important$fit$imp %>%
  slice_max(value, n = 10) %>%
  ggplot(aes(value, fct_reorder(term, value))) +
  geom_col(alpha = 0.42, fill = "#FF6666", color = "black") +
  labs(x = "Variable Importance Score", y = NULL)
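Since last_fit() also scores the held-out test set, its metrics offer one more check against the resampling estimates above; a minimal sketch:

collect_metrics(final_fit)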
