Exploratory Data Analysis

## 
## Call:
## lm(formula = price ~ engineSize + year, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13699.2  -1877.5   -346.7   1277.1  27880.7 
## 
## Coefficients:
##                Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept) -2645463.15    36390.61  -72.70 <0.0000000000000002 ***
## engineSize     10886.30       91.17  119.41 <0.0000000000000002 ***
## year            1310.01       18.04   72.61 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3260 on 6735 degrees of freedom
## Multiple R-squared:  0.736,  Adjusted R-squared:  0.736 
## F-statistic:  9390 on 2 and 6735 DF,  p-value: < 0.00000000000000022

Inventory

Price Distribution

Variability Price by Model

Price by fuel type with year

MPG by Model

Bagged Decision Tree Model

set.seed(123)
split <- cars %>% mutate(price = log(price)) %>% initial_split(strata = price)
train <- training(split)
test <- testing(split)

set.seed(456)
folds <- vfold_cv(train, v = 5, strata = price)

recipe <- recipe(price ~ year + engineSize + mileage + model, train) %>%
    textrecipes::step_tokenize(model) %>%
    textrecipes::step_stopwords(model) %>%
    textrecipes::step_tokenfilter(model, max_tokens = 10) %>%
    textrecipes::step_tf(model)

prep(recipe)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          4
## 
## Training data contained 5052 data points and no missing data.
## 
## Operations:
## 
## Tokenization for model [trained]
## Stop word removal for model [trained]
## Text filtering for model [trained]
## Term frequency with model [trained]
model <- bag_tree(mode = 'regression', min_n = 10) %>%
    set_engine('rpart', times = 24)

work <- workflow() %>% add_recipe(recipe) %>% add_model(model)

set.seed(777)
fit <- fit(work, train)

set.seed(456)
frs <- fit_resamples(work, folds)
collect_metrics(frs)
## # A tibble: 2 x 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.125     5 0.00338 Preprocessor1_Model1
## 2 rsq     standard   0.930     5 0.00323 Preprocessor1_Model1

Predict on test set

pred <- augment(fit, test)

pred %>%
    ggplot(aes(exp(price), exp(.pred))) + 
    geom_point(alpha = 0.42) +
    geom_smooth(method = 'lm', formula = 'y ~ x', se = FALSE) +
    scale_x_log10(labels = scales::dollar_format()) +
    scale_y_log10(labels = scales::dollar_format()) +
    labs(x = 'Price', y = 'Predicted Price')

Variable Importance

last_fit <- last_fit(work, split)

important <- last_fit$.workflow[[1]] %>% extract_fit_parsnip()

important$fit$imp %>%                               
slice_max(value, n = 10) %>%
    ggplot(aes(value, fct_reorder(term, value)), fill = term)+
    geom_col(alpha = 0.42, fill = "#FF6666", color = "black")+
    labs(x = "Variable Importance Score", y = NULL)