airbnb

Model Bagged Decision Tree

# Correlogram of the pairwise correlations between the numeric columns of
# `air`. `select(where(is.numeric))` replaces the superseded `select_if()`.
air %>%
    select(where(is.numeric)) %>%
    cor() %>%
    corrplot::corrplot()

# Log-transform price, then split into training and testing sets, stratifying
# on the (logged) price. The seed makes the partition reproducible.
set.seed(123)
split <- initial_split(mutate(air, price = log(price)), strata = price)
train <- split %>% training()
test <- split %>% testing()

# Five cross-validation folds over the training set, stratified by price.
set.seed(456)
folds <- train %>% vfold_cv(v = 5, strata = price)

# Preprocessing recipe: price as the outcome with four predictors and no
# additional steps.
recipe <- recipe(
    price ~ city + bedrooms + accommodates + room_type,
    data = train
)

# Estimate the recipe on its training data and print the summary.
recipe %>% prep()

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          4
## 
## Training data contained 187041 data points and no missing data.

# Bagged decision tree specification (regression, min_n = 10) on the rpart
# engine with 25 bagged members.
model <- set_engine(
    baguette::bag_tree(mode = 'regression', min_n = 10),
    'rpart',
    times = 25
)

# Bundle the recipe and the model specification into one workflow.
work <- workflow() %>%
    add_recipe(recipe) %>%
    add_model(model)

# Register a doParallel backend with 6 cores, then fit the workflow on the
# training set under a fixed seed and print the trained workflow.
doParallel::registerDoParallel(cores = 6)
set.seed(777)
fit <- work %>% fit(data = train)
fit

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: bag_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 0 Recipe Steps
## 
## -- Model -----------------------------------------------------------------------
## Bagged CART (regression with 25 members)
## 
## Variable importance scores include:
## 
## # A tibble: 4 x 4
##   term          value std.error  used
##   <chr>         <dbl>     <dbl> <int>
## 1 city         81556.      95.4    25
## 2 bedrooms     32399.     111.     25
## 3 accommodates 25270.     100.     25
## 4 room_type    11025.     129.     25

# Evaluate the bagged workflow with cross-validation ----
# Fit the workflow on every fold and print the metrics averaged across folds.
doParallel::registerDoParallel(cores = 6)
set.seed(456)
frs <- work %>% fit_resamples(resamples = folds)
frs %>% collect_metrics()

## # A tibble: 2 x 6
##   .metric .estimator  mean     n  std_err .config             
##   <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   0.621     5 0.000742 Preprocessor1_Model1
## 2 rsq     standard   0.637     5 0.00161  Preprocessor1_Model1

# Predict on the test set ----
# Append the fitted workflow's predictions (`.pred`) to the test rows.
pred <- fit %>% augment(new_data = test)

# Observed vs. predicted price. Both are exponentiated before plotting and
# shown on log10 axes with dollar labels, plus a straight-line lm fit.
ggplot(pred, aes(x = exp(price), y = exp(.pred))) +
    geom_point(color = 'chartreuse', alpha = 0.42) +
    geom_smooth(method = 'lm', formula = 'y ~ x', se = FALSE) +
    scale_x_log10(labels = scales::dollar_format()) +
    scale_y_log10(labels = scales::dollar_format()) +
    labs(x = 'Price', y = 'Predicted Price')

# variable importance ----
# Final fit of the workflow using the original train/test split object.
last_fit <- last_fit(work, split)

# Extract the underlying parsnip model fit from the fitted workflow.
# `extract_fit_parsnip()` replaces `pull_workflow_fit()`, which was deprecated
# in workflows 0.2.3 and emits a deprecation warning.
important <- last_fit$.workflow[[1]] %>% extract_fit_parsnip()

## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

# Horizontal bar chart of the (up to) ten largest variable-importance scores,
# with terms ordered by score. The stray `fill = term` that was passed to
# `ggplot()` outside `aes()` is dropped: it was never used as an aesthetic,
# and the bars take the constant fill set in `geom_col()`.
important$fit$imp %>%
    slice_max(value, n = 10) %>%
    ggplot(aes(value, fct_reorder(term, value))) +
    geom_col(alpha = 0.42, fill = "#FF6666", color = "black") +
    labs(x = "Variable Importance Score", y = NULL)

airbnb

Jeff Craggy

8/15/2021

Bangkok Price < \$1,000.00

Bangkok Price > \$1,000.00

World Map (zoom in and click a circle to see the name and price)

Model Bagged Decision Tree