## * __  _    __   .    o           *  . 
##  / /_(_)__/ /_ ___  _____ _______ ___ 
## / __/ / _  / // / |/ / -_) __(_-</ -_)
## \__/_/\_,_/\_, /|___/\__/_/ /___/\__/ 
##      *  . /___/      o      .       *
## Rows: 249,390
## Columns: 20
## $ name                      <chr> "Beautiful Flat in le Village Montmartre, Pa~
## $ host_since                <date> 2011-12-03, 2013-11-29, 2014-07-31, 2013-12~
## $ host_location             <chr> "Paris, Ile-de-France, France", "Paris, Ile-~
## $ host_is_superhost         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
## $ host_total_listings_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ host_has_profile_pic      <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR~
## $ host_identity_verified    <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE,~
## $ neighbourhood             <chr> "Buttes-Montmartre", "Buttes-Montmartre", "E~
## $ city                      <chr> "Paris", "Paris", "Paris", "Paris", "Paris",~
## $ latitude                  <dbl> 48.88668, 48.88617, 48.88112, 48.84571, 48.8~
## $ longitude                 <dbl> 2.33343, 2.34515, 2.31712, 2.30584, 2.26979,~
## $ property_type             <chr> "Entire apartment", "Entire apartment", "Ent~
## $ room_type                 <chr> "Entire place", "Entire place", "Entire plac~
## $ accommodates              <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ bedrooms                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ amenities                 <chr> "[\"Heating\", \"Kitchen\", \"Washer\", \"Wi~
## $ price                     <dbl> 63.09524, 142.85714, 105.95238, 69.04762, 71~
## $ minimum_nights            <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ maximum_nights            <dbl> 1125, 1125, 1125, 1125, 1125, 1125, 1125, 11~
## $ instant_bookable          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
##  [1] "Paris"          "New York"       "Bangkok"        "Rio de Janeiro"
##  [5] "Sydney"         "Istanbul"       "Rome"           "Hong Kong"     
##  [9] "Mexico City"    "Cape Town"

Bangkok Price < \$1,000.00

Bangkok Price > \$1,000.00

World Map (zoom in and click circle for name and price)

Model Bagged Decision Tree

corrplot::corrplot(cor(air %>% select_if(is.numeric)))

set.seed(123)
split <- air %>% mutate(price = log(price)) %>% initial_split(strata = price)
train <- training(split)
test <- testing(split)

set.seed(456)
folds <- vfold_cv(train, strata = price, v = 5)

recipe <- recipe(price ~ city + bedrooms + accommodates + room_type,  data = train)

prep(recipe)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          4
## 
## Training data contained 187041 data points and no missing data.
model <- baguette::bag_tree(mode = 'regression', min_n = 10) %>%
    set_engine('rpart', times = 25)

work <- workflow() %>% add_recipe(recipe) %>% add_model(model)

doParallel::registerDoParallel(cores = 6)
set.seed(777)
fit <- fit(work, data = train)
fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: bag_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 0 Recipe Steps
## 
## -- Model -----------------------------------------------------------------------
## Bagged CART (regression with 25 members)
## 
## Variable importance scores include:
## 
## # A tibble: 4 x 4
##   term          value std.error  used
##   <chr>         <dbl>     <dbl> <int>
## 1 city         81556.      95.4    25
## 2 bedrooms     32399.     111.     25
## 3 accommodates 25270.     100.     25
## 4 room_type    11025.     129.     25
# eval bagged training model ---------------------------------------------------------------------------------
doParallel::registerDoParallel(cores = 6)
set.seed(456)
frs <- fit_resamples(work, folds)
collect_metrics(frs)
## # A tibble: 2 x 6
##   .metric .estimator  mean     n  std_err .config             
##   <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   0.621     5 0.000742 Preprocessor1_Model1
## 2 rsq     standard   0.637     5 0.00161  Preprocessor1_Model1
# predict on test --------------------------------------------------------------------------------------------
pred <- augment(fit, test)
pred %>%                                                                                   
    ggplot(aes(exp(price), exp(.pred))) +                         
    geom_point(alpha = 0.42, color = 'chartreuse') +
    geom_smooth(method = 'lm', formula = 'y ~ x', se = FALSE) +
    scale_x_log10(labels = scales::dollar_format()) +
    scale_y_log10(labels = scales::dollar_format()) +
    labs(x = 'Price', y = 'Predicted Price')

# variable importance ---------------------------------------------------------------------------------------
last_fit <- last_fit(work, split)

important <- last_fit$.workflow[[1]] %>% pull_workflow_fit() # extract_fit_parsnip()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
important$fit$imp %>%                                            
slice_max(value, n = 10) %>%
    ggplot(aes(value, fct_reorder(term, value)), fill = term)+
    geom_col(alpha = 0.42, fill = "#FF6666", color = "black")+
    labs(x = "Variable Importance Score", y = NULL)