## * __ _ __ . o * .
## / /_(_)__/ /_ ___ _____ _______ ___
## / __/ / _ / // / |/ / -_) __(_-</ -_)
## \__/_/\_,_/\_, /|___/\__/_/ /___/\__/
## * . /___/ o . *
## Rows: 249,390
## Columns: 20
## $ name <chr> "Beautiful Flat in le Village Montmartre, Pa~
## $ host_since <date> 2011-12-03, 2013-11-29, 2014-07-31, 2013-12~
## $ host_location <chr> "Paris, Ile-de-France, France", "Paris, Ile-~
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
## $ host_total_listings_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR~
## $ host_identity_verified <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE,~
## $ neighbourhood <chr> "Buttes-Montmartre", "Buttes-Montmartre", "E~
## $ city <chr> "Paris", "Paris", "Paris", "Paris", "Paris",~
## $ latitude <dbl> 48.88668, 48.88617, 48.88112, 48.84571, 48.8~
## $ longitude <dbl> 2.33343, 2.34515, 2.31712, 2.30584, 2.26979,~
## $ property_type <chr> "Entire apartment", "Entire apartment", "Ent~
## $ room_type <chr> "Entire place", "Entire place", "Entire plac~
## $ accommodates <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ bedrooms <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ amenities <chr> "[\"Heating\", \"Kitchen\", \"Washer\", \"Wi~
## $ price <dbl> 63.09524, 142.85714, 105.95238, 69.04762, 71~
## $ minimum_nights <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ maximum_nights <dbl> 1125, 1125, 1125, 1125, 1125, 1125, 1125, 11~
## $ instant_bookable <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
## [1] "Paris" "New York" "Bangkok" "Rio de Janeiro"
## [5] "Sydney" "Istanbul" "Rome" "Hong Kong"
## [9] "Mexico City" "Cape Town"



Bangkok Price < \$1,000.00
Bangkok Price > \$1,000.00

World Map (zoom in and click circle for name and price)
Model Bagged Decision Tree
corrplot::corrplot(cor(air %>% select_if(is.numeric)))

set.seed(123)
split <- air %>% mutate(price = log(price)) %>% initial_split(strata = price)
train <- training(split)
test <- testing(split)
set.seed(456)
folds <- vfold_cv(train, strata = price, v = 5)
recipe <- recipe(price ~ city + bedrooms + accommodates + room_type, data = train)
prep(recipe)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 4
##
## Training data contained 187041 data points and no missing data.
model <- baguette::bag_tree(mode = 'regression', min_n = 10) %>%
set_engine('rpart', times = 25)
work <- workflow() %>% add_recipe(recipe) %>% add_model(model)
doParallel::registerDoParallel(cores = 6)
set.seed(777)
fit <- fit(work, data = train)
fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: bag_tree()
##
## -- Preprocessor ----------------------------------------------------------------
## 0 Recipe Steps
##
## -- Model -----------------------------------------------------------------------
## Bagged CART (regression with 25 members)
##
## Variable importance scores include:
##
## # A tibble: 4 x 4
## term value std.error used
## <chr> <dbl> <dbl> <int>
## 1 city 81556. 95.4 25
## 2 bedrooms 32399. 111. 25
## 3 accommodates 25270. 100. 25
## 4 room_type 11025. 129. 25
# eval bagged training model ---------------------------------------------------------------------------------
doParallel::registerDoParallel(cores = 6)
set.seed(456)
frs <- fit_resamples(work, folds)
collect_metrics(frs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.621 5 0.000742 Preprocessor1_Model1
## 2 rsq standard 0.637 5 0.00161 Preprocessor1_Model1
# predict on test --------------------------------------------------------------------------------------------
pred <- augment(fit, test)
pred %>%
ggplot(aes(exp(price), exp(.pred))) +
geom_point(alpha = 0.42, color = 'chartreuse') +
geom_smooth(method = 'lm', formula = 'y ~ x', se = FALSE) +
scale_x_log10(labels = scales::dollar_format()) +
scale_y_log10(labels = scales::dollar_format()) +
labs(x = 'Price', y = 'Predicted Price')

# variable importance ---------------------------------------------------------------------------------------
last_fit <- last_fit(work, split)
important <- last_fit$.workflow[[1]] %>% pull_workflow_fit() # extract_fit_parsnip()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
important$fit$imp %>%
slice_max(value, n = 10) %>%
ggplot(aes(value, fct_reorder(term, value)), fill = term)+
geom_col(alpha = 0.42, fill = "#FF6666", color = "black")+
labs(x = "Variable Importance Score", y = NULL)
