The data

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.7     ✔ recipes      1.1.1
## ✔ dials        1.4.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(xgboost)
## 
## Adjuntando el paquete: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(ggplot2)
library(readr)
## 
## Adjuntando el paquete: 'readr'
## The following object is masked from 'package:yardstick':
## 
##     spec
## The following object is masked from 'package:scales':
## 
##     col_factor
library(vip) 
## 
## Adjuntando el paquete: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Per-neighbourhood reference figures, one row per neighbourhood group.
# Names are lower-case and unaccented; presumably they are meant to line up
# with airbnb_clean$neighbourhood_group_cleansed -- confirm before joining.
# NOTE(review): units are not stated in this file. avg_price_m2 / avg_price
# look like purchase prices (per square metre / per dwelling) and
# pct_year_occupation like a 0-1 share of the year -- confirm with the source.
avg_prices <- data.frame(
  neighbourhood = c(
    "eixample", "ciutat vella", "sant marti", "sants-montjuic",
    "sarria-sant gervasi", "nou barris", "horta-guinardo", "gracia",
    "sant andreu", "les corts"
  ),
  avg_price_m2 = c(
    5881, 4751, 4728, 4220, 6242,
    2610, 3872, 5153, 3624, 5613
  ),
  avg_price = c(
    684012, 392645, 435215, 299140, 980439,
    201074, 310891, 500411, 288534, 779088
  ),
  latitude = c(
    41.389887, 41.382183, 41.412146, 41.374394, 41.402357,
    41.437541, 41.423826, 41.406744, 41.432717, 41.387295
  ),
  longitude = c(
    2.161808, 2.176437, 2.204667, 2.140377, 2.134925,
    2.175310, 2.161701, 2.158146, 2.189304, 2.126046
  ),
  pct_year_occupation = c(
    0.75, 0.8, 0.6, 0.7, 0.7,
    0.5, 0.55, 0.7, 0.6, 0.7
  )
)

# Load the cleaned listings table.
# The file location can be overridden with the AIRBNB_CLEAN_CSV environment
# variable so the analysis can run on another machine; when the variable is
# unset or empty, the original hard-coded location is used unchanged.
airbnb_path <- Sys.getenv("AIRBNB_CLEAN_CSV", unset = "")
if (!nzchar(airbnb_path)) {
  airbnb_path <- "C:/Users/Edgar/Desktop/Challenge_2/airbnb_clean.csv"
}
airbnb_clean <- read_csv(airbnb_path)
## Rows: 19833 Columns: 113
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): host_response_time, neighbourhood_group_cleansed, property_type, ...
## dbl  (23): host_id, host_response_rate, host_listings_count, latitude, longi...
## lgl  (83): host_is_superhost, host_has_profile_pic, host_identity_verified, ...
## date  (3): host_since, first_review, last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the structure of the loaded table: column names, types, first values.
str(object = airbnb_clean)
## spc_tbl_ [19,833 × 113] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ host_id                                                : num [1:19833] 71615 71615 82522 90417 108310 ...
##  $ host_since                                             : Date[1:19833], format: "2010-01-19" "2010-01-19" ...
##  $ host_response_time                                     : chr [1:19833] "within an hour" "within an hour" "within a few hours" "within an hour" ...
##  $ host_response_rate                                     : num [1:19833] 99 99 100 100 100 100 100 92 92 100 ...
##  $ host_is_superhost                                      : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
##  $ host_listings_count                                    : num [1:19833] 45 45 2 5 1 9 9 41 41 1 ...
##  $ host_has_profile_pic                                   : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ host_identity_verified                                 : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ neighbourhood_group_cleansed                           : chr [1:19833] "sant marti" "eixample" "sant marti" "sant marti" ...
##  $ latitude                                               : num [1:19833] 41.4 41.4 41.4 41.4 41.4 ...
##  $ longitude                                              : num [1:19833] 2.19 2.17 2.2 2.22 2.16 ...
##  $ is_location_exact                                      : logi [1:19833] TRUE TRUE TRUE FALSE TRUE TRUE ...
##  $ property_type                                          : chr [1:19833] "apartment" "apartment" "apartment" "apartment" ...
##  $ room_type                                              : chr [1:19833] "entire home/apt" "entire home/apt" "private room" "entire home/apt" ...
##  $ accommodates                                           : num [1:19833] 6 8 2 6 2 2 3 4 5 1 ...
##  $ bathrooms                                              : num [1:19833] 1 2 1 2 1 1 1 1 1.5 1 ...
##  $ bedrooms                                               : num [1:19833] 2 3 1 3 1 1 1 1 3 1 ...
##  $ beds                                                   : num [1:19833] 4 6 1 8 1 2 2 1 3 1 ...
##  $ price                                                  : num [1:19833] 130 60 33 210 45 42 53 75 85 30 ...
##  $ cleaning_fee                                           : num [1:19833] 42 50 NA 80 NA NA NA 55 105 0 ...
##  $ minimum_nights                                         : num [1:19833] 3 1 2 3 1 3 3 1 1 29 ...
##  $ maximum_nights                                         : num [1:19833] 730 1125 1125 1125 730 ...
##  $ has_availability                                       : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ number_of_reviews                                      : num [1:19833] 1 15 119 45 241 4 40 174 79 19 ...
##  $ first_review                                           : Date[1:19833], format: "2015-10-10" "2013-05-27" ...
##  $ last_review                                            : Date[1:19833], format: "2015-10-10" "2019-07-02" ...
##  $ review_scores_rating                                   : num [1:19833] 80 87 90 95 95 95 87 92 88 99 ...
##  $ review_scores_accuracy                                 : num [1:19833] 10 9 10 10 10 9 9 9 9 10 ...
##  $ review_scores_cleanliness                              : num [1:19833] 10 9 9 10 10 10 9 9 9 10 ...
##  $ review_scores_checkin                                  : num [1:19833] 2 10 10 10 10 10 9 8 9 10 ...
##  $ review_scores_communication                            : num [1:19833] 10 10 10 10 10 10 9 9 10 10 ...
##  $ review_scores_location                                 : num [1:19833] 10 9 9 9 10 9 8 9 9 9 ...
##  $ review_scores_value                                    : num [1:19833] 8 8 9 9 9 9 9 9 9 9 ...
##  $ instant_bookable                                       : logi [1:19833] FALSE TRUE FALSE TRUE TRUE FALSE ...
##  $ has_verificator__email                                 : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_verificator__phone                                 : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_verificator__reviews                               : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_verificator__jumio                                 : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ has_verificator__government_id                         : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ has_verificator__offline_government_id                 : logi [1:19833] FALSE FALSE TRUE TRUE TRUE FALSE ...
##  $ has_verificator__selfie                                : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
##  $ has_verificator__identity_manual                       : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
##  $ has_verificator__facebook                              : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_verificator__work_email                            : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_amenity__TV                                        : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ has_amenity__Internet                                  : logi [1:19833] TRUE TRUE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Wifi                                      : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Air.conditioning                          : logi [1:19833] TRUE TRUE FALSE FALSE TRUE TRUE ...
##  $ has_amenity__Kitchen                                   : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Elevator                                  : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ has_amenity__Free.street.parking                       : logi [1:19833] TRUE TRUE FALSE FALSE FALSE FALSE ...
##  $ has_amenity__Heating                                   : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Family.kid.friendly                       : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Washer                                    : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Dryer                                     : logi [1:19833] TRUE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Essentials                                : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Shampoo                                   : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Hair.dryer                                : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Hot.water                                 : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ has_amenity__Host.greets.you                           : logi [1:19833] TRUE TRUE FALSE TRUE FALSE TRUE ...
##  $ has_amenity__Paid.parking.on.premises                  : logi [1:19833] TRUE TRUE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Buzzer.wireless.intercom                  : logi [1:19833] FALSE TRUE FALSE TRUE FALSE TRUE ...
##  $ has_amenity__Hangers                                   : logi [1:19833] FALSE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__Iron                                      : logi [1:19833] FALSE TRUE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Laptop.friendly.workspace                 : logi [1:19833] FALSE TRUE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Crib                                      : logi [1:19833] FALSE TRUE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Paid.parking.off.premises                 : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__First.aid.kit                             : logi [1:19833] FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ has_amenity__Self.check.in                             : logi [1:19833] FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ has_amenity__Bed.linens                                : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Extra.pillows.and.blankets                : logi [1:19833] FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ has_amenity__Microwave                                 : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Coffee.maker                              : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Refrigerator                              : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Dishwasher                                : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Dishes.and.silverware                     : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Cooking.basics                            : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Oven                                      : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Stove                                     : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ has_amenity__Patio.or.balcony                          : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Luggage.dropoff.allowed                   : logi [1:19833] FALSE FALSE TRUE TRUE TRUE TRUE ...
##  $ has_amenity__No.stairs.or.steps.to.enter               : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Wide.entrance.for.guests                  : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Well.lit.path.to.entrance                 : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Wide.entryway                             : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ has_amenity__Smoke.detector                            : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Carbon.monoxide.detector                  : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Fire.extinguisher                         : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
##  $ has_amenity__High.chair                                : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Pack..n.Play.travel.crib                  : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Long.term.stays.allowed                   : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Wide.hallways                             : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ has_amenity__Smoking.allowed                           : logi [1:19833] FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ has_amenity__Lock.on.bedroom.door                      : logi [1:19833] FALSE FALSE FALSE FALSE TRUE TRUE ...
##  $ has_amenity__translation.missing..en.hosting_amenity_50: logi [1:19833] FALSE FALSE FALSE FALSE FALSE TRUE ...
##  $ has_amenity__Private.living.room                       : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_amenity__Cable.TV                                  : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_amenity__Safety.card                               : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_amenity__24.hour.check.in                          : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   [list output truncated]
##  - attr(*, "spec")=
##   .. cols(
##   ..   host_id = col_double(),
##   ..   host_since = col_date(format = ""),
##   ..   host_response_time = col_character(),
##   ..   host_response_rate = col_double(),
##   ..   host_is_superhost = col_logical(),
##   ..   host_listings_count = col_double(),
##   ..   host_has_profile_pic = col_logical(),
##   ..   host_identity_verified = col_logical(),
##   ..   neighbourhood_group_cleansed = col_character(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   is_location_exact = col_logical(),
##   ..   property_type = col_character(),
##   ..   room_type = col_character(),
##   ..   accommodates = col_double(),
##   ..   bathrooms = col_double(),
##   ..   bedrooms = col_double(),
##   ..   beds = col_double(),
##   ..   price = col_double(),
##   ..   cleaning_fee = col_double(),
##   ..   minimum_nights = col_double(),
##   ..   maximum_nights = col_double(),
##   ..   has_availability = col_logical(),
##   ..   number_of_reviews = col_double(),
##   ..   first_review = col_date(format = ""),
##   ..   last_review = col_date(format = ""),
##   ..   review_scores_rating = col_double(),
##   ..   review_scores_accuracy = col_double(),
##   ..   review_scores_cleanliness = col_double(),
##   ..   review_scores_checkin = col_double(),
##   ..   review_scores_communication = col_double(),
##   ..   review_scores_location = col_double(),
##   ..   review_scores_value = col_double(),
##   ..   instant_bookable = col_logical(),
##   ..   has_verificator__email = col_logical(),
##   ..   has_verificator__phone = col_logical(),
##   ..   has_verificator__reviews = col_logical(),
##   ..   has_verificator__jumio = col_logical(),
##   ..   has_verificator__government_id = col_logical(),
##   ..   has_verificator__offline_government_id = col_logical(),
##   ..   has_verificator__selfie = col_logical(),
##   ..   has_verificator__identity_manual = col_logical(),
##   ..   has_verificator__facebook = col_logical(),
##   ..   has_verificator__work_email = col_logical(),
##   ..   has_amenity__TV = col_logical(),
##   ..   has_amenity__Internet = col_logical(),
##   ..   has_amenity__Wifi = col_logical(),
##   ..   has_amenity__Air.conditioning = col_logical(),
##   ..   has_amenity__Kitchen = col_logical(),
##   ..   has_amenity__Elevator = col_logical(),
##   ..   has_amenity__Free.street.parking = col_logical(),
##   ..   has_amenity__Heating = col_logical(),
##   ..   has_amenity__Family.kid.friendly = col_logical(),
##   ..   has_amenity__Washer = col_logical(),
##   ..   has_amenity__Dryer = col_logical(),
##   ..   has_amenity__Essentials = col_logical(),
##   ..   has_amenity__Shampoo = col_logical(),
##   ..   has_amenity__Hair.dryer = col_logical(),
##   ..   has_amenity__Hot.water = col_logical(),
##   ..   has_amenity__Host.greets.you = col_logical(),
##   ..   has_amenity__Paid.parking.on.premises = col_logical(),
##   ..   has_amenity__Buzzer.wireless.intercom = col_logical(),
##   ..   has_amenity__Hangers = col_logical(),
##   ..   has_amenity__Iron = col_logical(),
##   ..   has_amenity__Laptop.friendly.workspace = col_logical(),
##   ..   has_amenity__Crib = col_logical(),
##   ..   has_amenity__Paid.parking.off.premises = col_logical(),
##   ..   has_amenity__First.aid.kit = col_logical(),
##   ..   has_amenity__Self.check.in = col_logical(),
##   ..   has_amenity__Bed.linens = col_logical(),
##   ..   has_amenity__Extra.pillows.and.blankets = col_logical(),
##   ..   has_amenity__Microwave = col_logical(),
##   ..   has_amenity__Coffee.maker = col_logical(),
##   ..   has_amenity__Refrigerator = col_logical(),
##   ..   has_amenity__Dishwasher = col_logical(),
##   ..   has_amenity__Dishes.and.silverware = col_logical(),
##   ..   has_amenity__Cooking.basics = col_logical(),
##   ..   has_amenity__Oven = col_logical(),
##   ..   has_amenity__Stove = col_logical(),
##   ..   has_amenity__Patio.or.balcony = col_logical(),
##   ..   has_amenity__Luggage.dropoff.allowed = col_logical(),
##   ..   has_amenity__No.stairs.or.steps.to.enter = col_logical(),
##   ..   has_amenity__Wide.entrance.for.guests = col_logical(),
##   ..   has_amenity__Well.lit.path.to.entrance = col_logical(),
##   ..   has_amenity__Wide.entryway = col_logical(),
##   ..   has_amenity__Smoke.detector = col_logical(),
##   ..   has_amenity__Carbon.monoxide.detector = col_logical(),
##   ..   has_amenity__Fire.extinguisher = col_logical(),
##   ..   has_amenity__High.chair = col_logical(),
##   ..   has_amenity__Pack..n.Play.travel.crib = col_logical(),
##   ..   has_amenity__Long.term.stays.allowed = col_logical(),
##   ..   has_amenity__Wide.hallways = col_logical(),
##   ..   has_amenity__Smoking.allowed = col_logical(),
##   ..   has_amenity__Lock.on.bedroom.door = col_logical(),
##   ..   has_amenity__translation.missing..en.hosting_amenity_50 = col_logical(),
##   ..   has_amenity__Private.living.room = col_logical(),
##   ..   has_amenity__Cable.TV = col_logical(),
##   ..   has_amenity__Safety.card = col_logical(),
##   ..   has_amenity__24.hour.check.in = col_logical(),
##   ..   has_amenity__Private.entrance = col_logical(),
##   ..   has_amenity__Breakfast = col_logical(),
##   ..   has_amenity__translation.missing..en.hosting_amenity_49 = col_logical(),
##   ..   has_amenity__Room.darkening.shades = col_logical(),
##   ..   has_amenity__Pets.allowed = col_logical(),
##   ..   has_amenity__Pocket.wifi = col_logical(),
##   ..   has_amenity__Extra.space.around.bed = col_logical(),
##   ..   has_amenity__Accessible.height.bed = col_logical(),
##   ..   has_amenity__Bathtub = col_logical(),
##   ..   has_amenity__Wide.entrance = col_logical(),
##   ..   has_amenity__.toilet = col_logical(),
##   ..   has_amenity__Ethernet.connection = col_logical(),
##   ..   log_price = col_double(),
##   ..   n_NA_cols = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Compact column-wise preview of the same table.
airbnb_clean %>% glimpse()
## Rows: 19,833
## Columns: 113
## $ host_id                                                 <dbl> 71615, 71615, …
## $ host_since                                              <date> 2010-01-19, 2…
## $ host_response_time                                      <chr> "within an hou…
## $ host_response_rate                                      <dbl> 99, 99, 100, 1…
## $ host_is_superhost                                       <lgl> FALSE, FALSE, …
## $ host_listings_count                                     <dbl> 45, 45, 2, 5, …
## $ host_has_profile_pic                                    <lgl> TRUE, TRUE, TR…
## $ host_identity_verified                                  <lgl> TRUE, TRUE, TR…
## $ neighbourhood_group_cleansed                            <chr> "sant marti", …
## $ latitude                                                <dbl> 41.40889, 41.4…
## $ longitude                                               <dbl> 2.18555, 2.173…
## $ is_location_exact                                       <lgl> TRUE, TRUE, TR…
## $ property_type                                           <chr> "apartment", "…
## $ room_type                                               <chr> "entire home/a…
## $ accommodates                                            <dbl> 6, 8, 2, 6, 2,…
## $ bathrooms                                               <dbl> 1.0, 2.0, 1.0,…
## $ bedrooms                                                <dbl> 2, 3, 1, 3, 1,…
## $ beds                                                    <dbl> 4, 6, 1, 8, 1,…
## $ price                                                   <dbl> 130, 60, 33, 2…
## $ cleaning_fee                                            <dbl> 42, 50, NA, 80…
## $ minimum_nights                                          <dbl> 3, 1, 2, 3, 1,…
## $ maximum_nights                                          <dbl> 730, 1125, 112…
## $ has_availability                                        <lgl> TRUE, TRUE, TR…
## $ number_of_reviews                                       <dbl> 1, 15, 119, 45…
## $ first_review                                            <date> 2015-10-10, 2…
## $ last_review                                             <date> 2015-10-10, 2…
## $ review_scores_rating                                    <dbl> 80, 87, 90, 95…
## $ review_scores_accuracy                                  <dbl> 10, 9, 10, 10,…
## $ review_scores_cleanliness                               <dbl> 10, 9, 9, 10, …
## $ review_scores_checkin                                   <dbl> 2, 10, 10, 10,…
## $ review_scores_communication                             <dbl> 10, 10, 10, 10…
## $ review_scores_location                                  <dbl> 10, 9, 9, 9, 1…
## $ review_scores_value                                     <dbl> 8, 8, 9, 9, 9,…
## $ instant_bookable                                        <lgl> FALSE, TRUE, F…
## $ has_verificator__email                                  <lgl> TRUE, TRUE, TR…
## $ has_verificator__phone                                  <lgl> TRUE, TRUE, TR…
## $ has_verificator__reviews                                <lgl> TRUE, TRUE, TR…
## $ has_verificator__jumio                                  <lgl> TRUE, TRUE, TR…
## $ has_verificator__government_id                          <lgl> TRUE, TRUE, TR…
## $ has_verificator__offline_government_id                  <lgl> FALSE, FALSE, …
## $ has_verificator__selfie                                 <lgl> FALSE, FALSE, …
## $ has_verificator__identity_manual                        <lgl> FALSE, FALSE, …
## $ has_verificator__facebook                               <lgl> FALSE, FALSE, …
## $ has_verificator__work_email                             <lgl> FALSE, FALSE, …
## $ has_amenity__TV                                         <lgl> TRUE, TRUE, TR…
## $ has_amenity__Internet                                   <lgl> TRUE, TRUE, FA…
## $ has_amenity__Wifi                                       <lgl> TRUE, TRUE, TR…
## $ has_amenity__Air.conditioning                           <lgl> TRUE, TRUE, FA…
## $ has_amenity__Kitchen                                    <lgl> TRUE, TRUE, TR…
## $ has_amenity__Elevator                                   <lgl> TRUE, TRUE, TR…
## $ has_amenity__Free.street.parking                        <lgl> TRUE, TRUE, FA…
## $ has_amenity__Heating                                    <lgl> TRUE, TRUE, TR…
## $ has_amenity__Family.kid.friendly                        <lgl> TRUE, TRUE, TR…
## $ has_amenity__Washer                                     <lgl> TRUE, TRUE, TR…
## $ has_amenity__Dryer                                      <lgl> TRUE, FALSE, T…
## $ has_amenity__Essentials                                 <lgl> TRUE, TRUE, TR…
## $ has_amenity__Shampoo                                    <lgl> TRUE, TRUE, TR…
## $ has_amenity__Hair.dryer                                 <lgl> TRUE, TRUE, TR…
## $ has_amenity__Hot.water                                  <lgl> TRUE, TRUE, TR…
## $ has_amenity__Host.greets.you                            <lgl> TRUE, TRUE, FA…
## $ has_amenity__Paid.parking.on.premises                   <lgl> TRUE, TRUE, TR…
## $ has_amenity__Buzzer.wireless.intercom                   <lgl> FALSE, TRUE, F…
## $ has_amenity__Hangers                                    <lgl> FALSE, TRUE, T…
## $ has_amenity__Iron                                       <lgl> FALSE, TRUE, T…
## $ has_amenity__Laptop.friendly.workspace                  <lgl> FALSE, TRUE, T…
## $ has_amenity__Crib                                       <lgl> FALSE, TRUE, F…
## $ has_amenity__Paid.parking.off.premises                  <lgl> FALSE, FALSE, …
## $ has_amenity__First.aid.kit                              <lgl> FALSE, FALSE, …
## $ has_amenity__Self.check.in                              <lgl> FALSE, FALSE, …
## $ has_amenity__Bed.linens                                 <lgl> FALSE, FALSE, …
## $ has_amenity__Extra.pillows.and.blankets                 <lgl> FALSE, FALSE, …
## $ has_amenity__Microwave                                  <lgl> FALSE, FALSE, …
## $ has_amenity__Coffee.maker                               <lgl> FALSE, FALSE, …
## $ has_amenity__Refrigerator                               <lgl> FALSE, FALSE, …
## $ has_amenity__Dishwasher                                 <lgl> FALSE, FALSE, …
## $ has_amenity__Dishes.and.silverware                      <lgl> FALSE, FALSE, …
## $ has_amenity__Cooking.basics                             <lgl> FALSE, FALSE, …
## $ has_amenity__Oven                                       <lgl> FALSE, FALSE, …
## $ has_amenity__Stove                                      <lgl> FALSE, FALSE, …
## $ has_amenity__Patio.or.balcony                           <lgl> FALSE, FALSE, …
## $ has_amenity__Luggage.dropoff.allowed                    <lgl> FALSE, FALSE, …
## $ has_amenity__No.stairs.or.steps.to.enter                <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entrance.for.guests                   <lgl> FALSE, FALSE, …
## $ has_amenity__Well.lit.path.to.entrance                  <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entryway                              <lgl> FALSE, FALSE, …
## $ has_amenity__Smoke.detector                             <lgl> FALSE, FALSE, …
## $ has_amenity__Carbon.monoxide.detector                   <lgl> FALSE, FALSE, …
## $ has_amenity__Fire.extinguisher                          <lgl> FALSE, FALSE, …
## $ has_amenity__High.chair                                 <lgl> FALSE, FALSE, …
## $ has_amenity__Pack..n.Play.travel.crib                   <lgl> FALSE, FALSE, …
## $ has_amenity__Long.term.stays.allowed                    <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.hallways                              <lgl> FALSE, FALSE, …
## $ has_amenity__Smoking.allowed                            <lgl> FALSE, FALSE, …
## $ has_amenity__Lock.on.bedroom.door                       <lgl> FALSE, FALSE, …
## $ has_amenity__translation.missing..en.hosting_amenity_50 <lgl> FALSE, FALSE, …
## $ has_amenity__Private.living.room                        <lgl> FALSE, FALSE, …
## $ has_amenity__Cable.TV                                   <lgl> FALSE, FALSE, …
## $ has_amenity__Safety.card                                <lgl> FALSE, FALSE, …
## $ has_amenity__24.hour.check.in                           <lgl> FALSE, FALSE, …
## $ has_amenity__Private.entrance                           <lgl> FALSE, FALSE, …
## $ has_amenity__Breakfast                                  <lgl> FALSE, FALSE, …
## $ has_amenity__translation.missing..en.hosting_amenity_49 <lgl> FALSE, FALSE, …
## $ has_amenity__Room.darkening.shades                      <lgl> FALSE, FALSE, …
## $ has_amenity__Pets.allowed                               <lgl> FALSE, FALSE, …
## $ has_amenity__Pocket.wifi                                <lgl> FALSE, FALSE, …
## $ has_amenity__Extra.space.around.bed                     <lgl> FALSE, FALSE, …
## $ has_amenity__Accessible.height.bed                      <lgl> FALSE, FALSE, …
## $ has_amenity__Bathtub                                    <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entrance                              <lgl> FALSE, FALSE, …
## $ has_amenity__.toilet                                    <lgl> FALSE, FALSE, …
## $ has_amenity__Ethernet.connection                        <lgl> FALSE, FALSE, …
## $ log_price                                               <dbl> 4.867534, 4.09…
## $ n_NA_cols                                               <dbl> 0, 0, 1, 0, 1,…

Exercise 1: Train-test split

To begin, we split our dataset into training (80%) and testing (20%) sets using the initial_split function from tidymodels. Since our target variable is log_price, we applied stratification to ensure that both sets maintained a similar price distribution. We also set a seed (123) to guarantee reproducibility.

# Fix the RNG seed first so the partition below is reproducible.
set.seed(123)

# 80/20 split of the listings, stratified on the outcome (log_price).
airbnb_split <- airbnb_clean %>%
  initial_split(prop = 0.8, strata = log_price)

# Materialise the two partitions as separate data frames.
airbnb_train <- airbnb_split %>% training()
airbnb_test <- airbnb_split %>% testing()

Exercise 2: K-fold cross-validation

Secondly, we implemented 10-fold cross-validation on the training data using vfold_cv. We stratified by log_price to maintain consistency in price distributions across all validation folds.

# Ten cross-validation folds built from the training set only,
# stratified on log_price.
airbnb_folds <- airbnb_train %>%
  vfold_cv(v = 10, strata = log_price)

Exercise 3: Recipe, metrics, model and workflow

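For data preprocessing, we designed a recipe that:

- removes the host identifier, the date columns, several host-profile fields, is_location_exact, n_NA_cols, all review-score columns, and the raw price (the target is log_price);
- recodes cleaning_fee as a 0/1 indicator of whether a non-zero cleaning fee is charged;
- imputes missing values in the numeric predictors with the median and then normalizes them;
- one-hot encodes the categorical predictors, reserving an extra level for categories not seen during training;
- drops zero-variance predictors and converts the logical columns to 0/1.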

To evaluate our model, we selected RMSE, MAE, and R-squared as key performance metrics. We then created a workflow that integrated our preprocessing steps with an XGBoost model, allowing us to tune key hyperparameters such as the number of trees, tree depth, minimum node size, and learning rate.

# Recipe: preprocessing applied to the predictors before modelling.
# log_price is the outcome; every other column starts out as a predictor.
airbnb_recipe <- recipe(formula = log_price ~ ., data = airbnb_train) %>%
  # Drop the host identifier, date columns, host-profile fields and the
  # n_NA_cols bookkeeping column. `price` is dropped as well so the raw
  # price cannot be used to predict log_price.
  step_rm(host_id, host_since, first_review, last_review, host_has_profile_pic,
          host_response_time, host_response_rate, host_identity_verified,
          is_location_exact, n_NA_cols, price) %>%
  # Drop every remaining column whose name contains "review_"
  # (the review_scores_* columns).
  step_rm(matches("review_")) %>%
  # Recode cleaning_fee as a flag: 0 when missing or zero, 1 otherwise.
  step_mutate(cleaning_fee = as.numeric(ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1))) %>%
  # Fill remaining missing numeric values with the training median.
  step_impute_median(all_numeric_predictors()) %>%
  # Centre and scale the numeric predictors.
  # NOTE(review): this also z-scores the 0/1 cleaning_fee flag created above;
  # confirm that is intended.
  step_normalize(all_numeric_predictors())  %>%
  # Reserve a level for categories that were not present at training time.
  step_novel(all_nominal_predictors()) %>%
  # One-hot encode the categorical predictors (one column per level).
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Remove predictors that have a single unique value in the training set.
  step_zv(all_predictors()) %>%
  # Convert the logical columns to 0/1 numerics.
  step_mutate(across(where(is.logical), as.numeric))

# Estimate the recipe on the training set and inspect the processed result.
# bake(new_data = NULL) returns the processed training data; it replaces
# juice(), which is superseded in recipes.
airbnb_prep <- prep(airbnb_recipe, training = airbnb_train)
bake(airbnb_prep, new_data = NULL) %>% glimpse()
## Rows: 15,865
## Columns: 131
## $ host_is_superhost                                       <dbl> 0, 0, 1, 1, 0,…
## $ host_listings_count                                     <dbl> -0.3229847, -0…
## $ latitude                                                <dbl> 0.90858229, -0…
## $ longitude                                               <dbl> 1.70223211, -1…
## $ accommodates                                            <dbl> -0.6115208, -1…
## $ bathrooms                                               <dbl> -0.4940825, -0…
## $ bedrooms                                                <dbl> -0.5595624, -0…
## $ beds                                                    <dbl> -0.6743047, -0…
## $ cleaning_fee                                            <dbl> -1.4661367, -1…
## $ minimum_nights                                          <dbl> -0.38691601, 1…
## $ maximum_nights                                          <dbl> -0.01121042, -…
## $ number_of_reviews                                       <dbl> 1.4918692, -0.…
## $ instant_bookable                                        <dbl> 0, 0, 1, 1, 0,…
## $ has_verificator__email                                  <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__phone                                  <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__reviews                                <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__jumio                                  <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__government_id                          <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__offline_government_id                  <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__selfie                                 <dbl> 0, 0, 0, 0, 1,…
## $ has_verificator__identity_manual                        <dbl> 0, 0, 0, 0, 1,…
## $ has_verificator__facebook                               <dbl> 0, 1, 0, 0, 0,…
## $ has_verificator__work_email                             <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__TV                                         <dbl> 1, 0, 1, 1, 0,…
## $ has_amenity__Internet                                   <dbl> 0, 1, 1, 1, 1,…
## $ has_amenity__Wifi                                       <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Air.conditioning                           <dbl> 0, 1, 0, 0, 0,…
## $ has_amenity__Kitchen                                    <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Elevator                                   <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Free.street.parking                        <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Heating                                    <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Family.kid.friendly                        <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Washer                                     <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Dryer                                      <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Essentials                                 <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Shampoo                                    <dbl> 1, 0, 1, 1, 0,…
## $ has_amenity__Hair.dryer                                 <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Hot.water                                  <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Host.greets.you                            <dbl> 0, 0, 1, 1, 1,…
## $ has_amenity__Paid.parking.on.premises                   <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Buzzer.wireless.intercom                   <dbl> 0, 1, 1, 0, 1,…
## $ has_amenity__Hangers                                    <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Iron                                       <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Laptop.friendly.workspace                  <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Crib                                       <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Paid.parking.off.premises                  <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__First.aid.kit                              <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Self.check.in                              <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Bed.linens                                 <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Extra.pillows.and.blankets                 <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Microwave                                  <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Coffee.maker                               <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Refrigerator                               <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Dishwasher                                 <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Dishes.and.silverware                      <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Cooking.basics                             <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Oven                                       <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Stove                                      <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Patio.or.balcony                           <dbl> 1, 1, 0, 0, 1,…
## $ has_amenity__Luggage.dropoff.allowed                    <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__No.stairs.or.steps.to.enter                <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Wide.entrance.for.guests                   <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Well.lit.path.to.entrance                  <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Wide.entryway                              <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Smoke.detector                             <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Carbon.monoxide.detector                   <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Fire.extinguisher                          <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__High.chair                                 <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Pack..n.Play.travel.crib                   <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Long.term.stays.allowed                    <dbl> 0, 1, 0, 0, 1,…
## $ has_amenity__Wide.hallways                              <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Smoking.allowed                            <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Lock.on.bedroom.door                       <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__translation.missing..en.hosting_amenity_50 <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__Private.living.room                        <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Cable.TV                                   <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Safety.card                                <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__24.hour.check.in                           <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Private.entrance                           <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Breakfast                                  <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__translation.missing..en.hosting_amenity_49 <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__Room.darkening.shades                      <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Pets.allowed                               <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Pocket.wifi                                <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Extra.space.around.bed                     <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Accessible.height.bed                      <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Bathtub                                    <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Wide.entrance                              <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__.toilet                                    <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Ethernet.connection                        <dbl> 0, 0, 0, 0, 0,…
## $ log_price                                               <dbl> 3.496508, 3.40…
## $ neighbourhood_group_cleansed_ciutat.vella               <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_eixample                   <dbl> 0, 0, 1, 1, 0,…
## $ neighbourhood_group_cleansed_gracia                     <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_horta.guinardo             <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_les.corts                  <dbl> 0, 1, 0, 0, 0,…
## $ neighbourhood_group_cleansed_nou.barris                 <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sant.andreu                <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sant.marti                 <dbl> 1, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sants.montjuic             <dbl> 0, 0, 0, 0, 1,…
## $ neighbourhood_group_cleansed_sarria.sant.gervasi        <dbl> 0, 0, 0, 0, 0,…
## $ property_type_aparthotel                                <dbl> 0, 0, 0, 0, 0,…
## $ property_type_apartment                                 <dbl> 1, 1, 1, 1, 1,…
## $ property_type_barn                                      <dbl> 0, 0, 0, 0, 0,…
## $ property_type_bed.and.breakfast                         <dbl> 0, 0, 0, 0, 0,…
## $ property_type_boat                                      <dbl> 0, 0, 0, 0, 0,…
## $ property_type_boutique.hotel                            <dbl> 0, 0, 0, 0, 0,…
## $ property_type_cabin                                     <dbl> 0, 0, 0, 0, 0,…
## $ property_type_camper.rv                                 <dbl> 0, 0, 0, 0, 0,…
## $ property_type_casa.particular..cuba.                    <dbl> 0, 0, 0, 0, 0,…
## $ property_type_chalet                                    <dbl> 0, 0, 0, 0, 0,…
## $ property_type_condominium                               <dbl> 0, 0, 0, 0, 0,…
## $ property_type_cottage                                   <dbl> 0, 0, 0, 0, 0,…
## $ property_type_dome.house                                <dbl> 0, 0, 0, 0, 0,…
## $ property_type_earth.house                               <dbl> 0, 0, 0, 0, 0,…
## $ property_type_farm.stay                                 <dbl> 0, 0, 0, 0, 0,…
## $ property_type_guest.suite                               <dbl> 0, 0, 0, 0, 0,…
## $ property_type_guesthouse                                <dbl> 0, 0, 0, 0, 0,…
## $ property_type_hostel                                    <dbl> 0, 0, 0, 0, 0,…
## $ property_type_hotel                                     <dbl> 0, 0, 0, 0, 0,…
## $ property_type_house                                     <dbl> 0, 0, 0, 0, 0,…
## $ property_type_loft                                      <dbl> 0, 0, 0, 0, 0,…
## $ property_type_nature.lodge                              <dbl> 0, 0, 0, 0, 0,…
## $ property_type_other                                     <dbl> 0, 0, 0, 0, 0,…
## $ property_type_serviced.apartment                        <dbl> 0, 0, 0, 0, 0,…
## $ property_type_tiny.house                                <dbl> 0, 0, 0, 0, 0,…
## $ property_type_townhouse                                 <dbl> 0, 0, 0, 0, 0,…
## $ property_type_villa                                     <dbl> 0, 0, 0, 0, 0,…
## $ room_type_entire.home.apt                               <dbl> 0, 0, 0, 0, 0,…
## $ room_type_private.room                                  <dbl> 1, 1, 1, 1, 1,…
## $ room_type_shared.room                                   <dbl> 0, 0, 0, 0, 0,…
# Metrics: RMSE, MAE and R-squared bundled into a single metric set
airbnb_metrics <- metric_set(rmse, mae, rsq)

# Model: xgboost gradient-boosted trees for regression; the four
# hyperparameters below are left as tune() placeholders
airbnb_model <- boost_tree(
  mode = "regression",
  engine = "xgboost",
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune()
)

# Workflow: the recipe and the model specification in one object
airbnb_wf <- workflow(preprocessor = airbnb_recipe, spec = airbnb_model)

Exercise 5: Model extraction

Once trained, we extracted our final workflow as the airbnb_predictor object, which includes both the pre-processing steps and the trained model. This ensures that future predictions can be made without reapplying transformations separately.

# Hyperparameter combination with the best RMSE among the tuning results.
best_params <- select_best(tune_results, metric = "rmse")

# Replace the tune() placeholders in the workflow with the selected values.
final_wf <- finalize_workflow(airbnb_wf, best_params)

# Final fit driven by the initial split object.
final_fit <- last_fit(final_wf, split = airbnb_split)

# The fitted workflow carries both the trained recipe and the trained model.
airbnb_predictor <- extract_workflow(final_fit)

save(airbnb_predictor, file = 'C:/Users/Edgar/Desktop/Challenge_2/airbnb_predictor.RData')

# Variable-importance plot for the ten most important features.
vip(extract_fit_parsnip(airbnb_predictor), num_features = 10)

Exercise 6: Prediction and visualization

We used our model to predict prices and visualized the results to assess accuracy. A scatter plot comparing actual vs. predicted log prices showed strong alignment along the diagonal, indicating good predictive performance. After converting predictions back to actual prices, some expected variance appeared at higher price points, which is typical for models using log-transformed targets.

# Predict the log price of every listing and keep the prediction (.pred)
# alongside the original columns.
# NOTE(review): airbnb_clean presumably includes the rows the model was
# trained on, so the metrics below are optimistic -- confirm, and prefer the
# held-out metrics from last_fit() when reporting model quality.
airbnb_predictions <- predict(airbnb_predictor, new_data = airbnb_clean) %>%
  bind_cols(airbnb_clean)

# Stored under its own name: `airbnb_metrics` is the metric_set() defined for
# modelling, and overwriting it with a results tibble would break any re-run
# of code that passes it as a metric set.
airbnb_prediction_metrics <- airbnb_predictions %>%
  metrics(truth = log_price, estimate = .pred)
airbnb_prediction_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.186 
## 2 rsq     standard      0.953 
## 3 mae     standard      0.0660
save(airbnb_predictions, file = 'C:/Users/Edgar/Desktop/Challenge_2/airbnb_predictions.RData')

# Predicted vs. actual on the log scale; the dashed red line marks a perfect
# prediction
airbnb_predictions %>%
  ggplot(aes(x = log_price, y = .pred)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    x = "Actual Log Price",
    y = "Predicted Log Price",
    title = "Actual vs. Predicted Log Prices"
  ) +
  theme_minimal()

# Back-transform both the prediction and the outcome from the log scale
predictions_converted <- mutate(
  airbnb_predictions,
  predicted_price = exp(.pred),
  actual_price = exp(log_price)
)

# R-squared on the back-transformed scale
predictions_converted %>%
  rsq(truth = actual_price, estimate = predicted_price)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.931
# Same comparison on the back-transformed scale, zoomed to the 0-1000 range
# on both axes (coord_cartesian zooms without dropping points)
predictions_converted %>%
  ggplot(aes(x = actual_price, y = predicted_price)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 1000)) +
  labs(
    x = "Actual Price",
    y = "Predicted Price",
    title = "Actual vs. Predicted Prices"
  ) +
  theme_minimal()

Exercise 7: Geo-spatial visualization

To explore potential spatial patterns in prediction errors, we created an interactive Leaflet map. Data points were colored based on the difference between actual and predicted log prices, helping to identify neighborhoods where the model performed better or worse. While we did not conduct a deep dive into these patterns, the visualization provided valuable insights into geographic variations.

library(dplyr)
library(leaflet)
library(mapview)

# One row per listing: coordinates, neighbourhood, actual and predicted log
# price. airbnb_predictions was built by binding the predictions onto
# airbnb_clean, so it already holds every column needed here and no join back
# to airbnb_clean is required.
map_data <- airbnb_predictions %>%
  mutate(row_id = row_number()) %>%
  select(row_id, .pred, longitude, latitude, neighbourhood_group_cleansed, log_price) %>%
  # Prediction error on the log scale: positive = over-prediction.
  mutate(price_diff = .pred - log_price)

# Continuous colour scale spanning the observed range of prediction errors.
pal <- colorNumeric(palette = "RdYlBu", domain = map_data$price_diff)

# Create interactive map: one small marker per listing, coloured by error.
# Popups and legend say "Log Price" because log_price and .pred are on the
# log scale.
map <- leaflet(map_data) %>%
  addTiles() %>%
  addCircleMarkers(
    ~longitude, ~latitude,
    color = ~pal(price_diff),
    radius = 0.5,
    fillOpacity = 0.9,
    popup = ~paste0("Actual Log Price: ", log_price, "<br>",
                    "Predicted Log Price: ", .pred, "<br>",
                    "Difference: ", price_diff)
  ) %>%
  addLegend(
    pal = pal, values = ~price_diff, title = "Log Price Difference",
    position = "bottomright"
  )

map
save(map, file = "C:/Users/Edgar/Desktop/Challenge_2/neighbourhood_maps.RData")


# Distinct neighbourhood names, in order of first appearance in map_data
neighborhoods <- unique(map_data[["neighbourhood_group_cleansed"]])

# Build an interactive prediction-error map for a single neighbourhood.
#
# Args:
#   neighborhood: value of neighbourhood_group_cleansed to keep.
#   data: data frame with the columns longitude, latitude, log_price, .pred,
#     price_diff and neighbourhood_group_cleansed. Defaults to the global
#     map_data, so existing one-argument calls keep working.
#
# Returns:
#   A leaflet map with one circle marker per listing, coloured by price_diff.
create_neighborhood_map <- function(neighborhood, data = map_data) {
  # Subset the data for this neighborhood
  neighborhood_data <- data %>%
    filter(neighbourhood_group_cleansed == neighborhood)

  # The colour scale is fitted to this neighbourhood only, so colours are not
  # comparable between neighbourhood maps.
  error_pal <- colorNumeric(palette = "RdYlBu", domain = neighborhood_data$price_diff)

  # Named `neighborhood_map` locally to avoid shadowing purrr::map().
  neighborhood_map <- leaflet(neighborhood_data) %>%
    addTiles() %>%
    addCircleMarkers(
      ~longitude, ~latitude,
      color = ~error_pal(price_diff),
      radius = 5,
      fillOpacity = 0.7,
      # log_price and .pred are log-scale values, so they are labelled as such.
      popup = ~paste0("Actual Log Price: ", log_price, "<br>",
                      "Predicted Log Price: ", .pred, "<br>",
                      "Difference: ", price_diff)
    ) %>%
    addLegend(
      pal = error_pal,
      values = ~price_diff,
      # The legend is drawn as HTML, so the line break must be <br>, not "\n".
      title = paste0("Log Price Difference<br>", neighborhood),
      position = "bottomright"
    )

  neighborhood_map
}

# One map per neighbourhood. Naming the input vector by itself makes lapply()
# return a list keyed by neighbourhood name.
neighborhood_maps <- lapply(
  setNames(neighborhoods, neighborhoods),
  create_neighborhood_map
)

# Print every map in turn
for (neighborhood_map in neighborhood_maps) {
  print(neighborhood_map)
}

save(neighborhood_maps, file = 'C:/Users/Edgar/Desktop/Challenge_2/neighborhood_maps.RData')

Exercise 8: Investment analysis

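In the final stage, we applied our model to analyze the best neighborhoods for investment in Barcelona, assuming a €3 million budget. Our approach involved:

- creating one standardized hypothetical listing per neighborhood (an entire apartment for 4 guests with 2 bedrooms, 2 beds, 1 bathroom, a €20 cleaning fee and all amenities), placed at the neighborhood's reference coordinates;
- predicting the nightly price of each listing with the trained model;
- estimating annual revenue as nightly price × 365 × the neighborhood's estimated occupancy rate;
- computing the payback time (average purchase price divided by annual revenue) and the annual return rate for each neighborhood;
- buying one property per neighborhood, starting with the shortest payback time, for as long as the cumulative cost stayed within the budget.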

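The analysis revealed that the most favorable investment locations were:

- Sant Andreu (payback time of 6.26 years, 15.98% annual return rate);
- Nou Barris (6.32 years, 15.81%);
- Sants-Montjuïc (6.59 years, 15.17%).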

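Our optimized investment strategy allocated €2,427,910 across these top-performing neighborhoods, leaving €572,090 in reserve. This portfolio would generate an estimated €288,673 in annual returns, yielding an 11.89% return rate, with a weighted average ROI time of 8.92 years. (These figures are taken from the run shown below; because the base listing is drawn at random, they change between runs unless the random seed is fixed.)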

A key takeaway from this analysis is that central, tourist-heavy neighborhoods did not necessarily offer the best investment opportunities. While these areas had higher rental prices, their significantly higher acquisition costs led to lower returns. This challenges the common assumption that the most popular districts always provide the highest profitability.

# Create samples code

# Build a hypothetical listing by overwriting selected columns of an existing
# row of the listings data.
#
# The row(s) `origin_sample` of `df` are used as a template. The columns listed
# in the arguments below are replaced by the supplied values, and every logical
# column is then set to TRUE (i.e. all boolean flags are switched on).
#
# Args:
#   df: listings data frame used as the template source (default: the global
#     airbnb_clean).
#   df_prices: data frame with `neighbourhood`, `latitude` and `longitude`
#     columns (default: the global avg_prices).
#   origin_sample: row index of `df` to use as the template.
#   neighbourhood_new: neighbourhood name; must occur in
#     df_prices$neighbourhood.
#   latitude_new, longitude_new: optional coordinates. When omitted, the
#     coordinates stored in `df_prices` for the neighbourhood are used.
#   remaining *_new arguments: replacement values for the column of the same
#     name without the `_new` suffix.
#
# Returns:
#   The modified template row(s).
#
# Requires dplyr to be attached (the script attaches it through tidymodels);
# the function itself no longer attaches packages as a side effect.
create_sample <- function(
    df =                                       airbnb_clean,      # The original data frame (airbnb_clean)
    df_prices =                                avg_prices,        # The data frame containing average prices, coordinates and estimated occupation
    origin_sample =                            1,                 # The desired index row of airbnb_clean
    host_response_time_new =                   'within an hour',
    host_response_rate_new =                    100,
    host_listings_count_new =                   1,
    neighbourhood_new,
    latitude_new,
    longitude_new,
    property_type_new =                        'apartment',
    room_type_new =                            'entire home/apt',
    accommodates_new =                         4,
    bathrooms_new =                            1,
    bedrooms_new =                             2,
    beds_new =                                 2,
    cleaning_fee_new =                         20,
    minimum_nights_new =                       1,
    maximum_nights_new =                       30,
    review_scores_rating_new =                 100,
    review_scores_accuracy_new =               10,
    review_scores_cleanliness_new =            10,
    review_scores_checkin_new =                10,
    review_scores_communication_new =          10,
    review_scores_location_new =               10,
    review_scores_value_new =                  10
){
  # Fail early with a readable message instead of an opaque size error from
  # mutate() when the neighbourhood cannot be looked up.
  if (length(neighbourhood_new) != 1L) {
    stop("`neighbourhood_new` must be a single neighbourhood name.", call. = FALSE)
  }
  price_row <- match(neighbourhood_new, df_prices$neighbourhood)
  if (is.na(price_row)) {
    stop(
      "Unknown neighbourhood '", neighbourhood_new,
      "': not found in df_prices$neighbourhood.",
      call. = FALSE
    )
  }

  # Explicit coordinates win; otherwise fall back to the neighbourhood's
  # coordinates from df_prices (the behaviour when they are not supplied).
  latitude_value <- if (missing(latitude_new)) {
    df_prices$latitude[price_row]
  } else {
    latitude_new
  }
  longitude_value <- if (missing(longitude_new)) {
    df_prices$longitude[price_row]
  } else {
    longitude_new
  }

  df[origin_sample, ] %>%
    mutate(
      host_response_time = host_response_time_new,
      host_response_rate = host_response_rate_new,
      host_listings_count = host_listings_count_new,
      neighbourhood_group_cleansed = neighbourhood_new,
      latitude = latitude_value,
      longitude = longitude_value,
      property_type = property_type_new,
      room_type = room_type_new,
      accommodates = accommodates_new,
      bathrooms = bathrooms_new,
      bedrooms = bedrooms_new,
      beds = beds_new,
      cleaning_fee = cleaning_fee_new,
      minimum_nights = minimum_nights_new,
      maximum_nights = maximum_nights_new,
      review_scores_rating = review_scores_rating_new,
      review_scores_accuracy = review_scores_accuracy_new,
      review_scores_cleanliness = review_scores_cleanliness_new,
      review_scores_checkin = review_scores_checkin_new,
      review_scores_communication = review_scores_communication_new,
      review_scores_location = review_scores_location_new,
      review_scores_value = review_scores_value_new
    ) %>%
    # Switch every logical flag on.
    mutate(across(where(is.logical), ~ TRUE))
}

# Function to generate samples for each neighborhood
#
# Creates one hypothetical listing per neighbourhood in avg_prices, all derived
# from the same template row of airbnb_clean so that only the location differs
# between them.
#
# Args:
#   origin_sample: row index of airbnb_clean to use as the template. When NULL
#     (the default) one row is drawn at random; call set.seed() beforehand for
#     a reproducible draw.
#
# Returns:
#   A data frame with one row per neighbourhood of avg_prices.
generate_neighborhood_samples <- function(origin_sample = NULL) {
  if (is.null(origin_sample)) {
    n_listings <- nrow(airbnb_clean)
    if (n_listings < 1L) {
      stop("airbnb_clean has no rows to sample a template listing from.", call. = FALSE)
    }
    # sample.int() avoids the 1:0 pitfall of sample(1:nrow(x)) and draws the
    # same value as the previous code for a given seed.
    origin_sample <- sample.int(n_listings, size = 1)
  }

  map_dfr(avg_prices$neighbourhood, function(neighborhood) {
    create_sample(
      df = airbnb_clean,
      df_prices = avg_prices,
      origin_sample = origin_sample,
      # Full argument name: `neighbourhood =` only worked through partial
      # argument matching.
      neighbourhood_new = neighborhood,
      cleaning_fee_new = 20
    )
  })
}

# Fix the RNG seed so that the randomly drawn template listing -- and every
# investment figure derived from it -- is the same on each run.
set.seed(123)
neighborhood_samples <- generate_neighborhood_samples()
## Cargando paquete requerido: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ xgboost::slice()    masks dplyr::slice()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Predict the log price of each hypothetical listing, back-transform it with
# exp() and keep one row per neighbourhood
predicted_prices <- predict(airbnb_predictor, new_data = neighborhood_samples) %>%
  mutate(
    neighbourhood = neighborhood_samples$neighbourhood_group_cleansed,
    predicted_price = exp(.pred)
  ) %>%
  select(neighbourhood, predicted_price)

# Total budget available for purchases; defined once so the per-neighbourhood
# analysis and the portfolio selection cannot drift apart.
investment_budget <- 3000000

# Investment analysis dataframe
# One row per neighbourhood: predicted nightly price joined with purchase
# prices and occupancy, plus the derived revenue and payback figures.
investment_analysis <- predicted_prices %>%
  left_join(avg_prices, by = "neighbourhood") %>%
  mutate(
    # Number of average-priced properties the whole budget could buy here
    max_properties = floor(investment_budget / avg_price),

    # Daily revenue of one property = predicted nightly price
    daily_revenue = predicted_price,

    # Annual revenue, scaled by the share of the year the property is occupied
    annual_revenue = daily_revenue * 365 * pct_year_occupation,

    # Years of revenue needed to recover the purchase price
    roi_years = avg_price / annual_revenue,

    # Cost and yearly revenue of buying max_properties properties
    total_investment = max_properties * avg_price,
    total_annual_return = max_properties * annual_revenue,

    # Annual return as a percentage of the amount invested. max_properties
    # cancels out of total_annual_return / total_investment, so the ratio is
    # taken per property; this also avoids 0 / 0 when max_properties is 0.
    return_rate = (annual_revenue / avg_price) * 100
  )

# Optimal investment portfolio
# Greedy selection: at most one property per neighbourhood, taken in order of
# shortest payback time while the cumulative cost stays within the budget.
investment_portfolio <- investment_analysis %>%
  arrange(roi_years) %>%
  mutate(
    cumulative_investment = cumsum(avg_price),
    can_afford = cumulative_investment <= investment_budget
  ) %>%
  filter(can_afford) %>%
  summarise(
    total_properties = n(),
    total_investment = sum(avg_price),
    remaining_budget = investment_budget - sum(avg_price),
    # Payback time averaged over the portfolio, weighted by purchase price
    weighted_avg_roi = sum(roi_years * avg_price) / sum(avg_price),
    total_annual_return = sum(annual_revenue),
    portfolio_return_rate = (total_annual_return / total_investment) * 100
  )

# Display results: heading for the per-neighbourhood table printed next
print("Analysis of Investment by Neighborhood:")
## [1] "Analysis of Investment by Neighborhood:"
# Key figures per neighbourhood, shortest payback time first; n = 10 prints
# all ten rows
investment_analysis %>%
  arrange(roi_years) %>%
  select(
    neighbourhood, avg_price, max_properties, predicted_price,
    pct_year_occupation, annual_revenue, roi_years, return_rate
  ) %>%
  print(n = 10)
## # A tibble: 10 × 8
##    neighbourhood    avg_price max_properties predicted_price pct_year_occupation
##    <chr>                <dbl>          <dbl>           <dbl>               <dbl>
##  1 sant andreu         288534             10            210.                0.6 
##  2 nou barris          201074             14            174.                0.5 
##  3 sants-montjuic      299140             10            178.                0.7 
##  4 ciutat vella        392645              7            161.                0.8 
##  5 horta-guinardo      310891              9            172.                0.55
##  6 sant marti          435215              6            198.                0.6 
##  7 gracia              500411              5            160.                0.7 
##  8 eixample            684012              4            152.                0.75
##  9 les corts           779088              3            171.                0.7 
## 10 sarria-sant ger…    980439              3            137.                0.7 
## # ℹ 3 more variables: annual_revenue <dbl>, roi_years <dbl>, return_rate <dbl>
# cat() instead of print(): print() shows the leading "\n" as an escaped
# literal, whereas cat() emits it as a real line break.
cat("\nOptimal Investment Portfolio:\n")
## 
## Optimal Investment Portfolio:
print(investment_portfolio)
## # A tibble: 1 × 6
##   total_properties total_investment remaining_budget weighted_avg_roi
##              <int>            <dbl>            <dbl>            <dbl>
## 1                7          2427910           572090             8.92
## # ℹ 2 more variables: total_annual_return <dbl>, portfolio_return_rate <dbl>
# Payback time per neighbourhood as horizontal bars; after the flip the
# shortest payback time is at the top
investment_analysis %>%
  ggplot(aes(x = reorder(neighbourhood, -roi_years), y = roi_years)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    x = "Neighborhood",
    y = "Years to recover investment",
    title = "ROI time by neighborhood"
  ) +
  theme_minimal()

# Annual return rate per neighbourhood; after the flip the highest rate is at
# the top
investment_analysis %>%
  ggplot(aes(x = reorder(neighbourhood, return_rate), y = return_rate)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    x = "Neighborhood",
    y = "Annual return rate (%)",
    title = "Annual return rate by neighborhood (%)"
  ) +
  theme_minimal()

# Plain-text investment report: title, budget and the specification of the
# hypothetical property, written to the console with cat().
# The specification lines are hard-coded text; keep them in sync with the
# defaults of create_sample() (4 guests, 2 bedrooms, 2 beds, 1 bathroom,
# cleaning fee of 20, every logical flag set to TRUE).
cat("\n=== Barcelona investment report ===\n")
## 
## === Barcelona investment report ===
cat("\nBudget: €3,000,000\n")
## 
## Budget: €3,000,000
cat("\nProperty Specifications:")
## 
## Property Specifications:
cat("\n- 4 people capacity (2 bedrooms, 2 beds, 1 bathroom)")
## 
## - 4 people capacity (2 bedrooms, 2 beds, 1 bathroom)
cat("\n- €20 cleaning fee")
## 
## - €20 cleaning fee
cat("\n- Perfect ratings and host responsiveness")
## 
## - Perfect ratings and host responsiveness
cat("\n- All services and amenities offered\n")
## 
## - All services and amenities offered
# Heading for the top-three table that follows
cat("\nBest neighborhoods for investment (by ROI):\n")
## 
## Best neighborhoods for investment (by ROI):
# Three neighbourhoods with the shortest payback time, rendered as a
# pipe-format table with the figures formatted as text
investment_analysis %>%
  arrange(roi_years) %>%
  slice_head(n = 3) %>%
  select(neighbourhood, roi_years, return_rate) %>%
  mutate(
    neighbourhood = paste(neighbourhood),
    roi_years = paste(round(roi_years, 2), "years"),
    return_rate = paste(round(return_rate, 2), "%")
  ) %>%
  knitr::kable(
    format = "pipe",
    col.names = c("Neighborhood", "ROI Time", "Annual Return Rate")
  )
Neighborhood ROI Time Annual Return Rate
sant andreu 6.26 years 15.98 %
nou barris 6.32 years 15.81 %
sants-montjuic 6.59 years 15.17 %
cat("\nRecommended investment strategy:\n")
## 
## Recommended investment strategy:
# The first total_properties neighbourhoods by payback time are the ones in
# the selected portfolio; show their cost, revenue and payback time as text
investment_analysis %>%
  arrange(roi_years) %>%
  head(investment_portfolio$total_properties) %>%
  select(neighbourhood, avg_price, annual_revenue, roi_years) %>%
  mutate(
    neighbourhood = paste(neighbourhood),
    avg_price = paste0("€", format(round(avg_price), big.mark = ",")),
    annual_revenue = paste0("€", format(round(annual_revenue), big.mark = ",")),
    roi_years = paste(round(roi_years, 2), "years")
  ) %>%
  knitr::kable(
    format = "pipe",
    col.names = c("Neighborhood", "Property Cost", "Annual Revenue", "ROI Time")
  )
Neighborhood Property Cost Annual Revenue ROI Time
sant andreu €288,534 €46,094 6.26 years
nou barris €201,074 €31,791 6.32 years
sants-montjuic €299,140 €45,373 6.59 years
ciutat vella €392,645 €46,878 8.38 years
horta-guinardo €310,891 €34,445 9.03 years
sant marti €435,215 €43,305 10.05 years
gracia €500,411 €40,788 12.27 years
# Portfolio totals: each line is written as label + formatted value, with the
# separator disabled so the spacing is spelled out in the label itself
cat("\nTotal Investment: €", format(round(investment_portfolio$total_investment), big.mark = ","), sep = "")
## 
## Total Investment: €2,427,910
cat("\nRemaining Budget: €", format(round(investment_portfolio$remaining_budget), big.mark = ","), sep = "")
## 
## Remaining Budget: €572,090
cat("\nTotal Annual Return: €", format(round(investment_portfolio$total_annual_return), big.mark = ","), sep = "")
## 
## Total Annual Return: €288,673
cat("\nPortfolio Return Rate: ", as.character(round(investment_portfolio$portfolio_return_rate, 2)), "%", sep = "")
## 
## Portfolio Return Rate: 11.89%
cat("\nWeighted Average ROI Time: ", as.character(round(investment_portfolio$weighted_avg_roi, 2)), " years", sep = "")
## 
## Weighted Average ROI Time: 8.92 years