library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.1
## ✔ dials 1.4.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(xgboost)
##
## Adjuntando el paquete: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(ggplot2)
library(readr)
##
## Adjuntando el paquete: 'readr'
## The following object is masked from 'package:yardstick':
##
## spec
## The following object is masked from 'package:scales':
##
## col_factor
library(vip)
##
## Adjuntando el paquete: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Neighbourhood-level reference table, one row per neighbourhood. The names
# appear to use the same lower-case spelling as `neighbourhood_group_cleansed`
# in the listings data.
# NOTE(review): units are not stated in this file -- avg_price_m2 / avg_price
# look like purchase prices (per square metre / per dwelling) and
# pct_year_occupation like a 0-1 share of the year; confirm with the source.
avg_prices <- data.frame(
  neighbourhood = c(
    "eixample", "ciutat vella", "sant marti", "sants-montjuic",
    "sarria-sant gervasi", "nou barris", "horta-guinardo", "gracia",
    "sant andreu", "les corts"
  ),
  avg_price_m2 = c(
    5881, 4751, 4728, 4220, 6242,
    2610, 3872, 5153, 3624, 5613
  ),
  avg_price = c(
    684012, 392645, 435215, 299140, 980439,
    201074, 310891, 500411, 288534, 779088
  ),
  latitude = c(
    41.389887, 41.382183, 41.412146, 41.374394, 41.402357,
    41.437541, 41.423826, 41.406744, 41.432717, 41.387295
  ),
  longitude = c(
    2.161808, 2.176437, 2.204667, 2.140377, 2.134925,
    2.175310, 2.161701, 2.158146, 2.189304, 2.126046
  ),
  pct_year_occupation = c(
    0.75, 0.8, 0.6, 0.7, 0.7,
    0.5, 0.55, 0.7, 0.6, 0.7
  )
)
# Read the cleaned listings table; column types are guessed by readr.
airbnb_clean <- read_csv(
  file = "C:/Users/Edgar/Desktop/Challenge_2/airbnb_clean.csv"
)
## Rows: 19833 Columns: 113
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): host_response_time, neighbourhood_group_cleansed, property_type, ...
## dbl (23): host_id, host_response_rate, host_listings_count, latitude, longi...
## lgl (83): host_is_superhost, host_has_profile_pic, host_identity_verified, ...
## date (3): host_since, first_review, last_review
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the compact structure (type and leading values) of every column.
airbnb_clean %>% str()
## spc_tbl_ [19,833 × 113] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ host_id : num [1:19833] 71615 71615 82522 90417 108310 ...
## $ host_since : Date[1:19833], format: "2010-01-19" "2010-01-19" ...
## $ host_response_time : chr [1:19833] "within an hour" "within an hour" "within a few hours" "within an hour" ...
## $ host_response_rate : num [1:19833] 99 99 100 100 100 100 100 92 92 100 ...
## $ host_is_superhost : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
## $ host_listings_count : num [1:19833] 45 45 2 5 1 9 9 41 41 1 ...
## $ host_has_profile_pic : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ host_identity_verified : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ neighbourhood_group_cleansed : chr [1:19833] "sant marti" "eixample" "sant marti" "sant marti" ...
## $ latitude : num [1:19833] 41.4 41.4 41.4 41.4 41.4 ...
## $ longitude : num [1:19833] 2.19 2.17 2.2 2.22 2.16 ...
## $ is_location_exact : logi [1:19833] TRUE TRUE TRUE FALSE TRUE TRUE ...
## $ property_type : chr [1:19833] "apartment" "apartment" "apartment" "apartment" ...
## $ room_type : chr [1:19833] "entire home/apt" "entire home/apt" "private room" "entire home/apt" ...
## $ accommodates : num [1:19833] 6 8 2 6 2 2 3 4 5 1 ...
## $ bathrooms : num [1:19833] 1 2 1 2 1 1 1 1 1.5 1 ...
## $ bedrooms : num [1:19833] 2 3 1 3 1 1 1 1 3 1 ...
## $ beds : num [1:19833] 4 6 1 8 1 2 2 1 3 1 ...
## $ price : num [1:19833] 130 60 33 210 45 42 53 75 85 30 ...
## $ cleaning_fee : num [1:19833] 42 50 NA 80 NA NA NA 55 105 0 ...
## $ minimum_nights : num [1:19833] 3 1 2 3 1 3 3 1 1 29 ...
## $ maximum_nights : num [1:19833] 730 1125 1125 1125 730 ...
## $ has_availability : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ number_of_reviews : num [1:19833] 1 15 119 45 241 4 40 174 79 19 ...
## $ first_review : Date[1:19833], format: "2015-10-10" "2013-05-27" ...
## $ last_review : Date[1:19833], format: "2015-10-10" "2019-07-02" ...
## $ review_scores_rating : num [1:19833] 80 87 90 95 95 95 87 92 88 99 ...
## $ review_scores_accuracy : num [1:19833] 10 9 10 10 10 9 9 9 9 10 ...
## $ review_scores_cleanliness : num [1:19833] 10 9 9 10 10 10 9 9 9 10 ...
## $ review_scores_checkin : num [1:19833] 2 10 10 10 10 10 9 8 9 10 ...
## $ review_scores_communication : num [1:19833] 10 10 10 10 10 10 9 9 10 10 ...
## $ review_scores_location : num [1:19833] 10 9 9 9 10 9 8 9 9 9 ...
## $ review_scores_value : num [1:19833] 8 8 9 9 9 9 9 9 9 9 ...
## $ instant_bookable : logi [1:19833] FALSE TRUE FALSE TRUE TRUE FALSE ...
## $ has_verificator__email : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_verificator__phone : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_verificator__reviews : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_verificator__jumio : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ has_verificator__government_id : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ has_verificator__offline_government_id : logi [1:19833] FALSE FALSE TRUE TRUE TRUE FALSE ...
## $ has_verificator__selfie : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
## $ has_verificator__identity_manual : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
## $ has_verificator__facebook : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_verificator__work_email : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_amenity__TV : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ has_amenity__Internet : logi [1:19833] TRUE TRUE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Wifi : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Air.conditioning : logi [1:19833] TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ has_amenity__Kitchen : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Elevator : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ has_amenity__Free.street.parking : logi [1:19833] TRUE TRUE FALSE FALSE FALSE FALSE ...
## $ has_amenity__Heating : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Family.kid.friendly : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Washer : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Dryer : logi [1:19833] TRUE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Essentials : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Shampoo : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Hair.dryer : logi [1:19833] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Hot.water : logi [1:19833] TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ has_amenity__Host.greets.you : logi [1:19833] TRUE TRUE FALSE TRUE FALSE TRUE ...
## $ has_amenity__Paid.parking.on.premises : logi [1:19833] TRUE TRUE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Buzzer.wireless.intercom : logi [1:19833] FALSE TRUE FALSE TRUE FALSE TRUE ...
## $ has_amenity__Hangers : logi [1:19833] FALSE TRUE TRUE TRUE TRUE TRUE ...
## $ has_amenity__Iron : logi [1:19833] FALSE TRUE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Laptop.friendly.workspace : logi [1:19833] FALSE TRUE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Crib : logi [1:19833] FALSE TRUE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Paid.parking.off.premises : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__First.aid.kit : logi [1:19833] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ has_amenity__Self.check.in : logi [1:19833] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ has_amenity__Bed.linens : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Extra.pillows.and.blankets : logi [1:19833] FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ has_amenity__Microwave : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Coffee.maker : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Refrigerator : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Dishwasher : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Dishes.and.silverware : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Cooking.basics : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Oven : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Stove : logi [1:19833] FALSE FALSE TRUE TRUE FALSE TRUE ...
## $ has_amenity__Patio.or.balcony : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Luggage.dropoff.allowed : logi [1:19833] FALSE FALSE TRUE TRUE TRUE TRUE ...
## $ has_amenity__No.stairs.or.steps.to.enter : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Wide.entrance.for.guests : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Well.lit.path.to.entrance : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Wide.entryway : logi [1:19833] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ has_amenity__Smoke.detector : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Carbon.monoxide.detector : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Fire.extinguisher : logi [1:19833] FALSE FALSE FALSE TRUE TRUE FALSE ...
## $ has_amenity__High.chair : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Pack..n.Play.travel.crib : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Long.term.stays.allowed : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Wide.hallways : logi [1:19833] FALSE FALSE FALSE TRUE FALSE FALSE ...
## $ has_amenity__Smoking.allowed : logi [1:19833] FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ has_amenity__Lock.on.bedroom.door : logi [1:19833] FALSE FALSE FALSE FALSE TRUE TRUE ...
## $ has_amenity__translation.missing..en.hosting_amenity_50: logi [1:19833] FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ has_amenity__Private.living.room : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_amenity__Cable.TV : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_amenity__Safety.card : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_amenity__24.hour.check.in : logi [1:19833] FALSE FALSE FALSE FALSE FALSE FALSE ...
## [list output truncated]
## - attr(*, "spec")=
## .. cols(
## .. host_id = col_double(),
## .. host_since = col_date(format = ""),
## .. host_response_time = col_character(),
## .. host_response_rate = col_double(),
## .. host_is_superhost = col_logical(),
## .. host_listings_count = col_double(),
## .. host_has_profile_pic = col_logical(),
## .. host_identity_verified = col_logical(),
## .. neighbourhood_group_cleansed = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. is_location_exact = col_logical(),
## .. property_type = col_character(),
## .. room_type = col_character(),
## .. accommodates = col_double(),
## .. bathrooms = col_double(),
## .. bedrooms = col_double(),
## .. beds = col_double(),
## .. price = col_double(),
## .. cleaning_fee = col_double(),
## .. minimum_nights = col_double(),
## .. maximum_nights = col_double(),
## .. has_availability = col_logical(),
## .. number_of_reviews = col_double(),
## .. first_review = col_date(format = ""),
## .. last_review = col_date(format = ""),
## .. review_scores_rating = col_double(),
## .. review_scores_accuracy = col_double(),
## .. review_scores_cleanliness = col_double(),
## .. review_scores_checkin = col_double(),
## .. review_scores_communication = col_double(),
## .. review_scores_location = col_double(),
## .. review_scores_value = col_double(),
## .. instant_bookable = col_logical(),
## .. has_verificator__email = col_logical(),
## .. has_verificator__phone = col_logical(),
## .. has_verificator__reviews = col_logical(),
## .. has_verificator__jumio = col_logical(),
## .. has_verificator__government_id = col_logical(),
## .. has_verificator__offline_government_id = col_logical(),
## .. has_verificator__selfie = col_logical(),
## .. has_verificator__identity_manual = col_logical(),
## .. has_verificator__facebook = col_logical(),
## .. has_verificator__work_email = col_logical(),
## .. has_amenity__TV = col_logical(),
## .. has_amenity__Internet = col_logical(),
## .. has_amenity__Wifi = col_logical(),
## .. has_amenity__Air.conditioning = col_logical(),
## .. has_amenity__Kitchen = col_logical(),
## .. has_amenity__Elevator = col_logical(),
## .. has_amenity__Free.street.parking = col_logical(),
## .. has_amenity__Heating = col_logical(),
## .. has_amenity__Family.kid.friendly = col_logical(),
## .. has_amenity__Washer = col_logical(),
## .. has_amenity__Dryer = col_logical(),
## .. has_amenity__Essentials = col_logical(),
## .. has_amenity__Shampoo = col_logical(),
## .. has_amenity__Hair.dryer = col_logical(),
## .. has_amenity__Hot.water = col_logical(),
## .. has_amenity__Host.greets.you = col_logical(),
## .. has_amenity__Paid.parking.on.premises = col_logical(),
## .. has_amenity__Buzzer.wireless.intercom = col_logical(),
## .. has_amenity__Hangers = col_logical(),
## .. has_amenity__Iron = col_logical(),
## .. has_amenity__Laptop.friendly.workspace = col_logical(),
## .. has_amenity__Crib = col_logical(),
## .. has_amenity__Paid.parking.off.premises = col_logical(),
## .. has_amenity__First.aid.kit = col_logical(),
## .. has_amenity__Self.check.in = col_logical(),
## .. has_amenity__Bed.linens = col_logical(),
## .. has_amenity__Extra.pillows.and.blankets = col_logical(),
## .. has_amenity__Microwave = col_logical(),
## .. has_amenity__Coffee.maker = col_logical(),
## .. has_amenity__Refrigerator = col_logical(),
## .. has_amenity__Dishwasher = col_logical(),
## .. has_amenity__Dishes.and.silverware = col_logical(),
## .. has_amenity__Cooking.basics = col_logical(),
## .. has_amenity__Oven = col_logical(),
## .. has_amenity__Stove = col_logical(),
## .. has_amenity__Patio.or.balcony = col_logical(),
## .. has_amenity__Luggage.dropoff.allowed = col_logical(),
## .. has_amenity__No.stairs.or.steps.to.enter = col_logical(),
## .. has_amenity__Wide.entrance.for.guests = col_logical(),
## .. has_amenity__Well.lit.path.to.entrance = col_logical(),
## .. has_amenity__Wide.entryway = col_logical(),
## .. has_amenity__Smoke.detector = col_logical(),
## .. has_amenity__Carbon.monoxide.detector = col_logical(),
## .. has_amenity__Fire.extinguisher = col_logical(),
## .. has_amenity__High.chair = col_logical(),
## .. has_amenity__Pack..n.Play.travel.crib = col_logical(),
## .. has_amenity__Long.term.stays.allowed = col_logical(),
## .. has_amenity__Wide.hallways = col_logical(),
## .. has_amenity__Smoking.allowed = col_logical(),
## .. has_amenity__Lock.on.bedroom.door = col_logical(),
## .. has_amenity__translation.missing..en.hosting_amenity_50 = col_logical(),
## .. has_amenity__Private.living.room = col_logical(),
## .. has_amenity__Cable.TV = col_logical(),
## .. has_amenity__Safety.card = col_logical(),
## .. has_amenity__24.hour.check.in = col_logical(),
## .. has_amenity__Private.entrance = col_logical(),
## .. has_amenity__Breakfast = col_logical(),
## .. has_amenity__translation.missing..en.hosting_amenity_49 = col_logical(),
## .. has_amenity__Room.darkening.shades = col_logical(),
## .. has_amenity__Pets.allowed = col_logical(),
## .. has_amenity__Pocket.wifi = col_logical(),
## .. has_amenity__Extra.space.around.bed = col_logical(),
## .. has_amenity__Accessible.height.bed = col_logical(),
## .. has_amenity__Bathtub = col_logical(),
## .. has_amenity__Wide.entrance = col_logical(),
## .. has_amenity__.toilet = col_logical(),
## .. has_amenity__Ethernet.connection = col_logical(),
## .. log_price = col_double(),
## .. n_NA_cols = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Transposed preview: one line per column with its type and first values.
airbnb_clean %>% glimpse()
## Rows: 19,833
## Columns: 113
## $ host_id <dbl> 71615, 71615, …
## $ host_since <date> 2010-01-19, 2…
## $ host_response_time <chr> "within an hou…
## $ host_response_rate <dbl> 99, 99, 100, 1…
## $ host_is_superhost <lgl> FALSE, FALSE, …
## $ host_listings_count <dbl> 45, 45, 2, 5, …
## $ host_has_profile_pic <lgl> TRUE, TRUE, TR…
## $ host_identity_verified <lgl> TRUE, TRUE, TR…
## $ neighbourhood_group_cleansed <chr> "sant marti", …
## $ latitude <dbl> 41.40889, 41.4…
## $ longitude <dbl> 2.18555, 2.173…
## $ is_location_exact <lgl> TRUE, TRUE, TR…
## $ property_type <chr> "apartment", "…
## $ room_type <chr> "entire home/a…
## $ accommodates <dbl> 6, 8, 2, 6, 2,…
## $ bathrooms <dbl> 1.0, 2.0, 1.0,…
## $ bedrooms <dbl> 2, 3, 1, 3, 1,…
## $ beds <dbl> 4, 6, 1, 8, 1,…
## $ price <dbl> 130, 60, 33, 2…
## $ cleaning_fee <dbl> 42, 50, NA, 80…
## $ minimum_nights <dbl> 3, 1, 2, 3, 1,…
## $ maximum_nights <dbl> 730, 1125, 112…
## $ has_availability <lgl> TRUE, TRUE, TR…
## $ number_of_reviews <dbl> 1, 15, 119, 45…
## $ first_review <date> 2015-10-10, 2…
## $ last_review <date> 2015-10-10, 2…
## $ review_scores_rating <dbl> 80, 87, 90, 95…
## $ review_scores_accuracy <dbl> 10, 9, 10, 10,…
## $ review_scores_cleanliness <dbl> 10, 9, 9, 10, …
## $ review_scores_checkin <dbl> 2, 10, 10, 10,…
## $ review_scores_communication <dbl> 10, 10, 10, 10…
## $ review_scores_location <dbl> 10, 9, 9, 9, 1…
## $ review_scores_value <dbl> 8, 8, 9, 9, 9,…
## $ instant_bookable <lgl> FALSE, TRUE, F…
## $ has_verificator__email <lgl> TRUE, TRUE, TR…
## $ has_verificator__phone <lgl> TRUE, TRUE, TR…
## $ has_verificator__reviews <lgl> TRUE, TRUE, TR…
## $ has_verificator__jumio <lgl> TRUE, TRUE, TR…
## $ has_verificator__government_id <lgl> TRUE, TRUE, TR…
## $ has_verificator__offline_government_id <lgl> FALSE, FALSE, …
## $ has_verificator__selfie <lgl> FALSE, FALSE, …
## $ has_verificator__identity_manual <lgl> FALSE, FALSE, …
## $ has_verificator__facebook <lgl> FALSE, FALSE, …
## $ has_verificator__work_email <lgl> FALSE, FALSE, …
## $ has_amenity__TV <lgl> TRUE, TRUE, TR…
## $ has_amenity__Internet <lgl> TRUE, TRUE, FA…
## $ has_amenity__Wifi <lgl> TRUE, TRUE, TR…
## $ has_amenity__Air.conditioning <lgl> TRUE, TRUE, FA…
## $ has_amenity__Kitchen <lgl> TRUE, TRUE, TR…
## $ has_amenity__Elevator <lgl> TRUE, TRUE, TR…
## $ has_amenity__Free.street.parking <lgl> TRUE, TRUE, FA…
## $ has_amenity__Heating <lgl> TRUE, TRUE, TR…
## $ has_amenity__Family.kid.friendly <lgl> TRUE, TRUE, TR…
## $ has_amenity__Washer <lgl> TRUE, TRUE, TR…
## $ has_amenity__Dryer <lgl> TRUE, FALSE, T…
## $ has_amenity__Essentials <lgl> TRUE, TRUE, TR…
## $ has_amenity__Shampoo <lgl> TRUE, TRUE, TR…
## $ has_amenity__Hair.dryer <lgl> TRUE, TRUE, TR…
## $ has_amenity__Hot.water <lgl> TRUE, TRUE, TR…
## $ has_amenity__Host.greets.you <lgl> TRUE, TRUE, FA…
## $ has_amenity__Paid.parking.on.premises <lgl> TRUE, TRUE, TR…
## $ has_amenity__Buzzer.wireless.intercom <lgl> FALSE, TRUE, F…
## $ has_amenity__Hangers <lgl> FALSE, TRUE, T…
## $ has_amenity__Iron <lgl> FALSE, TRUE, T…
## $ has_amenity__Laptop.friendly.workspace <lgl> FALSE, TRUE, T…
## $ has_amenity__Crib <lgl> FALSE, TRUE, F…
## $ has_amenity__Paid.parking.off.premises <lgl> FALSE, FALSE, …
## $ has_amenity__First.aid.kit <lgl> FALSE, FALSE, …
## $ has_amenity__Self.check.in <lgl> FALSE, FALSE, …
## $ has_amenity__Bed.linens <lgl> FALSE, FALSE, …
## $ has_amenity__Extra.pillows.and.blankets <lgl> FALSE, FALSE, …
## $ has_amenity__Microwave <lgl> FALSE, FALSE, …
## $ has_amenity__Coffee.maker <lgl> FALSE, FALSE, …
## $ has_amenity__Refrigerator <lgl> FALSE, FALSE, …
## $ has_amenity__Dishwasher <lgl> FALSE, FALSE, …
## $ has_amenity__Dishes.and.silverware <lgl> FALSE, FALSE, …
## $ has_amenity__Cooking.basics <lgl> FALSE, FALSE, …
## $ has_amenity__Oven <lgl> FALSE, FALSE, …
## $ has_amenity__Stove <lgl> FALSE, FALSE, …
## $ has_amenity__Patio.or.balcony <lgl> FALSE, FALSE, …
## $ has_amenity__Luggage.dropoff.allowed <lgl> FALSE, FALSE, …
## $ has_amenity__No.stairs.or.steps.to.enter <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entrance.for.guests <lgl> FALSE, FALSE, …
## $ has_amenity__Well.lit.path.to.entrance <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entryway <lgl> FALSE, FALSE, …
## $ has_amenity__Smoke.detector <lgl> FALSE, FALSE, …
## $ has_amenity__Carbon.monoxide.detector <lgl> FALSE, FALSE, …
## $ has_amenity__Fire.extinguisher <lgl> FALSE, FALSE, …
## $ has_amenity__High.chair <lgl> FALSE, FALSE, …
## $ has_amenity__Pack..n.Play.travel.crib <lgl> FALSE, FALSE, …
## $ has_amenity__Long.term.stays.allowed <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.hallways <lgl> FALSE, FALSE, …
## $ has_amenity__Smoking.allowed <lgl> FALSE, FALSE, …
## $ has_amenity__Lock.on.bedroom.door <lgl> FALSE, FALSE, …
## $ has_amenity__translation.missing..en.hosting_amenity_50 <lgl> FALSE, FALSE, …
## $ has_amenity__Private.living.room <lgl> FALSE, FALSE, …
## $ has_amenity__Cable.TV <lgl> FALSE, FALSE, …
## $ has_amenity__Safety.card <lgl> FALSE, FALSE, …
## $ has_amenity__24.hour.check.in <lgl> FALSE, FALSE, …
## $ has_amenity__Private.entrance <lgl> FALSE, FALSE, …
## $ has_amenity__Breakfast <lgl> FALSE, FALSE, …
## $ has_amenity__translation.missing..en.hosting_amenity_49 <lgl> FALSE, FALSE, …
## $ has_amenity__Room.darkening.shades <lgl> FALSE, FALSE, …
## $ has_amenity__Pets.allowed <lgl> FALSE, FALSE, …
## $ has_amenity__Pocket.wifi <lgl> FALSE, FALSE, …
## $ has_amenity__Extra.space.around.bed <lgl> FALSE, FALSE, …
## $ has_amenity__Accessible.height.bed <lgl> FALSE, FALSE, …
## $ has_amenity__Bathtub <lgl> FALSE, FALSE, …
## $ has_amenity__Wide.entrance <lgl> FALSE, FALSE, …
## $ has_amenity__.toilet <lgl> FALSE, FALSE, …
## $ has_amenity__Ethernet.connection <lgl> FALSE, FALSE, …
## $ log_price <dbl> 4.867534, 4.09…
## $ n_NA_cols <dbl> 0, 0, 1, 0, 1,…
To begin, we split our dataset into training (80%) and testing (20%)
sets using the initial_split function from tidymodels.
Since our target variable is log_price, we applied
stratification to ensure that both sets maintained a similar price
distribution. We also set a seed (123) to guarantee reproducibility.
# Fix the RNG so the train/test partition is reproducible.
set.seed(123)

# Keep 80% of the listings for training, stratifying on the modelling target.
airbnb_split <- airbnb_clean %>%
  initial_split(prop = 0.8, strata = log_price)

# Materialise the two partitions as data frames.
airbnb_train <- airbnb_split %>% training()
airbnb_test <- airbnb_split %>% testing()
Secondly, we implemented 10-fold cross-validation on the training
data using vfold_cv. We stratified
by log_price to maintain consistency in price distributions
across all validation folds.
# 10-fold cross-validation on the training rows only, stratified on the target.
# No seed is set here, so the fold assignment depends on the RNG state that
# precedes this call.
airbnb_folds <- airbnb_train %>%
  vfold_cv(v = 10, strata = log_price)
For data preprocessing, we designed a recipe that:
- Removed unnecessary columns (e.g., host IDs, date-related features, and review scores)
- Converted the cleaning fee into a binary indicator (1 if a non-zero fee is present, 0 otherwise)
- Imputed missing numerical values using the median
- Standardized numerical predictors for consistent scaling
- Applied one-hot encoding to categorical features
- Removed zero-variance predictors, which do not contribute to model performance
- Converted logical (TRUE/FALSE) predictors to numeric 0/1 values
To evaluate our model, we selected RMSE, MAE, and R-squared as key performance metrics. We then created a workflow that integrated our preprocessing steps with an XGBoost model, allowing us to tune key hyperparameters such as the number of trees, tree depth, minimum node size, and learning rate.
# Recipe
# Preprocessing for the price model: `log_price` is the outcome and every
# column that survives the removal steps below is a predictor.
airbnb_recipe <- recipe(formula = log_price ~ ., data = airbnb_train) %>%
  # Drop identifiers, dates and host-profile fields. Raw `price` is dropped as
  # well; presumably the outcome is derived from it, so keeping it would leak
  # the target.
  step_rm(
    host_id, host_since, first_review, last_review, host_has_profile_pic,
    host_response_time, host_response_rate, host_identity_verified,
    is_location_exact, n_NA_cols, price
  ) %>%
  # Drop every remaining column whose name contains "review_"
  # (the review_scores_* columns; number_of_reviews does not match).
  step_rm(matches("review_")) %>%
  # Collapse cleaning_fee to 0/1: missing or zero fee -> 0, any other fee -> 1.
  step_mutate(
    cleaning_fee = as.numeric(
      ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1)
    )
  ) %>%
  step_impute_median(all_numeric_predictors()) %>%
  # NOTE: this also centres and scales the 0/1 cleaning_fee indicator above.
  step_normalize(all_numeric_predictors()) %>%
  # Reserve a level for categories not seen at training time, then one-hot
  # encode the nominal predictors.
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors()) %>%
  # Logical flags are converted last, so they are neither imputed nor scaled.
  step_mutate(across(where(is.logical), as.numeric))

# Estimate the recipe on the training data.
airbnb_prep <- prep(airbnb_recipe, training = airbnb_train)

# Inspect the processed training set. `bake(new_data = NULL)` is the current
# replacement for the superseded `juice()` and returns the same data.
bake(airbnb_prep, new_data = NULL) %>% glimpse()
## Rows: 15,865
## Columns: 131
## $ host_is_superhost <dbl> 0, 0, 1, 1, 0,…
## $ host_listings_count <dbl> -0.3229847, -0…
## $ latitude <dbl> 0.90858229, -0…
## $ longitude <dbl> 1.70223211, -1…
## $ accommodates <dbl> -0.6115208, -1…
## $ bathrooms <dbl> -0.4940825, -0…
## $ bedrooms <dbl> -0.5595624, -0…
## $ beds <dbl> -0.6743047, -0…
## $ cleaning_fee <dbl> -1.4661367, -1…
## $ minimum_nights <dbl> -0.38691601, 1…
## $ maximum_nights <dbl> -0.01121042, -…
## $ number_of_reviews <dbl> 1.4918692, -0.…
## $ instant_bookable <dbl> 0, 0, 1, 1, 0,…
## $ has_verificator__email <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__phone <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__reviews <dbl> 1, 1, 1, 1, 1,…
## $ has_verificator__jumio <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__government_id <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__offline_government_id <dbl> 1, 0, 1, 1, 1,…
## $ has_verificator__selfie <dbl> 0, 0, 0, 0, 1,…
## $ has_verificator__identity_manual <dbl> 0, 0, 0, 0, 1,…
## $ has_verificator__facebook <dbl> 0, 1, 0, 0, 0,…
## $ has_verificator__work_email <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__TV <dbl> 1, 0, 1, 1, 0,…
## $ has_amenity__Internet <dbl> 0, 1, 1, 1, 1,…
## $ has_amenity__Wifi <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Air.conditioning <dbl> 0, 1, 0, 0, 0,…
## $ has_amenity__Kitchen <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Elevator <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Free.street.parking <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Heating <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Family.kid.friendly <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Washer <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Dryer <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Essentials <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Shampoo <dbl> 1, 0, 1, 1, 0,…
## $ has_amenity__Hair.dryer <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Hot.water <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Host.greets.you <dbl> 0, 0, 1, 1, 1,…
## $ has_amenity__Paid.parking.on.premises <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Buzzer.wireless.intercom <dbl> 0, 1, 1, 0, 1,…
## $ has_amenity__Hangers <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Iron <dbl> 1, 0, 1, 1, 1,…
## $ has_amenity__Laptop.friendly.workspace <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Crib <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Paid.parking.off.premises <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__First.aid.kit <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Self.check.in <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Bed.linens <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Extra.pillows.and.blankets <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Microwave <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Coffee.maker <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Refrigerator <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Dishwasher <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Dishes.and.silverware <dbl> 1, 1, 1, 1, 1,…
## $ has_amenity__Cooking.basics <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Oven <dbl> 1, 1, 0, 0, 0,…
## $ has_amenity__Stove <dbl> 1, 1, 1, 1, 0,…
## $ has_amenity__Patio.or.balcony <dbl> 1, 1, 0, 0, 1,…
## $ has_amenity__Luggage.dropoff.allowed <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__No.stairs.or.steps.to.enter <dbl> 1, 0, 0, 0, 1,…
## $ has_amenity__Wide.entrance.for.guests <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Well.lit.path.to.entrance <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Wide.entryway <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__Smoke.detector <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Carbon.monoxide.detector <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Fire.extinguisher <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__High.chair <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Pack..n.Play.travel.crib <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Long.term.stays.allowed <dbl> 0, 1, 0, 0, 1,…
## $ has_amenity__Wide.hallways <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Smoking.allowed <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Lock.on.bedroom.door <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__translation.missing..en.hosting_amenity_50 <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__Private.living.room <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Cable.TV <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Safety.card <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__24.hour.check.in <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Private.entrance <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Breakfast <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__translation.missing..en.hosting_amenity_49 <dbl> 0, 0, 1, 1, 0,…
## $ has_amenity__Room.darkening.shades <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Pets.allowed <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Pocket.wifi <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Extra.space.around.bed <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Accessible.height.bed <dbl> 0, 0, 0, 0, 1,…
## $ has_amenity__Bathtub <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Wide.entrance <dbl> 1, 0, 0, 0, 0,…
## $ has_amenity__.toilet <dbl> 0, 0, 0, 0, 0,…
## $ has_amenity__Ethernet.connection <dbl> 0, 0, 0, 0, 0,…
## $ log_price <dbl> 3.496508, 3.40…
## $ neighbourhood_group_cleansed_ciutat.vella <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_eixample <dbl> 0, 0, 1, 1, 0,…
## $ neighbourhood_group_cleansed_gracia <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_horta.guinardo <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_les.corts <dbl> 0, 1, 0, 0, 0,…
## $ neighbourhood_group_cleansed_nou.barris <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sant.andreu <dbl> 0, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sant.marti <dbl> 1, 0, 0, 0, 0,…
## $ neighbourhood_group_cleansed_sants.montjuic <dbl> 0, 0, 0, 0, 1,…
## $ neighbourhood_group_cleansed_sarria.sant.gervasi <dbl> 0, 0, 0, 0, 0,…
## $ property_type_aparthotel <dbl> 0, 0, 0, 0, 0,…
## $ property_type_apartment <dbl> 1, 1, 1, 1, 1,…
## $ property_type_barn <dbl> 0, 0, 0, 0, 0,…
## $ property_type_bed.and.breakfast <dbl> 0, 0, 0, 0, 0,…
## $ property_type_boat <dbl> 0, 0, 0, 0, 0,…
## $ property_type_boutique.hotel <dbl> 0, 0, 0, 0, 0,…
## $ property_type_cabin <dbl> 0, 0, 0, 0, 0,…
## $ property_type_camper.rv <dbl> 0, 0, 0, 0, 0,…
## $ property_type_casa.particular..cuba. <dbl> 0, 0, 0, 0, 0,…
## $ property_type_chalet <dbl> 0, 0, 0, 0, 0,…
## $ property_type_condominium <dbl> 0, 0, 0, 0, 0,…
## $ property_type_cottage <dbl> 0, 0, 0, 0, 0,…
## $ property_type_dome.house <dbl> 0, 0, 0, 0, 0,…
## $ property_type_earth.house <dbl> 0, 0, 0, 0, 0,…
## $ property_type_farm.stay <dbl> 0, 0, 0, 0, 0,…
## $ property_type_guest.suite <dbl> 0, 0, 0, 0, 0,…
## $ property_type_guesthouse <dbl> 0, 0, 0, 0, 0,…
## $ property_type_hostel <dbl> 0, 0, 0, 0, 0,…
## $ property_type_hotel <dbl> 0, 0, 0, 0, 0,…
## $ property_type_house <dbl> 0, 0, 0, 0, 0,…
## $ property_type_loft <dbl> 0, 0, 0, 0, 0,…
## $ property_type_nature.lodge <dbl> 0, 0, 0, 0, 0,…
## $ property_type_other <dbl> 0, 0, 0, 0, 0,…
## $ property_type_serviced.apartment <dbl> 0, 0, 0, 0, 0,…
## $ property_type_tiny.house <dbl> 0, 0, 0, 0, 0,…
## $ property_type_townhouse <dbl> 0, 0, 0, 0, 0,…
## $ property_type_villa <dbl> 0, 0, 0, 0, 0,…
## $ room_type_entire.home.apt <dbl> 0, 0, 0, 0, 0,…
## $ room_type_private.room <dbl> 1, 1, 1, 1, 1,…
## $ room_type_shared.room <dbl> 0, 0, 0, 0, 0,…
# Metrics
# Regression metrics requested for tuning: RMSE, MAE and R-squared.
airbnb_metrics <- metric_set(rmse, mae, rsq)

# Workflow
# Boosted-tree regression on the xgboost engine; the four arguments set to
# tune() are placeholders to be filled in by the hyperparameter search.
airbnb_model <- boost_tree(
  mode = "regression",
  engine = "xgboost",
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
)

# Bundle the recipe and the model specification into a single workflow.
airbnb_wf <- workflow(preprocessor = airbnb_recipe, spec = airbnb_model)
We generated a random grid of five different hyperparameter
combinations and used tune_grid to evaluate them across our
10-fold cross-validation splits. This process was computationally
intensive but necessary to optimize model performance. We identified the
best configuration based on RMSE and then finalized our workflow,
fitting it on the full training data set.
#grid <- grid_random(hardhat::extract_parameter_set_dials(airbnb_model),
# size = 5)
#
#system.time({
# tune_results <- tune_grid(
# airbnb_wf,
# resamples = airbnb_folds,
# grid = grid,
# metrics = airbnb_metrics,
# control = control_grid(save_pred = TRUE, verbose = TRUE)
# )
#})
#save(tune_results, file = 'C:/Users/Edgar/Desktop/Challenge_2/tune_results_grid.RData')
# Load previously saved artifacts so the (commented-out) grid search above does
# not have to be run again. tune_results_grid.RData is the file written by the
# save(tune_results, ...) call in that commented-out code.
# airbnb_predictor.RData is the file written by save(airbnb_predictor, ...)
# further down; `airbnb_predictor` is re-created below before it is used, so
# this first load() only matters when the later steps are skipped.
load('C:/Users/Edgar/Desktop/Challenge_2/airbnb_predictor.RData')
load('C:/Users/Edgar/Desktop/Challenge_2/tune_results_grid.RData')
Once trained, we extracted our final workflow as
the airbnb_predictor object, which includes both the
pre-processing steps and the trained model. This ensures that future
predictions can be made without reapplying transformations
separately.
# Hyperparameter combination with the lowest cross-validated RMSE
best_params <- select_best(tune_results, metric = "rmse")
# Plug the winning values into the workflow's tune() placeholders
final_wf <- finalize_workflow(airbnb_wf, best_params)
# last_fit() fits on the training portion of the split and evaluates on the
# held-out portion
final_fit <- last_fit(final_wf, split = airbnb_split)
# The fitted workflow (recipe + model) is what we reuse for prediction
airbnb_predictor <- extract_workflow(final_fit)
save(airbnb_predictor, file = 'C:/Users/Edgar/Desktop/Challenge_2/airbnb_predictor.RData')
# Variable importance of the ten most influential features; airbnb_predictor
# is the same fitted workflow that is stored in final_fit$.workflow[[1]]
airbnb_predictor %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)
We used our model to predict prices and visualized the results to assess accuracy. A scatter plot comparing actual vs. predicted log prices showed strong alignment along the diagonal, indicating good predictive performance. After converting predictions back to actual prices, some expected variance appeared at higher price points, which is typical for models using log-transformed targets.
# Predict the log price of every listing and keep the predictions next to the
# original columns (.pred comes first, followed by all airbnb_clean columns).
airbnb_predictions <- predict(airbnb_predictor, new_data = airbnb_clean) %>%
  bind_cols(airbnb_clean)
# The summary is stored under its own name: assigning it to `airbnb_metrics`
# would overwrite the metric_set() of the same name that tune_grid() needs,
# breaking any re-run of the tuning step.
# NOTE(review): these metrics are computed on all of airbnb_clean, which
# presumably includes the rows the model was trained on, so they are likely
# optimistic; collect_metrics(final_fit) reports the held-out performance.
airbnb_prediction_metrics <- airbnb_predictions %>%
  metrics(truth = log_price, estimate = .pred)
airbnb_prediction_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.186
## 2 rsq standard 0.953
## 3 mae standard 0.0660
# Persist the predictions so later analyses can reuse them
save(airbnb_predictions, file = 'C:/Users/Edgar/Desktop/Challenge_2/airbnb_predictions.RData')
# Actual vs. predicted on the log scale; the dashed red diagonal marks a
# perfect prediction
airbnb_predictions %>%
  ggplot(aes(x = log_price, y = .pred)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  ggtitle("Actual vs. Predicted Log Prices") +
  xlab("Actual Log Price") +
  ylab("Predicted Log Price") +
  theme_minimal()
# Same comparison on the original price scale: undo the log transform
predictions_converted <- airbnb_predictions %>%
  mutate(predicted_price = exp(.pred)) %>%
  mutate(actual_price = exp(log_price))
predictions_converted %>%
  rsq(truth = actual_price, estimate = predicted_price)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.931
# Zoom to the 0-1000 range on both axes without dropping the points outside it
predictions_converted %>%
  ggplot(aes(x = actual_price, y = predicted_price)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  ggtitle("Actual vs. Predicted Prices") +
  xlab("Actual Price") +
  ylab("Predicted Price") +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 1000)) +
  theme_minimal()
To explore potential spatial patterns in prediction errors, we created an interactive Leaflet map. Data points were colored based on the difference between actual and predicted log prices, helping to identify neighborhoods where the model performed better or worse. While we did not conduct a deep dive into these patterns, the visualization provided valuable insights into geographic variations.
library(dplyr)
library(leaflet)
library(mapview)
# airbnb_predictions already carries every airbnb_clean column next to .pred
# (they were column-bound when the predictions were made), so the columns
# needed for the map can be selected directly; joining airbnb_clean back on a
# row number only re-attached the same values.
map_data <- airbnb_predictions %>%
  mutate(row_id = row_number()) %>%
  select(
    row_id, .pred,
    longitude, latitude, neighbourhood_group_cleansed, log_price
  ) %>%
  # Positive values mean the model over-predicts the (log) price
  mutate(price_diff = .pred - log_price)
# Continuous colour scale spanning the observed prediction errors
pal <- colorNumeric(palette = "RdYlBu", domain = map_data$price_diff)
# Create interactive map of prediction errors for all listings.
# log_price, .pred and price_diff are all on the log scale, so the popup and
# legend say so explicitly instead of presenting them as plain prices.
map <- leaflet(map_data) %>%
  addTiles() %>%
  addCircleMarkers(
    ~longitude, ~latitude,
    color = ~pal(price_diff),
    radius = 0.5,
    fillOpacity = 0.9,
    popup = ~paste0(
      "Actual Log Price: ", log_price, "<br>",
      "Predicted Log Price: ", .pred, "<br>",
      "Difference (predicted - actual): ", price_diff
    )
  ) %>%
  addLegend(
    pal = pal, values = ~price_diff, title = "Log Price Difference",
    position = "bottomright"
  )
map
# NOTE(review): this file name uses the British spelling "neighbourhood_maps",
# while the per-neighborhood list is saved to "neighborhood_maps.RData"; the
# two files are distinct but easy to confuse.
save(map, file = "C:/Users/Edgar/Desktop/Challenge_2/neighbourhood_maps.RData")
# Create a list of unique neighborhoods
neighborhoods <- unique(map_data$neighbourhood_group_cleansed)

#' Build a leaflet map of prediction errors for one neighborhood
#'
#' Reads the global `map_data` and keeps the rows whose
#' `neighbourhood_group_cleansed` equals `neighborhood`. The colour scale is
#' fitted to that neighborhood's own `price_diff` values, so colours are not
#' comparable across neighborhoods.
#'
#' @param neighborhood Value of `neighbourhood_group_cleansed` to plot.
#' @return A leaflet map widget.
create_neighborhood_map <- function(neighborhood) {
  # Subset the data for this neighborhood
  neighborhood_data <- map_data %>%
    filter(neighbourhood_group_cleansed == neighborhood)
  pal <- colorNumeric(palette = "RdYlBu", domain = neighborhood_data$price_diff)
  # All displayed values are on the log scale, so the labels say so
  map <- leaflet(neighborhood_data) %>%
    addTiles() %>%
    addCircleMarkers(
      ~longitude, ~latitude,
      color = ~pal(price_diff),
      radius = 5,
      fillOpacity = 0.7,
      popup = ~paste0(
        "Actual Log Price: ", log_price, "<br>",
        "Predicted Log Price: ", .pred, "<br>",
        "Difference (predicted - actual): ", price_diff
      )
    ) %>%
    addLegend(
      pal = pal,
      values = ~price_diff,
      # The legend title is rendered as HTML, so the line break must be <br>;
      # a "\n" is not shown as a line break
      title = paste0("Log Price Difference<br>", neighborhood),
      position = "bottomright"
    )
  map
}
# One map per neighborhood, stored in a list named after the neighborhoods
neighborhood_maps <- lapply(neighborhoods, create_neighborhood_map)
names(neighborhood_maps) <- neighborhoods
# Print every map in turn
invisible(lapply(neighborhood_maps, print))
save(neighborhood_maps, file = 'C:/Users/Edgar/Desktop/Challenge_2/neighborhood_maps.RData')
In the final stage, we applied our model to analyze the best neighborhoods for investment in Barcelona, assuming a €3 million budget. Our approach involved:
Generating standardized sample listings for each neighborhood.
Using our model to predict rental prices.
Calculating key investment indicators, including ROI time and annual returns.
The analysis revealed that the most favorable investment locations were:
Sant Andreu: Fastest ROI at 6.26 years, €46,094 annual revenue
Nou Barris: Close second with 6.32 years ROI, €31,791 annual revenue
Sants-Montjuïc: Ranked third at 6.59 years ROI, €45,373 annual revenue
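Our investment strategy allocated €2,427,910 to one property in each of the seven neighborhoods with the shortest ROI time, leaving €572,090 in reserve. This portfolio would generate an estimated €288,673 in annual returns, yielding an 11.89% return rate, with a weighted average ROI time of 8.92 years. Because the template listing used to build the sample properties is drawn at random, the exact figures vary from run to run.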
A key takeaway from this analysis is that central, tourist-heavy neighborhoods did not necessarily offer the best investment opportunities. While these areas had higher rental prices, their significantly higher acquisition costs led to lower returns. This challenges the common assumption that the most popular districts always provide the highest profitability.
# Create samples code

#' Build a hypothetical listing from an existing row
#'
#' Takes row `origin_sample` of `df` as a template, overwrites the columns
#' listed below with the supplied values and sets every logical column to
#' `TRUE`. Columns that are neither overwritten nor logical keep the values of
#' the template row.
#'
#' @param df Data frame of listings (defaults to the global `airbnb_clean`).
#' @param df_prices Data frame with one row per neighbourhood and the columns
#'   `neighbourhood`, `latitude` and `longitude` (defaults to the global
#'   `avg_prices`).
#' @param origin_sample Row index of `df` used as the template.
#' @param neighbourhood_new Neighbourhood of the new listing. Must appear
#'   exactly once in `df_prices$neighbourhood` unless both coordinates are
#'   supplied.
#' @param latitude_new,longitude_new Optional coordinates. When omitted, the
#'   coordinates stored in `df_prices` for `neighbourhood_new` are used.
#' @param host_response_time_new,host_response_rate_new,host_listings_count_new,property_type_new,room_type_new,accommodates_new,bathrooms_new,bedrooms_new,beds_new,cleaning_fee_new,minimum_nights_new,maximum_nights_new,review_scores_rating_new,review_scores_accuracy_new,review_scores_cleanliness_new,review_scores_checkin_new,review_scores_communication_new,review_scores_location_new,review_scores_value_new
#'   Values written to the column of the same name without the `_new` suffix.
#' @return The modified template row(s) as a data frame.
create_sample <- function(
    df = airbnb_clean,
    df_prices = avg_prices,
    origin_sample = 1,
    host_response_time_new = 'within an hour',
    host_response_rate_new = 100,
    host_listings_count_new = 1,
    neighbourhood_new,
    latitude_new,
    longitude_new,
    property_type_new = 'apartment',
    room_type_new = 'entire home/apt',
    accommodates_new = 4,
    bathrooms_new = 1,
    bedrooms_new = 2,
    beds_new = 2,
    cleaning_fee_new = 20,
    minimum_nights_new = 1,
    maximum_nights_new = 30,
    review_scores_rating_new = 100,
    review_scores_accuracy_new = 10,
    review_scores_cleanliness_new = 10,
    review_scores_checkin_new = 10,
    review_scores_communication_new = 10,
    review_scores_location_new = 10,
    review_scores_value_new = 10
) {
  # NOTE(review): require() is kept because it attaches the tidyverse for the
  # rest of the session; library(tidyverse) at the top of the script would be
  # the cleaner place for this.
  require(tidyverse)

  # Fall back to the neighbourhood's reference coordinates only for the
  # coordinates the caller did not supply.
  if (missing(latitude_new) || missing(longitude_new)) {
    price_row <- which(df_prices$neighbourhood == neighbourhood_new)
    if (length(price_row) != 1L) {
      stop(
        "`neighbourhood_new` must match exactly one row of ",
        "`df_prices$neighbourhood`; '", neighbourhood_new, "' matched ",
        length(price_row), ".",
        call. = FALSE
      )
    }
    if (missing(latitude_new)) {
      latitude_new <- df_prices$latitude[price_row]
    }
    if (missing(longitude_new)) {
      longitude_new <- df_prices$longitude[price_row]
    }
  }

  final_df <- df[origin_sample, ] %>%
    mutate(
      host_response_time = host_response_time_new,
      host_response_rate = host_response_rate_new,
      host_listings_count = host_listings_count_new,
      neighbourhood_group_cleansed = neighbourhood_new,
      latitude = latitude_new,
      longitude = longitude_new,
      property_type = property_type_new,
      room_type = room_type_new,
      accommodates = accommodates_new,
      bathrooms = bathrooms_new,
      bedrooms = bedrooms_new,
      beds = beds_new,
      cleaning_fee = cleaning_fee_new,
      minimum_nights = minimum_nights_new,
      maximum_nights = maximum_nights_new,
      review_scores_rating = review_scores_rating_new,
      review_scores_accuracy = review_scores_accuracy_new,
      review_scores_cleanliness = review_scores_cleanliness_new,
      review_scores_checkin = review_scores_checkin_new,
      review_scores_communication = review_scores_communication_new,
      review_scores_location = review_scores_location_new,
      review_scores_value = review_scores_value_new
    ) %>%
    # Switch every logical column on
    mutate(across(where(is.logical), ~ TRUE))

  final_df
}
#' Generate one standardized sample listing per neighborhood
#'
#' Uses a single row of the global `airbnb_clean` as the template for every
#' neighborhood in the global `avg_prices`, so the samples differ only in the
#' columns that `create_sample()` overwrites.
#'
#' @param origin_sample Optional row index of `airbnb_clean` to use as the
#'   template. When `NULL` (the default) a row is drawn at random, which makes
#'   the downstream figures change from run to run unless `set.seed()` is
#'   called first.
#' @return A data frame with one row per neighborhood in `avg_prices`.
generate_neighborhood_samples <- function(origin_sample = NULL) {
  if (is.null(origin_sample)) {
    n_listings <- nrow(airbnb_clean)
    if (n_listings < 1L) {
      stop("`airbnb_clean` has no rows to use as a template.", call. = FALSE)
    }
    # sample.int() avoids the 1:0 pitfall of sample(1:nrow(...)) on empty data
    origin_sample <- sample.int(n_listings, size = 1)
  }
  neighborhood_samples <- map_dfr(avg_prices$neighbourhood, function(neighborhood) {
    create_sample(
      df = airbnb_clean,
      df_prices = avg_prices,
      origin_sample = origin_sample,
      # Spell out the full argument name instead of relying on partial matching
      neighbourhood_new = neighborhood,
      cleaning_fee_new = 20
    )
  })
  neighborhood_samples
}
neighborhood_samples <- generate_neighborhood_samples()
## Cargando paquete requerido: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xgboost::slice() masks dplyr::slice()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Predict the nightly price of each sample listing and convert it back from
# the log scale, keeping only the neighbourhood and the predicted price
predicted_prices <- neighborhood_samples %>%
  select(neighbourhood = neighbourhood_group_cleansed) %>%
  bind_cols(predict(airbnb_predictor, new_data = neighborhood_samples)) %>%
  transmute(
    neighbourhood,
    predicted_price = exp(.pred)
  )
# Total budget available for purchases, in euros. Defined once so that the
# per-neighborhood analysis and the portfolio below always agree on it.
investment_budget <- 3000000
# Investment analysis dataframe: one row per neighborhood
investment_analysis <- predicted_prices %>%
  left_join(avg_prices, by = "neighbourhood") %>%
  mutate(
    # Number of average-priced properties the budget buys in this neighborhood
    max_properties = floor(investment_budget / avg_price),
    # The predicted price is treated as the revenue of one occupied night
    daily_revenue = predicted_price,
    # Annual revenue, scaled by the share of the year the property is occupied
    annual_revenue = daily_revenue * 365 * pct_year_occupation,
    # Years needed for the revenue to pay back the purchase price
    roi_years = avg_price / annual_revenue,
    # Total investment required for max properties
    total_investment = max_properties * avg_price,
    # Total annual return for max properties
    total_annual_return = max_properties * annual_revenue,
    # Return rate (annual return as percentage of investment)
    return_rate = (total_annual_return / total_investment) * 100
  )
# Investment portfolio: walk through the neighborhoods from fastest to slowest
# ROI and buy ONE property in each for as long as the cumulative cost stays
# within the budget.
# NOTE(review): this is a greedy one-property-per-neighborhood rule, not an
# optimisation over how many properties to buy where.
investment_portfolio <- investment_analysis %>%
  arrange(roi_years) %>%
  mutate(
    cumulative_investment = cumsum(avg_price),
    can_afford = cumulative_investment <= investment_budget
  ) %>%
  filter(can_afford) %>%
  summarise(
    total_properties = n(),
    total_investment = sum(avg_price),
    remaining_budget = investment_budget - sum(avg_price),
    # ROI time averaged over the properties, weighted by purchase price
    weighted_avg_roi = sum(roi_years * avg_price) / sum(avg_price),
    total_annual_return = sum(annual_revenue),
    portfolio_return_rate = (total_annual_return / total_investment) * 100
  )
# Display results: neighborhoods ordered from fastest to slowest ROI
print("Analysis of Investment by Neighborhood:")
## [1] "Analysis of Investment by Neighborhood:"
investment_analysis %>%
  arrange(roi_years) %>%
  select(
    neighbourhood, avg_price, max_properties, predicted_price,
    pct_year_occupation, annual_revenue, roi_years, return_rate
  ) %>%
  print(n = 10)
## # A tibble: 10 × 8
## neighbourhood avg_price max_properties predicted_price pct_year_occupation
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 sant andreu 288534 10 210. 0.6
## 2 nou barris 201074 14 174. 0.5
## 3 sants-montjuic 299140 10 178. 0.7
## 4 ciutat vella 392645 7 161. 0.8
## 5 horta-guinardo 310891 9 172. 0.55
## 6 sant marti 435215 6 198. 0.6
## 7 gracia 500411 5 160. 0.7
## 8 eixample 684012 4 152. 0.75
## 9 les corts 779088 3 171. 0.7
## 10 sarria-sant ger… 980439 3 137. 0.7
## # ℹ 3 more variables: annual_revenue <dbl>, roi_years <dbl>, return_rate <dbl>
# cat() interprets "\n" as a line break; print() would show it literally
cat("\nOptimal Investment Portfolio:\n")
##
## Optimal Investment Portfolio:
print(investment_portfolio)
## # A tibble: 1 × 6
## total_properties total_investment remaining_budget weighted_avg_roi
## <int> <dbl> <dbl> <dbl>
## 1 7 2427910 572090 8.92
## # ℹ 2 more variables: total_annual_return <dbl>, portfolio_return_rate <dbl>
# ROI time per neighborhood as horizontal bars; reordering on -roi_years puts
# the shortest payback time at the top after the flip
investment_analysis %>%
  ggplot(aes(x = reorder(neighbourhood, -roi_years), y = roi_years)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("ROI time by neighborhood") +
  xlab("Neighborhood") +
  ylab("Years to recover investment") +
  theme_minimal()
# Annual return rate per neighborhood; highest rate at the top after the flip
investment_analysis %>%
  ggplot(aes(x = reorder(neighbourhood, return_rate), y = return_rate)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  ggtitle("Annual return rate by neighborhood (%)") +
  xlab("Neighborhood") +
  ylab("Annual return rate (%)") +
  theme_minimal()
# Report to store the data
# Plain-text summary written with cat(); each "##" line below is the console
# output echoed for the statement directly above it.
# NOTE(review): the budget and property specifications are typed in by hand
# here, so they must be kept in sync with the budget used in the analysis and
# with the defaults of create_sample().
cat("\n=== Barcelona investment report ===\n")
##
## === Barcelona investment report ===
cat("\nBudget: €3,000,000\n")
##
## Budget: €3,000,000
cat("\nProperty Specifications:")
##
## Property Specifications:
cat("\n- 4 people capacity (2 bedrooms, 2 beds, 1 bathroom)")
##
## - 4 people capacity (2 bedrooms, 2 beds, 1 bathroom)
cat("\n- €20 cleaning fee")
##
## - €20 cleaning fee
cat("\n- Perfect ratings and host responsiveness")
##
## - Perfect ratings and host responsiveness
# create_sample() sets every logical column of the sample listing to TRUE
cat("\n- All services and amenities offered\n")
##
## - All services and amenities offered
cat("\nBest neighborhoods for investment (by ROI):\n")
##
## Best neighborhoods for investment (by ROI):
# Three neighborhoods with the shortest ROI time, formatted for display
investment_analysis %>%
  arrange(roi_years) %>%
  slice_head(n = 3) %>%
  transmute(
    neighbourhood = paste(neighbourhood),
    roi_years = paste(round(roi_years, 2), "years"),
    return_rate = paste(round(return_rate, 2), "%")
  ) %>%
  knitr::kable(
    col.names = c("Neighborhood", "ROI Time", "Annual Return Rate"),
    format = "pipe"
  )
| Neighborhood | ROI Time | Annual Return Rate |
|---|---|---|
| sant andreu | 6.26 years | 15.98 % |
| nou barris | 6.32 years | 15.81 % |
| sants-montjuic | 6.59 years | 15.17 % |
cat("\nRecommended investment strategy:\n")
##
## Recommended investment strategy:
# The portfolio holds the first `total_properties` neighborhoods in ROI order,
# so the same rows are recovered by taking that many from the sorted table
investment_analysis %>%
  arrange(roi_years) %>%
  slice_head(n = investment_portfolio$total_properties) %>%
  transmute(
    neighbourhood = paste(neighbourhood),
    avg_price = paste0("€", format(round(avg_price), big.mark = ",")),
    annual_revenue = paste0("€", format(round(annual_revenue), big.mark = ",")),
    roi_years = paste(round(roi_years, 2), "years")
  ) %>%
  knitr::kable(
    col.names = c("Neighborhood", "Property Cost", "Annual Revenue", "ROI Time"),
    format = "pipe"
  )
| Neighborhood | Property Cost | Annual Revenue | ROI Time |
|---|---|---|---|
| sant andreu | €288,534 | €46,094 | 6.26 years |
| nou barris | €201,074 | €31,791 | 6.32 years |
| sants-montjuic | €299,140 | €45,373 | 6.59 years |
| ciutat vella | €392,645 | €46,878 | 8.38 years |
| horta-guinardo | €310,891 | €34,445 | 9.03 years |
| sant marti | €435,215 | €43,305 | 10.05 years |
| gracia | €500,411 | €40,788 | 12.27 years |
# Format an amount as whole euros with thousands separators, e.g. "€2,427,910"
format_euro <- function(amount) {
  paste0("€", format(round(amount), big.mark = ","))
}
cat("\nTotal Investment:", format_euro(investment_portfolio$total_investment))
##
## Total Investment: €2,427,910
cat("\nRemaining Budget:", format_euro(investment_portfolio$remaining_budget))
##
## Remaining Budget: €572,090
cat("\nTotal Annual Return:", format_euro(investment_portfolio$total_annual_return))
##
## Total Annual Return: €288,673
cat("\nPortfolio Return Rate:", paste0(round(investment_portfolio$portfolio_return_rate, 2), "%"))
##
## Portfolio Return Rate: 11.89%
cat("\nWeighted Average ROI Time:", paste(round(investment_portfolio$weighted_avg_roi, 2), "years"))
##
## Weighted Average ROI Time: 8.92 years