This project solves a supervised machine learning regression problem.
The goal is to predict the continuous target variable price
for Airbnb listings in New York City.
The analysis uses the New York City Airbnb Open Data dataset published on Kaggle and provided by lecturer:
https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data
The project compares three model families required in the course:
The report follows the workflow used during classes: data preparation, exploratory analysis, train-test split, cross-validation, model comparison, interpretation, and final conclusions.
We treated this dataset as a practical prediction task rather than as a purely theoretical exercise. In real Airbnb pricing, perfect predictions are unlikely: two apartments in the same area can differ because of photos, interior quality, season, host behaviour, or temporary events. For this reason, we focus not only on finding the smallest error, but also on understanding which model is stable enough to trust.
# Install only once if needed:
# install.packages(c(
# "tidyverse", "caret", "skimr", "corrplot", "glmnet", "rpart",
# "rpart.plot", "randomForest", "ranger", "gbm", "xgboost", "pdp"
# ))
#
# XGBoost note:
# Newer xgboost versions can be unstable with caret::train(method = "xgbTree").
# If caret compatibility is needed, use:
# remotes::install_version("xgboost", version = "1.7.6.1")
#
# Keras note:
# Keras models were trained separately and saved to disk. The final report reads
# their validation and test metrics, so the report does not need to retrain them.
library(tidyverse)
library(caret)
library(skimr)
library(corrplot)
library(glmnet)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ranger)
library(gbm)
library(xgboost)
options(contrasts = c("contr.treatment", "contr.treatment"))
DATA_FILE <- params$data_file
OUTPUT_DIR <- params$output_dir
MODELS_DIR <- file.path(OUTPUT_DIR, "saved_models")
PLOTS_DIR <- file.path(OUTPUT_DIR, "plots")
EXTENDED_DIR <- file.path(OUTPUT_DIR, "extended_tuning")
EXTENDED_MODELS_DIR <- file.path(EXTENDED_DIR, "saved_models")
dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)
dir.create(PLOTS_DIR, recursive = TRUE, showWarnings = FALSE)
save_plot <- function(plot_object, filename, width = 8, height = 5) {
ggplot2::ggsave(
filename = file.path(PLOTS_DIR, filename),
plot = plot_object,
width = width,
height = height,
dpi = 150
)
}
regression_metrics <- function(real, predicted) {
predicted.msle <- pmax(predicted, 0)
mse <- mean((real - predicted)^2)
data.frame(
MSE = mse,
RMSE = sqrt(mse),
MAE = mean(abs(real - predicted)),
MedAE = median(abs(real - predicted)),
MAPE = mean(abs(real - predicted) / real),
MSLE = mean((log1p(real) - log1p(predicted.msle))^2),
R2 = 1 - sum((real - predicted)^2) / sum((real - mean(real))^2)
)
}
airbnb.raw <- read.csv(DATA_FILE)
glimpse(airbnb.raw)
## Rows: 31,753
## Columns: 14
## $ price <int> 125, 40, 300, 129, 40, 100, 60, 90, 70,…
## $ neighbourhood_group <chr> "Brooklyn", "Brooklyn", "Manhattan", "M…
## $ neighbourhood <chr> "Prospect Heights", "Bedford-Stuyvesant…
## $ room_type <chr> "Entire home/apt", "Private room", "Ent…
## $ latitude <dbl> 40.67717, 40.68477, 40.71960, 40.71489,…
## $ longitude <dbl> -73.96915, -73.95090, -73.98133, -73.99…
## $ minimum_nights <int> 30, 2, 3, 7, 7, 3, 1, 26, 1, 3, 1, 30, …
## $ number_of_reviews <int> 1, 1, 30, 11, 0, 0, 54, 0, 7, 1, 6, 57,…
## $ calculated_host_listings_count <int> 1, 1, 2, 1, 1, 1, 1, 2, 4, 8, 1, 6, 2, …
## $ availability_365 <int> 341, 1, 182, 188, 0, 352, 0, 90, 347, 3…
## $ is_central <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ reviews_density <dbl> 0.002923977, 0.500000000, 0.163934426, …
## $ high_availability <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ host_multi_listing <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
dim(airbnb.raw)
## [1] 31753 14
head(airbnb.raw)
summary(airbnb.raw)
## price neighbourhood_group neighbourhood room_type
## Min. : 10.0 Length:31753 Length:31753 Length:31753
## 1st Qu.: 69.0 Class :character Class :character Class :character
## Median :105.0 Mode :character Mode :character Mode :character
## Mean :140.6
## 3rd Qu.:175.0
## Max. :999.0
## latitude longitude minimum_nights number_of_reviews
## Min. :40.50 Min. :-74.24 Min. : 1.000 Min. : 0.00
## 1st Qu.:40.69 1st Qu.:-73.98 1st Qu.: 1.000 1st Qu.: 1.00
## Median :40.72 Median :-73.96 Median : 2.000 Median : 5.00
## Mean :40.73 Mean :-73.95 Mean : 6.977 Mean : 23.35
## 3rd Qu.:40.76 3rd Qu.:-73.94 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :40.91 Max. :-73.72 Max. :1250.000 Max. :607.00
## calculated_host_listings_count availability_365 is_central
## Min. : 1.000 Min. : 0.0 Min. :0.000
## 1st Qu.: 1.000 1st Qu.: 0.0 1st Qu.:1.000
## Median : 1.000 Median : 45.0 Median :1.000
## Mean : 7.308 Mean :112.8 Mean :0.853
## 3rd Qu.: 2.000 3rd Qu.:228.0 3rd Qu.:1.000
## Max. :327.000 Max. :365.0 Max. :1.000
## reviews_density high_availability host_multi_listing
## Min. : 0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.00836 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.20495 Median :0.0000 Median :0.0000
## Mean : 3.33422 Mean :0.2735 Mean :0.3409
## 3rd Qu.: 1.91667 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :480.00000 Max. :1.0000 Max. :1.0000
colSums(is.na(airbnb.raw)) %>% sort(decreasing = TRUE)
## price neighbourhood_group
## 0 0
## neighbourhood room_type
## 0 0
## latitude longitude
## 0 0
## minimum_nights number_of_reviews
## 0 0
## calculated_host_listings_count availability_365
## 0 0
## is_central reviews_density
## 0 0
## high_availability host_multi_listing
## 0 0
skim(airbnb.raw)
| Name | airbnb.raw |
| Number of rows | 31753 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| neighbourhood_group | 0 | 1 | 5 | 13 | 0 | 5 | 0 |
| neighbourhood | 0 | 1 | 4 | 26 | 0 | 217 | 0 |
| room_type | 0 | 1 | 11 | 15 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| price | 0 | 1 | 140.56 | 112.91 | 10.00 | 69.00 | 105.00 | 175.00 | 999.00 | ▇▁▁▁▁ |
| latitude | 0 | 1 | 40.73 | 0.05 | 40.50 | 40.69 | 40.72 | 40.76 | 40.91 | ▁▁▇▅▁ |
| longitude | 0 | 1 | -73.95 | 0.05 | -74.24 | -73.98 | -73.96 | -73.94 | -73.72 | ▁▁▇▂▁ |
| minimum_nights | 0 | 1 | 6.98 | 21.03 | 1.00 | 1.00 | 2.00 | 5.00 | 1250.00 | ▇▁▁▁▁ |
| number_of_reviews | 0 | 1 | 23.35 | 44.33 | 0.00 | 1.00 | 5.00 | 24.00 | 607.00 | ▇▁▁▁▁ |
| calculated_host_listings_count | 0 | 1 | 7.31 | 33.48 | 1.00 | 1.00 | 1.00 | 2.00 | 327.00 | ▇▁▁▁▁ |
| availability_365 | 0 | 1 | 112.79 | 131.72 | 0.00 | 0.00 | 45.00 | 228.00 | 365.00 | ▇▂▁▁▂ |
| is_central | 0 | 1 | 0.85 | 0.35 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| reviews_density | 0 | 1 | 3.33 | 12.94 | 0.00 | 0.01 | 0.20 | 1.92 | 480.00 | ▇▁▁▁▁ |
| high_availability | 0 | 1 | 0.27 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| host_multi_listing | 0 | 1 | 0.34 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
The dataset contains listing characteristics such as location,
neighbourhood, room type, review activity, minimum nights, availability
and host activity. The target variable is price.
Our first impression from the data was that this is not a clean textbook regression problem. Prices are very uneven, and neighbourhood information is important but also quite granular. That made us expect tree-based models to do well, because they can capture non-linear differences between locations and room types without forcing one global linear relationship.
The target must be positive. Categorical predictors are converted into factors. Binary variables are recoded into readable labels. Rare neighbourhood levels are grouped using only the training set in order to avoid data leakage.
airbnb.clean <- airbnb.raw %>%
filter(!is.na(price), price > 0) %>%
mutate(
neighbourhood_group = factor(neighbourhood_group),
neighbourhood = factor(neighbourhood),
room_type = factor(room_type),
is_central = factor(is_central, levels = c(0, 1), labels = c("no", "yes")),
high_availability = factor(high_availability, levels = c(0, 1), labels = c("no", "yes")),
host_multi_listing = factor(host_multi_listing, levels = c(0, 1), labels = c("no", "yes"))
)
summary(airbnb.clean$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 69.0 105.0 140.6 175.0 999.0
colSums(is.na(airbnb.clean)) %>% sort(decreasing = TRUE)
## price neighbourhood_group
## 0 0
## neighbourhood room_type
## 0 0
## latitude longitude
## 0 0
## minimum_nights number_of_reviews
## 0 0
## calculated_host_listings_count availability_365
## 0 0
## is_central reviews_density
## 0 0
## high_availability host_multi_listing
## 0 0
combine_rare_levels <- function(x, retained_levels) {
x <- as.character(x)
factor(ifelse(x %in% retained_levels, x, "other"))
}
add_engineered_features <- function(data) {
data %>%
mutate(
log_minimum_nights = log1p(minimum_nights),
log_number_of_reviews = log1p(number_of_reviews),
log_host_listings = log1p(calculated_host_listings_count),
log_reviews_density = log1p(reviews_density),
distance_to_midtown_km = sqrt(
((latitude - 40.7580) * 111)^2 +
((longitude - (-73.9855)) * 85)^2
),
location_grid = factor(
paste(round(latitude, 2), round(longitude, 2), sep = "_")
)
)
}
The additional variables are used in the extended tuning stage. They include logarithms of skewed predictors, distance from Midtown Manhattan, and a local coordinate grid.
We decided not to remove expensive listings automatically. Some of them are probably unusual, but they still represent real offers that a pricing model could encounter. Instead of deleting them, we kept them in the main modelling task and used robust interpretation through MAE together with RMSE.
plot.price <- ggplot(airbnb.clean, aes(x = price)) +
geom_histogram(bins = 80, fill = "#2c7fb8", color = "white") +
coord_cartesian(xlim = c(0, quantile(airbnb.clean$price, 0.99))) +
theme_bw() +
labs(
title = "Distribution of Airbnb prices",
x = "Price",
y = "Number of listings"
)
plot.price
save_plot(plot.price, "final_eda_price_distribution.png")
plot.log.price <- ggplot(airbnb.clean, aes(x = log1p(price))) +
geom_histogram(bins = 60, fill = "#41ab5d", color = "white") +
theme_bw() +
labs(
title = "Distribution of log1p(price)",
x = "log1p(price)",
y = "Number of listings"
)
plot.log.price
save_plot(plot.log.price, "final_eda_log_price_distribution.png")
The raw target is right-skewed. This justifies comparing models
trained on price and on log1p(price).
This skewness is one of the reasons why we did not want to rely on a single metric. RMSE reacts strongly to very expensive listings, while MAE gives a more intuitive answer to the question: “how many dollars off is the model on average?”
plot.room <- ggplot(airbnb.clean, aes(x = room_type, y = price, fill = room_type)) +
geom_boxplot(outlier.alpha = 0.15) +
coord_cartesian(ylim = c(0, quantile(airbnb.clean$price, 0.99))) +
theme_bw() +
theme(legend.position = "none") +
labs(
title = "Price by room type",
x = "Room type",
y = "Price"
)
plot.room
save_plot(plot.room, "final_eda_price_by_room_type.png")
plot.borough <- ggplot(
airbnb.clean,
aes(x = reorder(neighbourhood_group, price, median), y = price, fill = neighbourhood_group)
) +
geom_boxplot(outlier.alpha = 0.10) +
coord_cartesian(ylim = c(0, quantile(airbnb.clean$price, 0.99))) +
theme_bw() +
theme(legend.position = "none") +
labs(
title = "Price by borough",
x = "Borough",
y = "Price"
)
plot.borough
save_plot(plot.borough, "final_eda_price_by_borough.png")
plot.spatial <- ggplot(
airbnb.clean,
aes(x = longitude, y = latitude, color = price)
) +
geom_point(alpha = 0.45, size = 0.6) +
scale_color_viridis_c(option = "C", trans = "log10") +
theme_bw() +
labs(
title = "Spatial distribution of prices",
x = "Longitude",
y = "Latitude",
color = "Price"
)
plot.spatial
save_plot(plot.spatial, "final_eda_spatial_prices.png")
numeric.data <- airbnb.clean %>%
select(where(is.numeric))
cor.matrix <- cor(numeric.data, use = "pairwise.complete.obs")
corrplot(
cor.matrix,
method = "color",
type = "upper",
tl.cex = 0.7,
tl.col = "black"
)
set.seed(123)
train.ids <- createDataPartition(
airbnb.clean$price,
p = 0.70,
list = FALSE
)
airbnb.train <- airbnb.clean[train.ids, ]
airbnb.test <- airbnb.clean[-train.ids, ]
real.train <- airbnb.train$price
real.test <- airbnb.test$price
popular.neighbourhoods <- airbnb.train %>%
count(neighbourhood, sort = TRUE) %>%
filter(n >= 100) %>%
pull(neighbourhood) %>%
as.character()
airbnb.train <- airbnb.train %>%
mutate(neighbourhood = combine_rare_levels(neighbourhood, popular.neighbourhoods))
airbnb.test <- airbnb.test %>%
mutate(neighbourhood = combine_rare_levels(neighbourhood, popular.neighbourhoods))
factor.variables <- names(airbnb.train)[sapply(airbnb.train, is.factor)]
for (variable in factor.variables) {
airbnb.test[[variable]] <- factor(
airbnb.test[[variable]],
levels = levels(airbnb.train[[variable]])
)
}
# The saved OLS and GLMNET models were trained after removing near-zero
# variance predictors. The same step must be reproduced before prediction.
nzv.index <- nearZeroVar(airbnb.train %>% select(-price))
nzv.names <- names(airbnb.train %>% select(-price))[nzv.index]
nzv.names
## character(0)
airbnb.train.ols <- airbnb.train %>%
select(-any_of(nzv.names))
airbnb.test.ols <- airbnb.test %>%
select(-any_of(nzv.names))
airbnb.train.tree <- airbnb.train
airbnb.test.tree <- airbnb.test
The split is done before preprocessing decisions that depend on the data. This prevents test-set information from entering the training process.
We kept the test set separate throughout the analysis, even when it was tempting to use it for quick decisions. This is important because a model that looks good after many informal checks on the test set may be partly tuned to that specific test sample.
pred.benchmark.train <- rep(mean(real.train), length(real.train))
pred.benchmark.test <- rep(mean(real.train), length(real.test))
metrics.benchmark.train <- regression_metrics(real.train, pred.benchmark.train)
metrics.benchmark.test <- regression_metrics(real.test, pred.benchmark.test)
rbind(
train = metrics.benchmark.train,
test = metrics.benchmark.test
)
The main metrics are:
For final interpretation, MAE is the easiest metric to explain in business terms. RMSE is useful for model selection because large mistakes matter, but MAE is closer to how a host or platform user would feel the model error.
# Original training command:
# model.ols <- lm(price ~ ., data = airbnb.train.ols)
# saveRDS(model.ols, file.path(MODELS_DIR, "airbnb_ols.rds"))
model.ols <- readRDS(file.path(MODELS_DIR, "airbnb_ols.rds"))
summary(model.ols)
##
## Call:
## lm(formula = price ~ ., data = airbnb.train.ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -222.35 -45.03 -13.81 17.52 846.15
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -15417.950609 3006.165589 -5.129
## neighbourhood_groupBrooklyn 16.104732 10.190459 1.580
## neighbourhood_groupManhattan 100.020457 9.060477 11.039
## neighbourhood_groupQueens 20.312757 7.503940 2.707
## neighbourhood_groupStaten Island -37.084814 15.489284 -2.394
## neighbourhoodBedford-Stuyvesant 2.623407 7.518619 0.349
## neighbourhoodBushwick 5.112506 8.084026 0.632
## neighbourhoodCarroll Gardens 37.680524 10.645304 3.540
## neighbourhoodChelsea -3.331401 8.734448 -0.381
## neighbourhoodChinatown -45.003830 10.625785 -4.235
## neighbourhoodClinton Hill 29.431944 8.771151 3.356
## neighbourhoodCrown Heights 5.846861 7.837011 0.746
## neighbourhoodDitmars Steinway -3.815037 8.643316 -0.441
## neighbourhoodEast Flatbush -9.419398 9.614256 -0.980
## neighbourhoodEast Harlem -63.875012 9.548876 -6.689
## neighbourhoodEast New York -7.494405 12.241237 -0.612
## neighbourhoodEast Village -29.571197 8.676843 -3.408
## neighbourhoodFinancial District -25.978687 9.720309 -2.673
## neighbourhoodFlatbush -6.828764 8.896873 -0.768
## neighbourhoodFlushing 18.712239 8.583837 2.180
## neighbourhoodFort Greene 33.230401 8.994234 3.695
## neighbourhoodGowanus 32.940883 10.619896 3.102
## neighbourhoodGramercy -24.920101 10.792574 -2.309
## neighbourhoodGreenpoint 26.611680 8.163858 3.260
## neighbourhoodGreenwich Village 2.091020 10.155425 0.206
## neighbourhoodHarlem -72.407619 9.080123 -7.974
## neighbourhoodHell's Kitchen -13.064670 8.338871 -1.567
## neighbourhoodInwood -97.716303 12.697302 -7.696
## neighbourhoodJamaica 21.835299 11.410874 1.914
## neighbourhoodKips Bay -29.867372 10.234283 -2.918
## neighbourhoodLong Island City 19.483584 7.319349 2.662
## neighbourhoodLower East Side -42.710321 9.232878 -4.626
## neighbourhoodMidtown 32.819729 8.749832 3.751
## neighbourhoodMorningside Heights -65.764210 11.186332 -5.879
## neighbourhoodMurray Hill -15.491447 10.155271 -1.525
## neighbourhoodNolita 0.668912 11.664186 0.057
## neighbourhoodother 5.478261 6.325343 0.866
## neighbourhoodPark Slope 43.212795 8.732484 4.949
## neighbourhoodProspect-Lefferts Gardens -5.898835 9.135760 -0.646
## neighbourhoodProspect Heights 26.030411 9.634914 2.702
## neighbourhoodRidgewood -10.889638 8.101254 -1.344
## neighbourhoodSoHo 39.593799 10.799113 3.666
## neighbourhoodSouth Slope 35.024787 10.730664 3.264
## neighbourhoodSunnyside -9.386204 8.280638 -1.134
## neighbourhoodSunset Park -2.671060 9.055908 -0.295
## neighbourhoodTheater District 18.806875 11.368187 1.654
## neighbourhoodUpper East Side -39.304005 8.938378 -4.397
## neighbourhoodUpper West Side -31.650224 8.635101 -3.665
## neighbourhoodWashington Heights -91.779771 10.311394 -8.901
## neighbourhoodWest Village 10.366779 9.189347 1.128
## neighbourhoodWilliamsburg 36.570388 7.320531 4.996
## neighbourhoodWoodside -5.889698 9.876853 -0.596
## room_typePrivate room -91.472272 1.317084 -69.451
## room_typeShared room -113.444715 4.046759 -28.033
## latitude 51.208969 40.629400 1.260
## longitude -182.015142 36.219518 -5.025
## minimum_nights -0.313990 0.028246 -11.116
## number_of_reviews -0.156840 0.014754 -10.630
## calculated_host_listings_count -0.008554 0.021757 -0.393
## availability_365 0.212647 0.011573 18.374
## is_centralyes NA NA NA
## reviews_density 0.058165 0.051135 1.137
## high_availabilityyes -24.606196 3.250890 -7.569
## host_multi_listingyes -2.967514 1.452801 -2.043
## Pr(>|t|)
## (Intercept) 0.00000029408648903 ***
## neighbourhood_groupBrooklyn 0.114036
## neighbourhood_groupManhattan < 0.0000000000000002 ***
## neighbourhood_groupQueens 0.006796 **
## neighbourhood_groupStaten Island 0.016664 *
## neighbourhoodBedford-Stuyvesant 0.727152
## neighbourhoodBushwick 0.527118
## neighbourhoodCarroll Gardens 0.000401 ***
## neighbourhoodChelsea 0.702903
## neighbourhoodChinatown 0.00002291245001899 ***
## neighbourhoodClinton Hill 0.000793 ***
## neighbourhoodCrown Heights 0.455641
## neighbourhoodDitmars Steinway 0.658938
## neighbourhoodEast Flatbush 0.327229
## neighbourhoodEast Harlem 0.00000000002296315 ***
## neighbourhoodEast New York 0.540394
## neighbourhoodEast Village 0.000655 ***
## neighbourhoodFinancial District 0.007532 **
## neighbourhoodFlatbush 0.442765
## neighbourhoodFlushing 0.029272 *
## neighbourhoodFort Greene 0.000221 ***
## neighbourhoodGowanus 0.001926 **
## neighbourhoodGramercy 0.020952 *
## neighbourhoodGreenpoint 0.001117 **
## neighbourhoodGreenwich Village 0.836870
## neighbourhoodHarlem 0.00000000000000161 ***
## neighbourhoodHell's Kitchen 0.117195
## neighbourhoodInwood 0.00000000000001464 ***
## neighbourhoodJamaica 0.055690 .
## neighbourhoodKips Bay 0.003522 **
## neighbourhoodLong Island City 0.007775 **
## neighbourhoodLower East Side 0.00000375092517111 ***
## neighbourhoodMidtown 0.000177 ***
## neighbourhoodMorningside Heights 0.00000000418720647 ***
## neighbourhoodMurray Hill 0.127159
## neighbourhoodNolita 0.954269
## neighbourhoodother 0.386455
## neighbourhoodPark Slope 0.00000075331432597 ***
## neighbourhoodProspect-Lefferts Gardens 0.518489
## neighbourhoodProspect Heights 0.006904 **
## neighbourhoodRidgewood 0.178900
## neighbourhoodSoHo 0.000247 ***
## neighbourhoodSouth Slope 0.001100 **
## neighbourhoodSunnyside 0.257011
## neighbourhoodSunset Park 0.768033
## neighbourhoodTheater District 0.098072 .
## neighbourhoodUpper East Side 0.00001101570229775 ***
## neighbourhoodUpper West Side 0.000248 ***
## neighbourhoodWashington Heights < 0.0000000000000002 ***
## neighbourhoodWest Village 0.259277
## neighbourhoodWilliamsburg 0.00000059101325900 ***
## neighbourhoodWoodside 0.550972
## room_typePrivate room < 0.0000000000000002 ***
## room_typeShared room < 0.0000000000000002 ***
## latitude 0.207541
## longitude 0.00000050646876510 ***
## minimum_nights < 0.0000000000000002 ***
## number_of_reviews < 0.0000000000000002 ***
## calculated_host_listings_count 0.694215
## availability_365 < 0.0000000000000002 ***
## is_centralyes NA
## reviews_density 0.255345
## high_availabilityyes 0.00000000000003906 ***
## host_multi_listingyes 0.041102 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.09 on 22166 degrees of freedom
## Multiple R-squared: 0.3668, Adjusted R-squared: 0.365
## F-statistic: 207.1 on 62 and 22166 DF, p-value: < 0.00000000000000022
pred.ols.train <- predict(model.ols, newdata = airbnb.train.ols)
pred.ols.test <- predict(model.ols, newdata = airbnb.test.ols)
metrics.ols.train <- regression_metrics(real.train, pred.ols.train)
metrics.ols.test <- regression_metrics(real.test, pred.ols.test)
rbind(train = metrics.ols.train, test = metrics.ols.test)
ols.coefficients <- data.frame(
variable = names(coef(model.ols)),
coefficient = as.numeric(coef(model.ols))
) %>%
arrange(desc(abs(coefficient)))
head(ols.coefficients, 15)
control.cv <- trainControl(
method = "cv",
number = 5
)
grid.glmnet <- expand.grid(
alpha = c(0, 0.5, 1),
lambda = 10^seq(-3, 1, length = 20)
)
# Original tuning command:
# set.seed(123)
# model.glmnet <- train(
# price ~ .,
# data = airbnb.train.ols,
# method = "glmnet",
# trControl = control.cv,
# tuneGrid = grid.glmnet,
# metric = "RMSE"
# )
# saveRDS(model.glmnet, file.path(MODELS_DIR, "airbnb_glmnet.rds"))
model.glmnet <- readRDS(file.path(MODELS_DIR, "airbnb_glmnet.rds"))
model.glmnet
## glmnet
##
## 22229 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.0 0.001000000 90.39598 0.3611964 53.65085
## 0.0 0.001832981 90.39598 0.3611964 53.65085
## 0.0 0.003359818 90.39598 0.3611964 53.65085
## 0.0 0.006158482 90.39598 0.3611964 53.65085
## 0.0 0.011288379 90.39598 0.3611964 53.65085
## 0.0 0.020691381 90.39598 0.3611964 53.65085
## 0.0 0.037926902 90.39598 0.3611964 53.65085
## 0.0 0.069519280 90.39598 0.3611964 53.65085
## 0.0 0.127427499 90.39598 0.3611964 53.65085
## 0.0 0.233572147 90.39598 0.3611964 53.65085
## 0.0 0.428133240 90.39598 0.3611964 53.65085
## 0.0 0.784759970 90.39598 0.3611964 53.65085
## 0.0 1.438449888 90.39598 0.3611964 53.65085
## 0.0 2.636650899 90.39598 0.3611964 53.65085
## 0.0 4.832930239 90.39598 0.3611964 53.65085
## 0.0 8.858667904 90.53259 0.3599106 53.66280
## 0.0 16.237767392 90.81296 0.3578368 53.79253
## 0.0 29.763514416 91.37112 0.3546925 54.23014
## 0.0 54.555947812 92.47295 0.3494640 55.31123
## 0.0 100.000000000 94.41679 0.3407376 57.37683
## 0.5 0.001000000 90.24221 0.3630061 53.79135
## 0.5 0.001832981 90.24221 0.3630061 53.79135
## 0.5 0.003359818 90.24221 0.3630061 53.79135
## 0.5 0.006158482 90.24221 0.3630061 53.79135
## 0.5 0.011288379 90.24221 0.3630061 53.79135
## 0.5 0.020691381 90.24221 0.3630061 53.79135
## 0.5 0.037926902 90.24227 0.3630059 53.79126
## 0.5 0.069519280 90.24440 0.3629720 53.78433
## 0.5 0.127427499 90.25182 0.3628644 53.77034
## 0.5 0.233572147 90.26118 0.3627368 53.74542
## 0.5 0.428133240 90.29946 0.3622251 53.71824
## 0.5 0.784759970 90.37382 0.3612806 53.69331
## 0.5 1.438449888 90.53433 0.3593374 53.71599
## 0.5 2.636650899 90.87134 0.3554670 53.89250
## 0.5 4.832930239 91.52329 0.3485840 54.34087
## 0.5 8.858667904 92.98090 0.3327464 55.46401
## 0.5 16.237767392 95.21284 0.3120691 57.40628
## 0.5 29.763514416 99.34455 0.2716039 61.30409
## 0.5 54.555947812 104.91913 0.2325039 67.76424
## 0.5 100.000000000 112.52486 0.2085917 76.38939
## 1.0 0.001000000 90.24062 0.3630260 53.79134
## 1.0 0.001832981 90.24062 0.3630260 53.79134
## 1.0 0.003359818 90.24062 0.3630260 53.79134
## 1.0 0.006158482 90.24062 0.3630260 53.79134
## 1.0 0.011288379 90.24062 0.3630260 53.79134
## 1.0 0.020691381 90.24069 0.3630249 53.79123
## 1.0 0.037926902 90.24447 0.3629715 53.78501
## 1.0 0.069519280 90.25170 0.3628664 53.77193
## 1.0 0.127427499 90.25996 0.3627543 53.74740
## 1.0 0.233572147 90.29933 0.3622236 53.72270
## 1.0 0.428133240 90.37507 0.3612454 53.70110
## 1.0 0.784759970 90.54418 0.3591360 53.73218
## 1.0 1.438449888 90.90319 0.3548002 53.93387
## 1.0 2.636650899 91.57131 0.3472117 54.38527
## 1.0 4.832930239 93.05908 0.3295869 55.49647
## 1.0 8.858667904 95.09350 0.3093418 57.16471
## 1.0 16.237767392 99.18747 0.2632792 60.91656
## 1.0 29.763514416 104.76296 0.2131308 67.48301
## 1.0 54.555947812 113.03935 NaN 76.96256
## 1.0 100.000000000 113.03935 NaN 76.96256
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.01128838.
model.glmnet$bestTune
pred.glmnet.train <- predict(model.glmnet, newdata = airbnb.train.ols)
pred.glmnet.test <- predict(model.glmnet, newdata = airbnb.test.ols)
metrics.glmnet.train <- regression_metrics(real.train, pred.glmnet.train)
metrics.glmnet.test <- regression_metrics(real.test, pred.glmnet.test)
rbind(train = metrics.glmnet.train, test = metrics.glmnet.test)
# Original training command:
# set.seed(123)
# model.tree.default <- rpart(
# price ~ .,
# data = airbnb.train.tree,
# method = "anova"
# )
# saveRDS(model.tree.default, file.path(MODELS_DIR, "airbnb_cart_default.rds"))
model.tree.default <- readRDS(file.path(MODELS_DIR, "airbnb_cart_default.rds"))
pred.tree.default.train <- predict(model.tree.default, newdata = airbnb.train.tree)
pred.tree.default.test <- predict(model.tree.default, newdata = airbnb.test.tree)
metrics.tree.default.train <- regression_metrics(real.train, pred.tree.default.train)
metrics.tree.default.test <- regression_metrics(real.test, pred.tree.default.test)
rbind(train = metrics.tree.default.train, test = metrics.tree.default.test)
# Original pruning commands:
# set.seed(123)
# model.tree.controlled <- rpart(
# price ~ .,
# data = airbnb.train.tree,
# method = "anova",
# control = rpart.control(cp = 0.001, xval = 10)
# )
# best.cp <- model.tree.controlled$cptable[
# which.min(model.tree.controlled$cptable[, "xerror"]),
# "CP"
# ]
# model.tree.pruned <- prune(model.tree.controlled, cp = best.cp)
# saveRDS(model.tree.pruned, file.path(MODELS_DIR, "airbnb_cart_pruned.rds"))
model.tree.pruned <- readRDS(file.path(MODELS_DIR, "airbnb_cart_pruned.rds"))
pred.tree.pruned.train <- predict(model.tree.pruned, newdata = airbnb.train.tree)
pred.tree.pruned.test <- predict(model.tree.pruned, newdata = airbnb.test.tree)
metrics.tree.pruned.train <- regression_metrics(real.train, pred.tree.pruned.train)
metrics.tree.pruned.test <- regression_metrics(real.test, pred.tree.pruned.test)
rbind(train = metrics.tree.pruned.train, test = metrics.tree.pruned.test)
rpart.plot(
model.tree.pruned,
main = "Pruned regression tree"
)
p.airbnb <- ncol(airbnb.train.tree) - 1
# Original bagging command:
# set.seed(123)
# model.bagging <- randomForest(
# price ~ .,
# data = airbnb.train.tree,
# mtry = p.airbnb,
# importance = TRUE
# )
# saveRDS(model.bagging, file.path(MODELS_DIR, "airbnb_bagging.rds"))
# Original Random Forest command:
# set.seed(123)
# model.rf <- randomForest(
# price ~ .,
# data = airbnb.train.tree,
# importance = TRUE
# )
# saveRDS(model.rf, file.path(MODELS_DIR, "airbnb_random_forest.rds"))
model.bagging <- readRDS(file.path(MODELS_DIR, "airbnb_bagging.rds"))
model.rf <- readRDS(file.path(MODELS_DIR, "airbnb_random_forest.rds"))
pred.bag.train <- predict(model.bagging, newdata = airbnb.train.tree)
pred.bag.test <- predict(model.bagging, newdata = airbnb.test.tree)
pred.rf.train <- predict(model.rf, newdata = airbnb.train.tree)
pred.rf.test <- predict(model.rf, newdata = airbnb.test.tree)
metrics.bag.train <- regression_metrics(real.train, pred.bag.train)
metrics.bag.test <- regression_metrics(real.test, pred.bag.test)
metrics.rf.train <- regression_metrics(real.train, pred.rf.train)
metrics.rf.test <- regression_metrics(real.test, pred.rf.test)
rbind(
Bagging_train = metrics.bag.train,
Bagging_test = metrics.bag.test,
RF_train = metrics.rf.train,
RF_test = metrics.rf.test
)
grid.rf <- expand.grid(
mtry = c(3, 5, 7, 9),
splitrule = "variance",
min.node.size = c(5, 10, 20)
)
# Original tuning command:
# set.seed(123)
# model.rf.tuned <- train(
# price ~ .,
# data = airbnb.train.tree,
# method = "ranger",
# trControl = control.cv,
# tuneGrid = grid.rf,
# metric = "RMSE",
# importance = "permutation"
# )
# saveRDS(model.rf.tuned, file.path(MODELS_DIR, "airbnb_random_forest_tuned.rds"))
model.rf.tuned <- readRDS(file.path(MODELS_DIR, "airbnb_random_forest_tuned.rds"))
model.rf.tuned
## Random Forest
##
## 22229 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783
## Resampling results across tuning parameters:
##
## mtry min.node.size RMSE Rsquared MAE
## 2 5 97.29030 0.3260055 61.07318
## 2 10 97.88314 0.3193010 61.73504
## 2 20 97.73341 0.3200835 61.56104
## 4 5 90.85075 0.3865473 54.99254
## 4 10 90.87544 0.3869226 55.06398
## 4 20 91.17436 0.3823284 55.30367
## 6 5 86.81101 0.4272049 51.41793
## 6 10 87.06469 0.4242464 51.63108
## 6 20 87.13046 0.4232197 51.62803
## 8 5 84.83022 0.4450666 49.79568
## 8 10 84.94662 0.4437740 49.85793
## 8 20 85.18869 0.4407271 49.94693
## 10 5 83.98413 0.4514135 49.09499
## 10 10 84.08005 0.4506435 49.14023
## 10 20 84.26683 0.4488577 49.23690
## 12 5 83.61572 0.4542409 48.77377
## 12 10 83.73997 0.4529458 48.89917
## 12 20 83.85036 0.4521699 48.94918
##
## Tuning parameter 'splitrule' was held constant at a value of variance
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 12, splitrule = variance
## and min.node.size = 5.
model.rf.tuned$bestTune
pred.rf.tuned.train <- predict(model.rf.tuned, newdata = airbnb.train.tree)
pred.rf.tuned.test <- predict(model.rf.tuned, newdata = airbnb.test.tree)
metrics.rf.tuned.train <- regression_metrics(real.train, pred.rf.tuned.train)
metrics.rf.tuned.test <- regression_metrics(real.test, pred.rf.tuned.test)
rbind(train = metrics.rf.tuned.train, test = metrics.rf.tuned.test)
importance.rf <- randomForest::importance(model.rf)
importance.rf.df <- data.frame(
variable = rownames(importance.rf),
IncNodePurity = importance.rf[, "IncNodePurity"]
) %>%
arrange(desc(IncNodePurity)) %>%
slice(1:20)
plot.rf.importance <- ggplot(
importance.rf.df,
aes(x = reorder(variable, IncNodePurity), y = IncNodePurity)
) +
geom_col(fill = "#2c7fb8") +
coord_flip() +
theme_bw() +
labs(
title = "Random Forest variable importance",
x = "Variable",
y = "Increase in node purity"
)
plot.rf.importance
save_plot(plot.rf.importance, "final_random_forest_importance.png")
grid.gbm <- expand.grid(
interaction.depth = c(1, 2, 3),
n.trees = c(100, 300, 500),
shrinkage = c(0.01, 0.05),
n.minobsinnode = c(10, 20)
)
# Original tuning command:
# set.seed(123)
# model.gbm <- train(
# price ~ .,
# data = airbnb.train.tree,
# method = "gbm",
# trControl = control.cv,
# tuneGrid = grid.gbm,
# metric = "RMSE",
# verbose = FALSE
# )
# saveRDS(model.gbm, file.path(MODELS_DIR, "airbnb_gbm.rds"))
model.gbm <- readRDS(file.path(MODELS_DIR, "airbnb_gbm.rds"))
model.gbm
## Stochastic Gradient Boosting
##
## 22229 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
## 0.01 1 10 100 101.37632 0.2644749
## 0.01 1 10 300 95.68549 0.3092401
## 0.01 1 10 500 93.43678 0.3328510
## 0.01 1 20 100 101.37512 0.2653646
## 0.01 1 20 300 95.67114 0.3088630
## 0.01 1 20 500 93.42151 0.3330108
## 0.01 2 10 100 98.73190 0.2981678
## 0.01 2 10 300 92.26147 0.3511508
## 0.01 2 10 500 89.93124 0.3780368
## 0.01 2 20 100 98.73527 0.2985133
## 0.01 2 20 300 92.26225 0.3510600
## 0.01 2 20 500 89.95995 0.3775847
## 0.01 3 10 100 97.20368 0.3242921
## 0.01 3 10 300 90.33850 0.3771726
## 0.01 3 10 500 88.02433 0.4018948
## 0.01 3 20 100 97.19500 0.3239763
## 0.01 3 20 300 90.37368 0.3763813
## 0.01 3 20 500 88.09297 0.4006288
## 0.05 1 10 100 93.36590 0.3334723
## 0.05 1 10 300 89.69518 0.3748576
## 0.05 1 10 500 88.79215 0.3842214
## 0.05 1 20 100 93.37143 0.3343785
## 0.05 1 20 300 89.67429 0.3752339
## 0.05 1 20 500 88.76947 0.3845758
## 0.05 2 10 100 89.89319 0.3780530
## 0.05 2 10 300 87.02157 0.4088911
## 0.05 2 10 500 86.33595 0.4175179
## 0.05 2 20 100 89.92557 0.3773864
## 0.05 2 20 300 87.07697 0.4081333
## 0.05 2 20 500 86.32500 0.4175460
## 0.05 3 10 100 88.05525 0.4009267
## 0.05 3 10 300 85.53757 0.4285283
## 0.05 3 10 500 84.83256 0.4374377
## 0.05 3 20 100 88.09349 0.4000672
## 0.05 3 20 300 85.68321 0.4264457
## 0.05 3 20 500 85.05745 0.4343600
## MAE
## 63.69982
## 57.60641
## 55.67469
## 63.70730
## 57.58794
## 55.63698
## 61.25522
## 54.84469
## 52.94598
## 61.25523
## 54.80262
## 52.91420
## 60.00274
## 53.42618
## 51.60880
## 60.00653
## 53.43218
## 51.63137
## 55.62788
## 52.89212
## 52.77673
## 55.61048
## 52.91206
## 52.79919
## 52.94567
## 51.03731
## 50.74902
## 52.90823
## 51.03901
## 50.71150
## 51.60729
## 49.97217
## 49.58412
## 51.59365
## 49.95622
## 49.65879
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 500, interaction.depth =
## 3, shrinkage = 0.05 and n.minobsinnode = 10.
model.gbm$bestTune
pred.gbm.train <- predict(model.gbm, newdata = airbnb.train.tree)
pred.gbm.test <- predict(model.gbm, newdata = airbnb.test.tree)
metrics.gbm.train <- regression_metrics(real.train, pred.gbm.train)
metrics.gbm.test <- regression_metrics(real.test, pred.gbm.test)
rbind(train = metrics.gbm.train, test = metrics.gbm.test)
plot.gbm.importance <- plot(
varImp(model.gbm),
top = 20,
main = "GBM variable importance"
)
plot.gbm.importance
XGBoost was trained with the native API instead of
caret::train(method = "xgbTree"), because recent XGBoost
versions may not cooperate reliably with caret.
dummy.xgb <- dummyVars(price ~ ., data = airbnb.train.tree, fullRank = TRUE)
xgb.train.df <- predict(dummy.xgb, newdata = airbnb.train.tree) %>%
as.data.frame()
xgb.test.df <- predict(dummy.xgb, newdata = airbnb.test.tree) %>%
as.data.frame()
names(xgb.train.df) <- make.names(names(xgb.train.df), unique = TRUE)
names(xgb.test.df) <- make.names(names(xgb.test.df), unique = TRUE)
nzv.xgb <- nearZeroVar(xgb.train.df)
if (length(nzv.xgb) > 0) {
xgb.train.df <- xgb.train.df[, -nzv.xgb, drop = FALSE]
xgb.test.df <- xgb.test.df[, -nzv.xgb, drop = FALSE]
}
dtrain <- xgb.DMatrix(
data = as.matrix(xgb.train.df),
label = real.train
)
dtest <- xgb.DMatrix(
data = as.matrix(xgb.test.df),
label = real.test
)
# Original tuning command used xgb.cv() over a grid of eta, max_depth,
# min_child_weight, subsample and colsample_bytree. The best configuration was
# then refitted with xgb.train().
#
# model.xgb <- xgb.train(
# params = params.xgb.best,
# data = dtrain,
# nrounds = best.xgb$nrounds,
# verbose = 0
# )
# saveRDS(model.xgb, file.path(MODELS_DIR, "airbnb_xgboost.rds"))
model.xgb <- readRDS(file.path(MODELS_DIR, "airbnb_xgboost.rds"))
pred.xgb.train <- predict(model.xgb, newdata = dtrain)
pred.xgb.test <- predict(model.xgb, newdata = dtest)
metrics.xgb.train <- regression_metrics(real.train, pred.xgb.train)
metrics.xgb.test <- regression_metrics(real.test, pred.xgb.test)
rbind(train = metrics.xgb.train, test = metrics.xgb.test)
importance.xgb <- xgb.importance(
feature_names = colnames(xgb.train.df),
model = model.xgb
)
head(importance.xgb, 20)
Keras was used instead of the older neuralnet package
because it gives direct control over epochs, batch size, dropout, L2
regularization, early stopping and learning-rate reduction. The models
were trained in a separate tuning script and saved to disk.
# Original simple Keras architecture pattern:
# model.keras <- keras_model_sequential(input_shape = c(ncol(x.nn.train))) %>%
# layer_dense(units = 16, activation = "relu") %>%
# layer_dense(units = 8, activation = "relu") %>%
# layer_dense(units = 1, activation = "linear")
#
# model.keras %>% compile(
# optimizer = optimizer_adam(learning_rate = 0.001),
# loss = "mse",
# metrics = c("mae")
# )
#
# history.keras <- model.keras %>% fit(
# x = x.nn.train,
# y = y.nn.train,
# epochs = 200,
# batch_size = 32,
# validation_split = 0.20,
# callbacks = list(callback_early_stopping(patience = 20)),
# verbose = 0
# )
keras.results.file <- file.path(OUTPUT_DIR, "keras_validation_results.csv")
if (file.exists(keras.results.file)) {
keras.validation.results <- read.csv(keras.results.file)
keras.validation.results
}
The second tuning stage expanded two parts of the project:
log1p(price), additional logarithmic and spatial
predictors,rf.extended.cv.file <- file.path(EXTENDED_DIR, "rf_extended_cv_comparison.csv")
rf.extended.test.file <- file.path(EXTENDED_DIR, "rf_extended_test_metrics.csv")
if (file.exists(rf.extended.cv.file)) {
rf.extended.cv <- read.csv(rf.extended.cv.file)
rf.extended.cv
}
if (file.exists(rf.extended.test.file)) {
rf.extended.test <- read.csv(rf.extended.test.file)
rf.extended.test
}
keras.extended.file <- file.path(EXTENDED_DIR, "keras_extended_validation_results.csv")
keras.extended.test.file <- file.path(EXTENDED_DIR, "keras_extended_test_metrics.csv")
if (file.exists(keras.extended.file)) {
keras.extended.results <- read.csv(keras.extended.file) %>%
arrange(RMSE)
head(keras.extended.results, 10)
}
if (file.exists(keras.extended.test.file)) {
keras.extended.test <- read.csv(keras.extended.test.file)
keras.extended.test
}
In validation, the best extended Keras configuration was a
direct-price model with two hidden layers (128-64), dropout
and L2 regularization. However, on the independent test set, the
extended models did not improve over the best baseline Random
Forest.
This was one of the more interesting parts of the project for us. The neural network tuning was more technically advanced, but it did not automatically translate into the best final result. In this dataset, the simpler Random Forest remained more competitive, which is a useful reminder that model complexity is not the same thing as model quality.
comparison.train <- rbind(
Benchmark = metrics.benchmark.train,
OLS = metrics.ols.train,
GLMNET = metrics.glmnet.train,
CART_default = metrics.tree.default.train,
CART_pruned = metrics.tree.pruned.train,
Bagging = metrics.bag.train,
Random_Forest = metrics.rf.train,
Random_Forest_tuned = metrics.rf.tuned.train,
GBM = metrics.gbm.train,
XGBoost = metrics.xgb.train
)
comparison.test <- rbind(
Benchmark = metrics.benchmark.test,
OLS = metrics.ols.test,
GLMNET = metrics.glmnet.test,
CART_default = metrics.tree.default.test,
CART_pruned = metrics.tree.pruned.test,
Bagging = metrics.bag.test,
Random_Forest = metrics.rf.test,
Random_Forest_tuned = metrics.rf.tuned.test,
GBM = metrics.gbm.test,
XGBoost = metrics.xgb.test
)
comparison.train <- comparison.train %>%
rownames_to_column("model") %>%
arrange(RMSE)
comparison.test <- comparison.test %>%
rownames_to_column("model") %>%
arrange(RMSE)
write.csv(
comparison.train,
file.path(OUTPUT_DIR, "final_script_model_comparison_train.csv"),
row.names = FALSE
)
write.csv(
comparison.test,
file.path(OUTPUT_DIR, "final_script_model_comparison_test.csv"),
row.names = FALSE
)
comparison.train
comparison.test
plot.comparison <- comparison.test %>%
arrange(RMSE) %>%
ggplot(aes(x = reorder(model, RMSE), y = RMSE, fill = model)) +
geom_col(show.legend = FALSE) +
coord_flip() +
theme_bw() +
labs(
title = "Independent test-set RMSE by model",
x = "Model",
y = "RMSE"
)
plot.comparison
save_plot(plot.comparison, "final_model_comparison_rpubs.png")
extended.comparison.file <- file.path(EXTENDED_DIR, "extended_model_comparison_test.csv")
if (file.exists(extended.comparison.file)) {
extended.comparison <- read.csv(extended.comparison.file)
combined.comparison <- bind_rows(
comparison.test,
extended.comparison
) %>%
arrange(RMSE)
combined.comparison
}
The best model in the baseline report is the default Random Forest:
The MAE value means that, on average, the model prediction differs from the true nightly price by about 48 dollars on the independent test set.
In our opinion, this is a reasonable result for this type of data. A 48-dollar average absolute error is not small for a single booking decision, but it is much better than the benchmark and it is understandable given that many price drivers are not present in the dataset.
overfitting.check <- comparison.train %>%
select(model, RMSE_train = RMSE, MAE_train = MAE, R2_train = R2) %>%
inner_join(
comparison.test %>%
select(model, RMSE_test = RMSE, MAE_test = MAE, R2_test = R2),
by = "model"
) %>%
mutate(
RMSE_gap = RMSE_test - RMSE_train,
MAE_gap = MAE_test - MAE_train,
R2_gap = R2_train - R2_test
) %>%
arrange(RMSE_test)
overfitting.check
Random Forest achieved the best test RMSE, but it also shows a clear train-test gap. GBM generalizes more evenly than bagging and default Random Forest, but its test error is slightly higher. Linear models have smaller overfitting gaps, but their predictive performance is weaker.
Because of this, we would not describe Random Forest as a perfect model. We would describe it as the best compromise in this experiment: it predicts better than the alternatives on the test set, but the train-test gap suggests that it has learned some very specific patterns from the training data.
The most useful predictors are related to location, room type, availability and review activity. This is consistent with the business context: nightly price is strongly affected by where the listing is located, what type of accommodation it offers, and how active or visible it is on the platform.
Distance to Midtown Manhattan is an interpretable spatial feature. It converts latitude and longitude into an approximate distance from a central reference point in Manhattan. This helps the model represent the intuition that central locations tend to have different prices than peripheral locations.
This feature is also easy to explain to a non-technical audience. Instead of saying that the model uses raw latitude and longitude, we can say that it also uses an approximate distance from a well-known central point in New York.
The project satisfies the regression task requirements:
price is continuous,The best final model is the baseline Random Forest. It achieved the lowest independent test RMSE among the evaluated models. Its test MAE of about 48 dollars means that the average absolute prediction error is approximately 48 dollars per listing.
The extended tuning did not beat the baseline Random Forest. This is still an important result: more complex feature engineering and larger neural networks do not automatically improve generalization. The best model should be selected based on independent test performance, not on complexity.
If we were continuing this project, we would spend less time making the neural network larger and more time improving the feature set. The missing information about apartment standard, photos, exact dates, seasonality and local events is probably more important than adding another hidden layer.
For future work, the strongest directions are:
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Polish_Poland.utf8 LC_CTYPE=Polish_Poland.utf8
## [3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C
## [5] LC_TIME=Polish_Poland.utf8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] xgboost_3.2.1.1 gbm_2.2.3 ranger_0.18.0
## [4] randomForest_4.7-1.2 rpart.plot_3.1.4 rpart_4.1.24
## [7] glmnet_4.1-10 Matrix_1.7-3 corrplot_0.95
## [10] skimr_2.2.2 caret_7.0-1 lattice_0.22-7
## [13] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2
## [16] dplyr_1.2.1 purrr_1.1.0 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.3
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4051.111
## [4] farver_2.1.2 S7_0.2.0 fastmap_1.2.0
## [7] pROC_1.19.0.1 digest_0.6.37 timechange_0.3.0
## [10] lifecycle_1.0.5 survival_3.8-3 magrittr_2.0.4
## [13] compiler_4.5.1 rlang_1.1.7 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] knitr_1.51 labeling_0.4.3 plyr_1.8.9
## [22] repr_1.1.7 RColorBrewer_1.1-3 withr_3.0.2
## [25] nnet_7.3-20 grid_4.5.1 stats4_4.5.1
## [28] e1071_1.7-16 future_1.67.0 globals_0.18.0
## [31] scales_1.4.0 iterators_1.0.14 MASS_7.3-65
## [34] cli_3.6.5 rmarkdown_2.31 ragg_1.5.0
## [37] generics_0.1.4 otel_0.2.0 rstudioapi_0.17.1
## [40] future.apply_1.20.0 reshape2_1.4.4 tzdb_0.5.0
## [43] proxy_0.4-27 cachem_1.1.0 splines_4.5.1
## [46] parallel_4.5.1 base64enc_0.1-3 vctrs_0.7.1
## [49] hardhat_1.4.3 jsonlite_2.0.0 hms_1.1.3
## [52] listenv_0.9.1 systemfonts_1.3.1 foreach_1.5.2
## [55] gower_1.0.2 jquerylib_0.1.4 recipes_1.3.1
## [58] glue_1.8.0 parallelly_1.45.1 codetools_0.2-20
## [61] stringi_1.8.7 gtable_0.3.6 shape_1.4.6.1
## [64] pillar_1.11.1 htmltools_0.5.8.1 ipred_0.9-15
## [67] lava_1.8.2 R6_2.6.1 textshaping_1.0.3
## [70] evaluate_1.0.5 bslib_0.9.0 class_7.3-23
## [73] Rcpp_1.1.0 nlme_3.1-168 prodlim_2025.04.28
## [76] xfun_0.53 pkgconfig_2.0.3 ModelMetrics_1.2.2.2