1 Introduction

This project solves a supervised machine learning regression problem. The goal is to predict the continuous target variable price for Airbnb listings in New York City.

The analysis uses the New York City Airbnb Open Data dataset published on Kaggle and provided by lecturer:

https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data

The project compares three model families required in the course:

The report follows the workflow used during classes: data preparation, exploratory analysis, train-test split, cross-validation, model comparison, interpretation, and final conclusions.

We treated this dataset as a practical prediction task rather than as a purely theoretical exercise. In real Airbnb pricing, perfect predictions are unlikely: two apartments in the same area can differ because of photos, interior quality, season, host behaviour, or temporary events. For this reason, we focus not only on finding the smallest error, but also on understanding which model is stable enough to trust.

2 Libraries

# Install only once if needed:
# install.packages(c(
#   "tidyverse", "caret", "skimr", "corrplot", "glmnet", "rpart",
#   "rpart.plot", "randomForest", "ranger", "gbm", "xgboost", "pdp"
# ))
#
# XGBoost note:
# Newer xgboost versions can be unstable with caret::train(method = "xgbTree").
# If caret compatibility is needed, use:
# remotes::install_version("xgboost", version = "1.7.6.1")
#
# Keras note:
# Keras models were trained separately and saved to disk. The final report reads
# their validation and test metrics, so the report does not need to retrain them.

library(tidyverse)
library(caret)
library(skimr)
library(corrplot)
library(glmnet)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ranger)
library(gbm)
library(xgboost)

options(contrasts = c("contr.treatment", "contr.treatment"))
DATA_FILE <- params$data_file
OUTPUT_DIR <- params$output_dir
MODELS_DIR <- file.path(OUTPUT_DIR, "saved_models")
PLOTS_DIR <- file.path(OUTPUT_DIR, "plots")
EXTENDED_DIR <- file.path(OUTPUT_DIR, "extended_tuning")
EXTENDED_MODELS_DIR <- file.path(EXTENDED_DIR, "saved_models")

dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)
dir.create(PLOTS_DIR, recursive = TRUE, showWarnings = FALSE)

save_plot <- function(plot_object, filename, width = 8, height = 5) {
  ggplot2::ggsave(
    filename = file.path(PLOTS_DIR, filename),
    plot = plot_object,
    width = width,
    height = height,
    dpi = 150
  )
}

regression_metrics <- function(real, predicted) {
  predicted.msle <- pmax(predicted, 0)
  mse <- mean((real - predicted)^2)
  
  data.frame(
    MSE = mse,
    RMSE = sqrt(mse),
    MAE = mean(abs(real - predicted)),
    MedAE = median(abs(real - predicted)),
    MAPE = mean(abs(real - predicted) / real),
    MSLE = mean((log1p(real) - log1p(predicted.msle))^2),
    R2 = 1 - sum((real - predicted)^2) / sum((real - mean(real))^2)
  )
}

3 Data

airbnb.raw <- read.csv(DATA_FILE)

glimpse(airbnb.raw)
## Rows: 31,753
## Columns: 14
## $ price                          <int> 125, 40, 300, 129, 40, 100, 60, 90, 70,…
## $ neighbourhood_group            <chr> "Brooklyn", "Brooklyn", "Manhattan", "M…
## $ neighbourhood                  <chr> "Prospect Heights", "Bedford-Stuyvesant…
## $ room_type                      <chr> "Entire home/apt", "Private room", "Ent…
## $ latitude                       <dbl> 40.67717, 40.68477, 40.71960, 40.71489,…
## $ longitude                      <dbl> -73.96915, -73.95090, -73.98133, -73.99…
## $ minimum_nights                 <int> 30, 2, 3, 7, 7, 3, 1, 26, 1, 3, 1, 30, …
## $ number_of_reviews              <int> 1, 1, 30, 11, 0, 0, 54, 0, 7, 1, 6, 57,…
## $ calculated_host_listings_count <int> 1, 1, 2, 1, 1, 1, 1, 2, 4, 8, 1, 6, 2, …
## $ availability_365               <int> 341, 1, 182, 188, 0, 352, 0, 90, 347, 3…
## $ is_central                     <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ reviews_density                <dbl> 0.002923977, 0.500000000, 0.163934426, …
## $ high_availability              <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ host_multi_listing             <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
dim(airbnb.raw)
## [1] 31753    14
head(airbnb.raw)
summary(airbnb.raw)
##      price       neighbourhood_group neighbourhood       room_type        
##  Min.   : 10.0   Length:31753        Length:31753       Length:31753      
##  1st Qu.: 69.0   Class :character    Class :character   Class :character  
##  Median :105.0   Mode  :character    Mode  :character   Mode  :character  
##  Mean   :140.6                                                            
##  3rd Qu.:175.0                                                            
##  Max.   :999.0                                                            
##     latitude       longitude      minimum_nights     number_of_reviews
##  Min.   :40.50   Min.   :-74.24   Min.   :   1.000   Min.   :  0.00   
##  1st Qu.:40.69   1st Qu.:-73.98   1st Qu.:   1.000   1st Qu.:  1.00   
##  Median :40.72   Median :-73.96   Median :   2.000   Median :  5.00   
##  Mean   :40.73   Mean   :-73.95   Mean   :   6.977   Mean   : 23.35   
##  3rd Qu.:40.76   3rd Qu.:-73.94   3rd Qu.:   5.000   3rd Qu.: 24.00   
##  Max.   :40.91   Max.   :-73.72   Max.   :1250.000   Max.   :607.00   
##  calculated_host_listings_count availability_365   is_central   
##  Min.   :  1.000                Min.   :  0.0    Min.   :0.000  
##  1st Qu.:  1.000                1st Qu.:  0.0    1st Qu.:1.000  
##  Median :  1.000                Median : 45.0    Median :1.000  
##  Mean   :  7.308                Mean   :112.8    Mean   :0.853  
##  3rd Qu.:  2.000                3rd Qu.:228.0    3rd Qu.:1.000  
##  Max.   :327.000                Max.   :365.0    Max.   :1.000  
##  reviews_density     high_availability host_multi_listing
##  Min.   :  0.00000   Min.   :0.0000    Min.   :0.0000    
##  1st Qu.:  0.00836   1st Qu.:0.0000    1st Qu.:0.0000    
##  Median :  0.20495   Median :0.0000    Median :0.0000    
##  Mean   :  3.33422   Mean   :0.2735    Mean   :0.3409    
##  3rd Qu.:  1.91667   3rd Qu.:1.0000    3rd Qu.:1.0000    
##  Max.   :480.00000   Max.   :1.0000    Max.   :1.0000
colSums(is.na(airbnb.raw)) %>% sort(decreasing = TRUE)
##                          price            neighbourhood_group 
##                              0                              0 
##                  neighbourhood                      room_type 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
## calculated_host_listings_count               availability_365 
##                              0                              0 
##                     is_central                reviews_density 
##                              0                              0 
##              high_availability             host_multi_listing 
##                              0                              0
skim(airbnb.raw)
Data summary
Name airbnb.raw
Number of rows 31753
Number of columns 14
_______________________
Column type frequency:
character 3
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
neighbourhood_group 0 1 5 13 0 5 0
neighbourhood 0 1 4 26 0 217 0
room_type 0 1 11 15 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
price 0 1 140.56 112.91 10.00 69.00 105.00 175.00 999.00 ▇▁▁▁▁
latitude 0 1 40.73 0.05 40.50 40.69 40.72 40.76 40.91 ▁▁▇▅▁
longitude 0 1 -73.95 0.05 -74.24 -73.98 -73.96 -73.94 -73.72 ▁▁▇▂▁
minimum_nights 0 1 6.98 21.03 1.00 1.00 2.00 5.00 1250.00 ▇▁▁▁▁
number_of_reviews 0 1 23.35 44.33 0.00 1.00 5.00 24.00 607.00 ▇▁▁▁▁
calculated_host_listings_count 0 1 7.31 33.48 1.00 1.00 1.00 2.00 327.00 ▇▁▁▁▁
availability_365 0 1 112.79 131.72 0.00 0.00 45.00 228.00 365.00 ▇▂▁▁▂
is_central 0 1 0.85 0.35 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
reviews_density 0 1 3.33 12.94 0.00 0.01 0.20 1.92 480.00 ▇▁▁▁▁
high_availability 0 1 0.27 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
host_multi_listing 0 1 0.34 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅

The dataset contains listing characteristics such as location, neighbourhood, room type, review activity, minimum nights, availability and host activity. The target variable is price.

Our first impression from the data was that this is not a clean textbook regression problem. Prices are very uneven, and neighbourhood information is important but also quite granular. That made us expect tree-based models to do well, because they can capture non-linear differences between locations and room types without forcing one global linear relationship.

4 Preprocessing

The target must be positive. Categorical predictors are converted into factors. Binary variables are recoded into readable labels. Rare neighbourhood levels are grouped using only the training set in order to avoid data leakage.

airbnb.clean <- airbnb.raw %>%
  filter(!is.na(price), price > 0) %>%
  mutate(
    neighbourhood_group = factor(neighbourhood_group),
    neighbourhood = factor(neighbourhood),
    room_type = factor(room_type),
    is_central = factor(is_central, levels = c(0, 1), labels = c("no", "yes")),
    high_availability = factor(high_availability, levels = c(0, 1), labels = c("no", "yes")),
    host_multi_listing = factor(host_multi_listing, levels = c(0, 1), labels = c("no", "yes"))
  )

summary(airbnb.clean$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.0    69.0   105.0   140.6   175.0   999.0
colSums(is.na(airbnb.clean)) %>% sort(decreasing = TRUE)
##                          price            neighbourhood_group 
##                              0                              0 
##                  neighbourhood                      room_type 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
## calculated_host_listings_count               availability_365 
##                              0                              0 
##                     is_central                reviews_density 
##                              0                              0 
##              high_availability             host_multi_listing 
##                              0                              0
combine_rare_levels <- function(x, retained_levels) {
  x <- as.character(x)
  factor(ifelse(x %in% retained_levels, x, "other"))
}

add_engineered_features <- function(data) {
  data %>%
    mutate(
      log_minimum_nights = log1p(minimum_nights),
      log_number_of_reviews = log1p(number_of_reviews),
      log_host_listings = log1p(calculated_host_listings_count),
      log_reviews_density = log1p(reviews_density),
      distance_to_midtown_km = sqrt(
        ((latitude - 40.7580) * 111)^2 +
          ((longitude - (-73.9855)) * 85)^2
      ),
      location_grid = factor(
        paste(round(latitude, 2), round(longitude, 2), sep = "_")
      )
    )
}

The additional variables are used in the extended tuning stage. They include logarithms of skewed predictors, distance from Midtown Manhattan, and a local coordinate grid.

We decided not to remove expensive listings automatically. Some of them are probably unusual, but they still represent real offers that a pricing model could encounter. Instead of deleting them, we kept them in the main modelling task and used robust interpretation through MAE together with RMSE.

5 Exploratory Data Analysis

plot.price <- ggplot(airbnb.clean, aes(x = price)) +
  geom_histogram(bins = 80, fill = "#2c7fb8", color = "white") +
  coord_cartesian(xlim = c(0, quantile(airbnb.clean$price, 0.99))) +
  theme_bw() +
  labs(
    title = "Distribution of Airbnb prices",
    x = "Price",
    y = "Number of listings"
  )

plot.price

save_plot(plot.price, "final_eda_price_distribution.png")
plot.log.price <- ggplot(airbnb.clean, aes(x = log1p(price))) +
  geom_histogram(bins = 60, fill = "#41ab5d", color = "white") +
  theme_bw() +
  labs(
    title = "Distribution of log1p(price)",
    x = "log1p(price)",
    y = "Number of listings"
  )

plot.log.price

save_plot(plot.log.price, "final_eda_log_price_distribution.png")

The raw target is right-skewed. This justifies comparing models trained on price and on log1p(price).

This skewness is one of the reasons why we did not want to rely on a single metric. RMSE reacts strongly to very expensive listings, while MAE gives a more intuitive answer to the question: “how many dollars off is the model on average?”

plot.room <- ggplot(airbnb.clean, aes(x = room_type, y = price, fill = room_type)) +
  geom_boxplot(outlier.alpha = 0.15) +
  coord_cartesian(ylim = c(0, quantile(airbnb.clean$price, 0.99))) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(
    title = "Price by room type",
    x = "Room type",
    y = "Price"
  )

plot.room

save_plot(plot.room, "final_eda_price_by_room_type.png")
plot.borough <- ggplot(
  airbnb.clean,
  aes(x = reorder(neighbourhood_group, price, median), y = price, fill = neighbourhood_group)
) +
  geom_boxplot(outlier.alpha = 0.10) +
  coord_cartesian(ylim = c(0, quantile(airbnb.clean$price, 0.99))) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(
    title = "Price by borough",
    x = "Borough",
    y = "Price"
  )

plot.borough

save_plot(plot.borough, "final_eda_price_by_borough.png")
plot.spatial <- ggplot(
  airbnb.clean,
  aes(x = longitude, y = latitude, color = price)
) +
  geom_point(alpha = 0.45, size = 0.6) +
  scale_color_viridis_c(option = "C", trans = "log10") +
  theme_bw() +
  labs(
    title = "Spatial distribution of prices",
    x = "Longitude",
    y = "Latitude",
    color = "Price"
  )

plot.spatial

save_plot(plot.spatial, "final_eda_spatial_prices.png")
numeric.data <- airbnb.clean %>%
  select(where(is.numeric))

cor.matrix <- cor(numeric.data, use = "pairwise.complete.obs")

corrplot(
  cor.matrix,
  method = "color",
  type = "upper",
  tl.cex = 0.7,
  tl.col = "black"
)

6 Train-Test Split

set.seed(123)
train.ids <- createDataPartition(
  airbnb.clean$price,
  p = 0.70,
  list = FALSE
)

airbnb.train <- airbnb.clean[train.ids, ]
airbnb.test <- airbnb.clean[-train.ids, ]

real.train <- airbnb.train$price
real.test <- airbnb.test$price

popular.neighbourhoods <- airbnb.train %>%
  count(neighbourhood, sort = TRUE) %>%
  filter(n >= 100) %>%
  pull(neighbourhood) %>%
  as.character()

airbnb.train <- airbnb.train %>%
  mutate(neighbourhood = combine_rare_levels(neighbourhood, popular.neighbourhoods))

airbnb.test <- airbnb.test %>%
  mutate(neighbourhood = combine_rare_levels(neighbourhood, popular.neighbourhoods))

factor.variables <- names(airbnb.train)[sapply(airbnb.train, is.factor)]

for (variable in factor.variables) {
  airbnb.test[[variable]] <- factor(
    airbnb.test[[variable]],
    levels = levels(airbnb.train[[variable]])
  )
}

# The saved OLS and GLMNET models were trained after removing near-zero
# variance predictors. The same step must be reproduced before prediction.
nzv.index <- nearZeroVar(airbnb.train %>% select(-price))
nzv.names <- names(airbnb.train %>% select(-price))[nzv.index]
nzv.names
## character(0)
airbnb.train.ols <- airbnb.train %>%
  select(-any_of(nzv.names))

airbnb.test.ols <- airbnb.test %>%
  select(-any_of(nzv.names))

airbnb.train.tree <- airbnb.train
airbnb.test.tree <- airbnb.test

The split is done before preprocessing decisions that depend on the data. This prevents test-set information from entering the training process.

We kept the test set separate throughout the analysis, even when it was tempting to use it for quick decisions. This is important because a model that looks good after many informal checks on the test set may be partly tuned to that specific test sample.

7 Benchmark and Metrics

pred.benchmark.train <- rep(mean(real.train), length(real.train))
pred.benchmark.test <- rep(mean(real.train), length(real.test))

metrics.benchmark.train <- regression_metrics(real.train, pred.benchmark.train)
metrics.benchmark.test <- regression_metrics(real.test, pred.benchmark.test)

rbind(
  train = metrics.benchmark.train,
  test = metrics.benchmark.test
)

The main metrics are:

For final interpretation, MAE is the easiest metric to explain in business terms. RMSE is useful for model selection because large mistakes matter, but MAE is closer to how a host or platform user would feel the model error.

8 Linear Models

8.1 OLS

# Original training command:
# model.ols <- lm(price ~ ., data = airbnb.train.ols)
# saveRDS(model.ols, file.path(MODELS_DIR, "airbnb_ols.rds"))

model.ols <- readRDS(file.path(MODELS_DIR, "airbnb_ols.rds"))

summary(model.ols)
## 
## Call:
## lm(formula = price ~ ., data = airbnb.train.ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222.35  -45.03  -13.81   17.52  846.15 
## 
## Coefficients: (1 not defined because of singularities)
##                                             Estimate    Std. Error t value
## (Intercept)                            -15417.950609   3006.165589  -5.129
## neighbourhood_groupBrooklyn                16.104732     10.190459   1.580
## neighbourhood_groupManhattan              100.020457      9.060477  11.039
## neighbourhood_groupQueens                  20.312757      7.503940   2.707
## neighbourhood_groupStaten Island          -37.084814     15.489284  -2.394
## neighbourhoodBedford-Stuyvesant             2.623407      7.518619   0.349
## neighbourhoodBushwick                       5.112506      8.084026   0.632
## neighbourhoodCarroll Gardens               37.680524     10.645304   3.540
## neighbourhoodChelsea                       -3.331401      8.734448  -0.381
## neighbourhoodChinatown                    -45.003830     10.625785  -4.235
## neighbourhoodClinton Hill                  29.431944      8.771151   3.356
## neighbourhoodCrown Heights                  5.846861      7.837011   0.746
## neighbourhoodDitmars Steinway              -3.815037      8.643316  -0.441
## neighbourhoodEast Flatbush                 -9.419398      9.614256  -0.980
## neighbourhoodEast Harlem                  -63.875012      9.548876  -6.689
## neighbourhoodEast New York                 -7.494405     12.241237  -0.612
## neighbourhoodEast Village                 -29.571197      8.676843  -3.408
## neighbourhoodFinancial District           -25.978687      9.720309  -2.673
## neighbourhoodFlatbush                      -6.828764      8.896873  -0.768
## neighbourhoodFlushing                      18.712239      8.583837   2.180
## neighbourhoodFort Greene                   33.230401      8.994234   3.695
## neighbourhoodGowanus                       32.940883     10.619896   3.102
## neighbourhoodGramercy                     -24.920101     10.792574  -2.309
## neighbourhoodGreenpoint                    26.611680      8.163858   3.260
## neighbourhoodGreenwich Village              2.091020     10.155425   0.206
## neighbourhoodHarlem                       -72.407619      9.080123  -7.974
## neighbourhoodHell's Kitchen               -13.064670      8.338871  -1.567
## neighbourhoodInwood                       -97.716303     12.697302  -7.696
## neighbourhoodJamaica                       21.835299     11.410874   1.914
## neighbourhoodKips Bay                     -29.867372     10.234283  -2.918
## neighbourhoodLong Island City              19.483584      7.319349   2.662
## neighbourhoodLower East Side              -42.710321      9.232878  -4.626
## neighbourhoodMidtown                       32.819729      8.749832   3.751
## neighbourhoodMorningside Heights          -65.764210     11.186332  -5.879
## neighbourhoodMurray Hill                  -15.491447     10.155271  -1.525
## neighbourhoodNolita                         0.668912     11.664186   0.057
## neighbourhoodother                          5.478261      6.325343   0.866
## neighbourhoodPark Slope                    43.212795      8.732484   4.949
## neighbourhoodProspect-Lefferts Gardens     -5.898835      9.135760  -0.646
## neighbourhoodProspect Heights              26.030411      9.634914   2.702
## neighbourhoodRidgewood                    -10.889638      8.101254  -1.344
## neighbourhoodSoHo                          39.593799     10.799113   3.666
## neighbourhoodSouth Slope                   35.024787     10.730664   3.264
## neighbourhoodSunnyside                     -9.386204      8.280638  -1.134
## neighbourhoodSunset Park                   -2.671060      9.055908  -0.295
## neighbourhoodTheater District              18.806875     11.368187   1.654
## neighbourhoodUpper East Side              -39.304005      8.938378  -4.397
## neighbourhoodUpper West Side              -31.650224      8.635101  -3.665
## neighbourhoodWashington Heights           -91.779771     10.311394  -8.901
## neighbourhoodWest Village                  10.366779      9.189347   1.128
## neighbourhoodWilliamsburg                  36.570388      7.320531   4.996
## neighbourhoodWoodside                      -5.889698      9.876853  -0.596
## room_typePrivate room                     -91.472272      1.317084 -69.451
## room_typeShared room                     -113.444715      4.046759 -28.033
## latitude                                   51.208969     40.629400   1.260
## longitude                                -182.015142     36.219518  -5.025
## minimum_nights                             -0.313990      0.028246 -11.116
## number_of_reviews                          -0.156840      0.014754 -10.630
## calculated_host_listings_count             -0.008554      0.021757  -0.393
## availability_365                            0.212647      0.011573  18.374
## is_centralyes                                     NA            NA      NA
## reviews_density                             0.058165      0.051135   1.137
## high_availabilityyes                      -24.606196      3.250890  -7.569
## host_multi_listingyes                      -2.967514      1.452801  -2.043
##                                                    Pr(>|t|)    
## (Intercept)                             0.00000029408648903 ***
## neighbourhood_groupBrooklyn                        0.114036    
## neighbourhood_groupManhattan           < 0.0000000000000002 ***
## neighbourhood_groupQueens                          0.006796 ** 
## neighbourhood_groupStaten Island                   0.016664 *  
## neighbourhoodBedford-Stuyvesant                    0.727152    
## neighbourhoodBushwick                              0.527118    
## neighbourhoodCarroll Gardens                       0.000401 ***
## neighbourhoodChelsea                               0.702903    
## neighbourhoodChinatown                  0.00002291245001899 ***
## neighbourhoodClinton Hill                          0.000793 ***
## neighbourhoodCrown Heights                         0.455641    
## neighbourhoodDitmars Steinway                      0.658938    
## neighbourhoodEast Flatbush                         0.327229    
## neighbourhoodEast Harlem                0.00000000002296315 ***
## neighbourhoodEast New York                         0.540394    
## neighbourhoodEast Village                          0.000655 ***
## neighbourhoodFinancial District                    0.007532 ** 
## neighbourhoodFlatbush                              0.442765    
## neighbourhoodFlushing                              0.029272 *  
## neighbourhoodFort Greene                           0.000221 ***
## neighbourhoodGowanus                               0.001926 ** 
## neighbourhoodGramercy                              0.020952 *  
## neighbourhoodGreenpoint                            0.001117 ** 
## neighbourhoodGreenwich Village                     0.836870    
## neighbourhoodHarlem                     0.00000000000000161 ***
## neighbourhoodHell's Kitchen                        0.117195    
## neighbourhoodInwood                     0.00000000000001464 ***
## neighbourhoodJamaica                               0.055690 .  
## neighbourhoodKips Bay                              0.003522 ** 
## neighbourhoodLong Island City                      0.007775 ** 
## neighbourhoodLower East Side            0.00000375092517111 ***
## neighbourhoodMidtown                               0.000177 ***
## neighbourhoodMorningside Heights        0.00000000418720647 ***
## neighbourhoodMurray Hill                           0.127159    
## neighbourhoodNolita                                0.954269    
## neighbourhoodother                                 0.386455    
## neighbourhoodPark Slope                 0.00000075331432597 ***
## neighbourhoodProspect-Lefferts Gardens             0.518489    
## neighbourhoodProspect Heights                      0.006904 ** 
## neighbourhoodRidgewood                             0.178900    
## neighbourhoodSoHo                                  0.000247 ***
## neighbourhoodSouth Slope                           0.001100 ** 
## neighbourhoodSunnyside                             0.257011    
## neighbourhoodSunset Park                           0.768033    
## neighbourhoodTheater District                      0.098072 .  
## neighbourhoodUpper East Side            0.00001101570229775 ***
## neighbourhoodUpper West Side                       0.000248 ***
## neighbourhoodWashington Heights        < 0.0000000000000002 ***
## neighbourhoodWest Village                          0.259277    
## neighbourhoodWilliamsburg               0.00000059101325900 ***
## neighbourhoodWoodside                              0.550972    
## room_typePrivate room                  < 0.0000000000000002 ***
## room_typeShared room                   < 0.0000000000000002 ***
## latitude                                           0.207541    
## longitude                               0.00000050646876510 ***
## minimum_nights                         < 0.0000000000000002 ***
## number_of_reviews                      < 0.0000000000000002 ***
## calculated_host_listings_count                     0.694215    
## availability_365                       < 0.0000000000000002 ***
## is_centralyes                                            NA    
## reviews_density                                    0.255345    
## high_availabilityyes                    0.00000000000003906 ***
## host_multi_listingyes                              0.041102 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90.09 on 22166 degrees of freedom
## Multiple R-squared:  0.3668, Adjusted R-squared:  0.365 
## F-statistic: 207.1 on 62 and 22166 DF,  p-value: < 0.00000000000000022
pred.ols.train <- predict(model.ols, newdata = airbnb.train.ols)
pred.ols.test <- predict(model.ols, newdata = airbnb.test.ols)

metrics.ols.train <- regression_metrics(real.train, pred.ols.train)
metrics.ols.test <- regression_metrics(real.test, pred.ols.test)

rbind(train = metrics.ols.train, test = metrics.ols.test)
ols.coefficients <- data.frame(
  variable = names(coef(model.ols)),
  coefficient = as.numeric(coef(model.ols))
) %>%
  arrange(desc(abs(coefficient)))

head(ols.coefficients, 15)

8.2 Regularized Regression

control.cv <- trainControl(
  method = "cv",
  number = 5
)

grid.glmnet <- expand.grid(
  alpha = c(0, 0.5, 1),
  lambda = 10^seq(-3, 1, length = 20)
)

# Original tuning command:
# set.seed(123)
# model.glmnet <- train(
#   price ~ .,
#   data = airbnb.train.ols,
#   method = "glmnet",
#   trControl = control.cv,
#   tuneGrid = grid.glmnet,
#   metric = "RMSE"
# )
# saveRDS(model.glmnet, file.path(MODELS_DIR, "airbnb_glmnet.rds"))

model.glmnet <- readRDS(file.path(MODELS_DIR, "airbnb_glmnet.rds"))

model.glmnet
## glmnet 
## 
## 22229 samples
##    13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783 
## Resampling results across tuning parameters:
## 
##   alpha  lambda         RMSE       Rsquared   MAE     
##   0.0      0.001000000   90.39598  0.3611964  53.65085
##   0.0      0.001832981   90.39598  0.3611964  53.65085
##   0.0      0.003359818   90.39598  0.3611964  53.65085
##   0.0      0.006158482   90.39598  0.3611964  53.65085
##   0.0      0.011288379   90.39598  0.3611964  53.65085
##   0.0      0.020691381   90.39598  0.3611964  53.65085
##   0.0      0.037926902   90.39598  0.3611964  53.65085
##   0.0      0.069519280   90.39598  0.3611964  53.65085
##   0.0      0.127427499   90.39598  0.3611964  53.65085
##   0.0      0.233572147   90.39598  0.3611964  53.65085
##   0.0      0.428133240   90.39598  0.3611964  53.65085
##   0.0      0.784759970   90.39598  0.3611964  53.65085
##   0.0      1.438449888   90.39598  0.3611964  53.65085
##   0.0      2.636650899   90.39598  0.3611964  53.65085
##   0.0      4.832930239   90.39598  0.3611964  53.65085
##   0.0      8.858667904   90.53259  0.3599106  53.66280
##   0.0     16.237767392   90.81296  0.3578368  53.79253
##   0.0     29.763514416   91.37112  0.3546925  54.23014
##   0.0     54.555947812   92.47295  0.3494640  55.31123
##   0.0    100.000000000   94.41679  0.3407376  57.37683
##   0.5      0.001000000   90.24221  0.3630061  53.79135
##   0.5      0.001832981   90.24221  0.3630061  53.79135
##   0.5      0.003359818   90.24221  0.3630061  53.79135
##   0.5      0.006158482   90.24221  0.3630061  53.79135
##   0.5      0.011288379   90.24221  0.3630061  53.79135
##   0.5      0.020691381   90.24221  0.3630061  53.79135
##   0.5      0.037926902   90.24227  0.3630059  53.79126
##   0.5      0.069519280   90.24440  0.3629720  53.78433
##   0.5      0.127427499   90.25182  0.3628644  53.77034
##   0.5      0.233572147   90.26118  0.3627368  53.74542
##   0.5      0.428133240   90.29946  0.3622251  53.71824
##   0.5      0.784759970   90.37382  0.3612806  53.69331
##   0.5      1.438449888   90.53433  0.3593374  53.71599
##   0.5      2.636650899   90.87134  0.3554670  53.89250
##   0.5      4.832930239   91.52329  0.3485840  54.34087
##   0.5      8.858667904   92.98090  0.3327464  55.46401
##   0.5     16.237767392   95.21284  0.3120691  57.40628
##   0.5     29.763514416   99.34455  0.2716039  61.30409
##   0.5     54.555947812  104.91913  0.2325039  67.76424
##   0.5    100.000000000  112.52486  0.2085917  76.38939
##   1.0      0.001000000   90.24062  0.3630260  53.79134
##   1.0      0.001832981   90.24062  0.3630260  53.79134
##   1.0      0.003359818   90.24062  0.3630260  53.79134
##   1.0      0.006158482   90.24062  0.3630260  53.79134
##   1.0      0.011288379   90.24062  0.3630260  53.79134
##   1.0      0.020691381   90.24069  0.3630249  53.79123
##   1.0      0.037926902   90.24447  0.3629715  53.78501
##   1.0      0.069519280   90.25170  0.3628664  53.77193
##   1.0      0.127427499   90.25996  0.3627543  53.74740
##   1.0      0.233572147   90.29933  0.3622236  53.72270
##   1.0      0.428133240   90.37507  0.3612454  53.70110
##   1.0      0.784759970   90.54418  0.3591360  53.73218
##   1.0      1.438449888   90.90319  0.3548002  53.93387
##   1.0      2.636650899   91.57131  0.3472117  54.38527
##   1.0      4.832930239   93.05908  0.3295869  55.49647
##   1.0      8.858667904   95.09350  0.3093418  57.16471
##   1.0     16.237767392   99.18747  0.2632792  60.91656
##   1.0     29.763514416  104.76296  0.2131308  67.48301
##   1.0     54.555947812  113.03935        NaN  76.96256
##   1.0    100.000000000  113.03935        NaN  76.96256
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.01128838.
model.glmnet$bestTune
pred.glmnet.train <- predict(model.glmnet, newdata = airbnb.train.ols)
pred.glmnet.test <- predict(model.glmnet, newdata = airbnb.test.ols)

metrics.glmnet.train <- regression_metrics(real.train, pred.glmnet.train)
metrics.glmnet.test <- regression_metrics(real.test, pred.glmnet.test)

rbind(train = metrics.glmnet.train, test = metrics.glmnet.test)

9 Tree-Based Models

9.1 CART

# Original training command:
# set.seed(123)
# model.tree.default <- rpart(
#   price ~ .,
#   data = airbnb.train.tree,
#   method = "anova"
# )
# saveRDS(model.tree.default, file.path(MODELS_DIR, "airbnb_cart_default.rds"))

model.tree.default <- readRDS(file.path(MODELS_DIR, "airbnb_cart_default.rds"))

pred.tree.default.train <- predict(model.tree.default, newdata = airbnb.train.tree)
pred.tree.default.test <- predict(model.tree.default, newdata = airbnb.test.tree)

metrics.tree.default.train <- regression_metrics(real.train, pred.tree.default.train)
metrics.tree.default.test <- regression_metrics(real.test, pred.tree.default.test)

rbind(train = metrics.tree.default.train, test = metrics.tree.default.test)
# Original pruning commands:
# set.seed(123)
# model.tree.controlled <- rpart(
#   price ~ .,
#   data = airbnb.train.tree,
#   method = "anova",
#   control = rpart.control(cp = 0.001, xval = 10)
# )
# best.cp <- model.tree.controlled$cptable[
#   which.min(model.tree.controlled$cptable[, "xerror"]),
#   "CP"
# ]
# model.tree.pruned <- prune(model.tree.controlled, cp = best.cp)
# saveRDS(model.tree.pruned, file.path(MODELS_DIR, "airbnb_cart_pruned.rds"))

model.tree.pruned <- readRDS(file.path(MODELS_DIR, "airbnb_cart_pruned.rds"))

pred.tree.pruned.train <- predict(model.tree.pruned, newdata = airbnb.train.tree)
pred.tree.pruned.test <- predict(model.tree.pruned, newdata = airbnb.test.tree)

metrics.tree.pruned.train <- regression_metrics(real.train, pred.tree.pruned.train)
metrics.tree.pruned.test <- regression_metrics(real.test, pred.tree.pruned.test)

rbind(train = metrics.tree.pruned.train, test = metrics.tree.pruned.test)
rpart.plot(
  model.tree.pruned,
  main = "Pruned regression tree"
)

9.2 Bagging and Random Forest

p.airbnb <- ncol(airbnb.train.tree) - 1

# Original bagging command:
# set.seed(123)
# model.bagging <- randomForest(
#   price ~ .,
#   data = airbnb.train.tree,
#   mtry = p.airbnb,
#   importance = TRUE
# )
# saveRDS(model.bagging, file.path(MODELS_DIR, "airbnb_bagging.rds"))

# Original Random Forest command:
# set.seed(123)
# model.rf <- randomForest(
#   price ~ .,
#   data = airbnb.train.tree,
#   importance = TRUE
# )
# saveRDS(model.rf, file.path(MODELS_DIR, "airbnb_random_forest.rds"))

model.bagging <- readRDS(file.path(MODELS_DIR, "airbnb_bagging.rds"))
model.rf <- readRDS(file.path(MODELS_DIR, "airbnb_random_forest.rds"))

pred.bag.train <- predict(model.bagging, newdata = airbnb.train.tree)
pred.bag.test <- predict(model.bagging, newdata = airbnb.test.tree)
pred.rf.train <- predict(model.rf, newdata = airbnb.train.tree)
pred.rf.test <- predict(model.rf, newdata = airbnb.test.tree)

metrics.bag.train <- regression_metrics(real.train, pred.bag.train)
metrics.bag.test <- regression_metrics(real.test, pred.bag.test)
metrics.rf.train <- regression_metrics(real.train, pred.rf.train)
metrics.rf.test <- regression_metrics(real.test, pred.rf.test)

rbind(
  Bagging_train = metrics.bag.train,
  Bagging_test = metrics.bag.test,
  RF_train = metrics.rf.train,
  RF_test = metrics.rf.test
)
grid.rf <- expand.grid(
  mtry = c(3, 5, 7, 9),
  splitrule = "variance",
  min.node.size = c(5, 10, 20)
)

# Original tuning command:
# set.seed(123)
# model.rf.tuned <- train(
#   price ~ .,
#   data = airbnb.train.tree,
#   method = "ranger",
#   trControl = control.cv,
#   tuneGrid = grid.rf,
#   metric = "RMSE",
#   importance = "permutation"
# )
# saveRDS(model.rf.tuned, file.path(MODELS_DIR, "airbnb_random_forest_tuned.rds"))

model.rf.tuned <- readRDS(file.path(MODELS_DIR, "airbnb_random_forest_tuned.rds"))

model.rf.tuned
## Random Forest 
## 
## 22229 samples
##    13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783 
## Resampling results across tuning parameters:
## 
##   mtry  min.node.size  RMSE      Rsquared   MAE     
##    2     5             97.29030  0.3260055  61.07318
##    2    10             97.88314  0.3193010  61.73504
##    2    20             97.73341  0.3200835  61.56104
##    4     5             90.85075  0.3865473  54.99254
##    4    10             90.87544  0.3869226  55.06398
##    4    20             91.17436  0.3823284  55.30367
##    6     5             86.81101  0.4272049  51.41793
##    6    10             87.06469  0.4242464  51.63108
##    6    20             87.13046  0.4232197  51.62803
##    8     5             84.83022  0.4450666  49.79568
##    8    10             84.94662  0.4437740  49.85793
##    8    20             85.18869  0.4407271  49.94693
##   10     5             83.98413  0.4514135  49.09499
##   10    10             84.08005  0.4506435  49.14023
##   10    20             84.26683  0.4488577  49.23690
##   12     5             83.61572  0.4542409  48.77377
##   12    10             83.73997  0.4529458  48.89917
##   12    20             83.85036  0.4521699  48.94918
## 
## Tuning parameter 'splitrule' was held constant at a value of variance
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 12, splitrule = variance
##  and min.node.size = 5.
model.rf.tuned$bestTune
pred.rf.tuned.train <- predict(model.rf.tuned, newdata = airbnb.train.tree)
pred.rf.tuned.test <- predict(model.rf.tuned, newdata = airbnb.test.tree)

metrics.rf.tuned.train <- regression_metrics(real.train, pred.rf.tuned.train)
metrics.rf.tuned.test <- regression_metrics(real.test, pred.rf.tuned.test)

rbind(train = metrics.rf.tuned.train, test = metrics.rf.tuned.test)
importance.rf <- randomForest::importance(model.rf)

importance.rf.df <- data.frame(
  variable = rownames(importance.rf),
  IncNodePurity = importance.rf[, "IncNodePurity"]
) %>%
  arrange(desc(IncNodePurity)) %>%
  slice(1:20)

plot.rf.importance <- ggplot(
  importance.rf.df,
  aes(x = reorder(variable, IncNodePurity), y = IncNodePurity)
) +
  geom_col(fill = "#2c7fb8") +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Random Forest variable importance",
    x = "Variable",
    y = "Increase in node purity"
  )

plot.rf.importance

save_plot(plot.rf.importance, "final_random_forest_importance.png")

9.3 GBM

grid.gbm <- expand.grid(
  interaction.depth = c(1, 2, 3),
  n.trees = c(100, 300, 500),
  shrinkage = c(0.01, 0.05),
  n.minobsinnode = c(10, 20)
)

# Original tuning command:
# set.seed(123)
# model.gbm <- train(
#   price ~ .,
#   data = airbnb.train.tree,
#   method = "gbm",
#   trControl = control.cv,
#   tuneGrid = grid.gbm,
#   metric = "RMSE",
#   verbose = FALSE
# )
# saveRDS(model.gbm, file.path(MODELS_DIR, "airbnb_gbm.rds"))

model.gbm <- readRDS(file.path(MODELS_DIR, "airbnb_gbm.rds"))

model.gbm
## Stochastic Gradient Boosting 
## 
## 22229 samples
##    13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 17784, 17783, 17782, 17784, 17783 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  n.trees  RMSE       Rsquared 
##   0.01       1                  10              100      101.37632  0.2644749
##   0.01       1                  10              300       95.68549  0.3092401
##   0.01       1                  10              500       93.43678  0.3328510
##   0.01       1                  20              100      101.37512  0.2653646
##   0.01       1                  20              300       95.67114  0.3088630
##   0.01       1                  20              500       93.42151  0.3330108
##   0.01       2                  10              100       98.73190  0.2981678
##   0.01       2                  10              300       92.26147  0.3511508
##   0.01       2                  10              500       89.93124  0.3780368
##   0.01       2                  20              100       98.73527  0.2985133
##   0.01       2                  20              300       92.26225  0.3510600
##   0.01       2                  20              500       89.95995  0.3775847
##   0.01       3                  10              100       97.20368  0.3242921
##   0.01       3                  10              300       90.33850  0.3771726
##   0.01       3                  10              500       88.02433  0.4018948
##   0.01       3                  20              100       97.19500  0.3239763
##   0.01       3                  20              300       90.37368  0.3763813
##   0.01       3                  20              500       88.09297  0.4006288
##   0.05       1                  10              100       93.36590  0.3334723
##   0.05       1                  10              300       89.69518  0.3748576
##   0.05       1                  10              500       88.79215  0.3842214
##   0.05       1                  20              100       93.37143  0.3343785
##   0.05       1                  20              300       89.67429  0.3752339
##   0.05       1                  20              500       88.76947  0.3845758
##   0.05       2                  10              100       89.89319  0.3780530
##   0.05       2                  10              300       87.02157  0.4088911
##   0.05       2                  10              500       86.33595  0.4175179
##   0.05       2                  20              100       89.92557  0.3773864
##   0.05       2                  20              300       87.07697  0.4081333
##   0.05       2                  20              500       86.32500  0.4175460
##   0.05       3                  10              100       88.05525  0.4009267
##   0.05       3                  10              300       85.53757  0.4285283
##   0.05       3                  10              500       84.83256  0.4374377
##   0.05       3                  20              100       88.09349  0.4000672
##   0.05       3                  20              300       85.68321  0.4264457
##   0.05       3                  20              500       85.05745  0.4343600
##   MAE     
##   63.69982
##   57.60641
##   55.67469
##   63.70730
##   57.58794
##   55.63698
##   61.25522
##   54.84469
##   52.94598
##   61.25523
##   54.80262
##   52.91420
##   60.00274
##   53.42618
##   51.60880
##   60.00653
##   53.43218
##   51.63137
##   55.62788
##   52.89212
##   52.77673
##   55.61048
##   52.91206
##   52.79919
##   52.94567
##   51.03731
##   50.74902
##   52.90823
##   51.03901
##   50.71150
##   51.60729
##   49.97217
##   49.58412
##   51.59365
##   49.95622
##   49.65879
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 500, interaction.depth =
##  3, shrinkage = 0.05 and n.minobsinnode = 10.
model.gbm$bestTune
pred.gbm.train <- predict(model.gbm, newdata = airbnb.train.tree)
pred.gbm.test <- predict(model.gbm, newdata = airbnb.test.tree)

metrics.gbm.train <- regression_metrics(real.train, pred.gbm.train)
metrics.gbm.test <- regression_metrics(real.test, pred.gbm.test)

rbind(train = metrics.gbm.train, test = metrics.gbm.test)
plot.gbm.importance <- plot(
  varImp(model.gbm),
  top = 20,
  main = "GBM variable importance"
)

plot.gbm.importance

9.4 XGBoost

XGBoost was trained with the native API instead of caret::train(method = "xgbTree"), because recent XGBoost versions may not cooperate reliably with caret.

dummy.xgb <- dummyVars(price ~ ., data = airbnb.train.tree, fullRank = TRUE)

xgb.train.df <- predict(dummy.xgb, newdata = airbnb.train.tree) %>%
  as.data.frame()
xgb.test.df <- predict(dummy.xgb, newdata = airbnb.test.tree) %>%
  as.data.frame()

names(xgb.train.df) <- make.names(names(xgb.train.df), unique = TRUE)
names(xgb.test.df) <- make.names(names(xgb.test.df), unique = TRUE)

nzv.xgb <- nearZeroVar(xgb.train.df)

if (length(nzv.xgb) > 0) {
  xgb.train.df <- xgb.train.df[, -nzv.xgb, drop = FALSE]
  xgb.test.df <- xgb.test.df[, -nzv.xgb, drop = FALSE]
}

dtrain <- xgb.DMatrix(
  data = as.matrix(xgb.train.df),
  label = real.train
)

dtest <- xgb.DMatrix(
  data = as.matrix(xgb.test.df),
  label = real.test
)

# Original tuning command used xgb.cv() over a grid of eta, max_depth,
# min_child_weight, subsample and colsample_bytree. The best configuration was
# then refitted with xgb.train().
#
# model.xgb <- xgb.train(
#   params = params.xgb.best,
#   data = dtrain,
#   nrounds = best.xgb$nrounds,
#   verbose = 0
# )
# saveRDS(model.xgb, file.path(MODELS_DIR, "airbnb_xgboost.rds"))

model.xgb <- readRDS(file.path(MODELS_DIR, "airbnb_xgboost.rds"))

pred.xgb.train <- predict(model.xgb, newdata = dtrain)
pred.xgb.test <- predict(model.xgb, newdata = dtest)

metrics.xgb.train <- regression_metrics(real.train, pred.xgb.train)
metrics.xgb.test <- regression_metrics(real.test, pred.xgb.test)

rbind(train = metrics.xgb.train, test = metrics.xgb.test)
importance.xgb <- xgb.importance(
  feature_names = colnames(xgb.train.df),
  model = model.xgb
)

head(importance.xgb, 20)

10 Neural Networks

Keras was used instead of the older neuralnet package because it gives direct control over epochs, batch size, dropout, L2 regularization, early stopping and learning-rate reduction. The models were trained in a separate tuning script and saved to disk.

# Original simple Keras architecture pattern:
# model.keras <- keras_model_sequential(input_shape = c(ncol(x.nn.train))) %>%
#   layer_dense(units = 16, activation = "relu") %>%
#   layer_dense(units = 8, activation = "relu") %>%
#   layer_dense(units = 1, activation = "linear")
#
# model.keras %>% compile(
#   optimizer = optimizer_adam(learning_rate = 0.001),
#   loss = "mse",
#   metrics = c("mae")
# )
#
# history.keras <- model.keras %>% fit(
#   x = x.nn.train,
#   y = y.nn.train,
#   epochs = 200,
#   batch_size = 32,
#   validation_split = 0.20,
#   callbacks = list(callback_early_stopping(patience = 20)),
#   verbose = 0
# )

keras.results.file <- file.path(OUTPUT_DIR, "keras_validation_results.csv")

if (file.exists(keras.results.file)) {
  keras.validation.results <- read.csv(keras.results.file)
  keras.validation.results
}

11 Extended Tuning

The second tuning stage expanded two parts of the project:

rf.extended.cv.file <- file.path(EXTENDED_DIR, "rf_extended_cv_comparison.csv")
rf.extended.test.file <- file.path(EXTENDED_DIR, "rf_extended_test_metrics.csv")

if (file.exists(rf.extended.cv.file)) {
  rf.extended.cv <- read.csv(rf.extended.cv.file)
  rf.extended.cv
}
if (file.exists(rf.extended.test.file)) {
  rf.extended.test <- read.csv(rf.extended.test.file)
  rf.extended.test
}
keras.extended.file <- file.path(EXTENDED_DIR, "keras_extended_validation_results.csv")
keras.extended.test.file <- file.path(EXTENDED_DIR, "keras_extended_test_metrics.csv")

if (file.exists(keras.extended.file)) {
  keras.extended.results <- read.csv(keras.extended.file) %>%
    arrange(RMSE)
  
  head(keras.extended.results, 10)
}
if (file.exists(keras.extended.test.file)) {
  keras.extended.test <- read.csv(keras.extended.test.file)
  keras.extended.test
}

In validation, the best extended Keras configuration was a direct-price model with two hidden layers (128-64), dropout and L2 regularization. However, on the independent test set, the extended models did not improve over the best baseline Random Forest.

This was one of the more interesting parts of the project for us. The neural network tuning was more technically advanced, but it did not automatically translate into the best final result. In this dataset, the simpler Random Forest remained more competitive, which is a useful reminder that model complexity is not the same thing as model quality.

12 Model Comparison

comparison.train <- rbind(
  Benchmark = metrics.benchmark.train,
  OLS = metrics.ols.train,
  GLMNET = metrics.glmnet.train,
  CART_default = metrics.tree.default.train,
  CART_pruned = metrics.tree.pruned.train,
  Bagging = metrics.bag.train,
  Random_Forest = metrics.rf.train,
  Random_Forest_tuned = metrics.rf.tuned.train,
  GBM = metrics.gbm.train,
  XGBoost = metrics.xgb.train
)

comparison.test <- rbind(
  Benchmark = metrics.benchmark.test,
  OLS = metrics.ols.test,
  GLMNET = metrics.glmnet.test,
  CART_default = metrics.tree.default.test,
  CART_pruned = metrics.tree.pruned.test,
  Bagging = metrics.bag.test,
  Random_Forest = metrics.rf.test,
  Random_Forest_tuned = metrics.rf.tuned.test,
  GBM = metrics.gbm.test,
  XGBoost = metrics.xgb.test
)

comparison.train <- comparison.train %>%
  rownames_to_column("model") %>%
  arrange(RMSE)

comparison.test <- comparison.test %>%
  rownames_to_column("model") %>%
  arrange(RMSE)

write.csv(
  comparison.train,
  file.path(OUTPUT_DIR, "final_script_model_comparison_train.csv"),
  row.names = FALSE
)

write.csv(
  comparison.test,
  file.path(OUTPUT_DIR, "final_script_model_comparison_test.csv"),
  row.names = FALSE
)

comparison.train
comparison.test
plot.comparison <- comparison.test %>%
  arrange(RMSE) %>%
  ggplot(aes(x = reorder(model, RMSE), y = RMSE, fill = model)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Independent test-set RMSE by model",
    x = "Model",
    y = "RMSE"
  )

plot.comparison

save_plot(plot.comparison, "final_model_comparison_rpubs.png")
extended.comparison.file <- file.path(EXTENDED_DIR, "extended_model_comparison_test.csv")

if (file.exists(extended.comparison.file)) {
  extended.comparison <- read.csv(extended.comparison.file)
  
  combined.comparison <- bind_rows(
    comparison.test,
    extended.comparison
  ) %>%
    arrange(RMSE)
  
  combined.comparison
}

The best model in the baseline report is the default Random Forest:

The MAE value means that, on average, the model prediction differs from the true nightly price by about 48 dollars on the independent test set.

In our opinion, this is a reasonable result for this type of data. A 48-dollar average absolute error is not small for a single booking decision, but it is much better than the benchmark and it is understandable given that many price drivers are not present in the dataset.

13 Overfitting Check

overfitting.check <- comparison.train %>%
  select(model, RMSE_train = RMSE, MAE_train = MAE, R2_train = R2) %>%
  inner_join(
    comparison.test %>%
      select(model, RMSE_test = RMSE, MAE_test = MAE, R2_test = R2),
    by = "model"
  ) %>%
  mutate(
    RMSE_gap = RMSE_test - RMSE_train,
    MAE_gap = MAE_test - MAE_train,
    R2_gap = R2_train - R2_test
  ) %>%
  arrange(RMSE_test)

overfitting.check

Random Forest achieved the best test RMSE, but it also shows a clear train-test gap. GBM generalizes more evenly than bagging and default Random Forest, but its test error is slightly higher. Linear models have smaller overfitting gaps, but their predictive performance is weaker.

Because of this, we would not describe Random Forest as a perfect model. We would describe it as the best compromise in this experiment: it predicts better than the alternatives on the test set, but the train-test gap suggests that it has learned some very specific patterns from the training data.

14 Interpretation

The most useful predictors are related to location, room type, availability and review activity. This is consistent with the business context: nightly price is strongly affected by where the listing is located, what type of accommodation it offers, and how active or visible it is on the platform.

Distance to Midtown Manhattan is an interpretable spatial feature. It converts latitude and longitude into an approximate distance from a central reference point in Manhattan. This helps the model represent the intuition that central locations tend to have different prices than peripheral locations.

This feature is also easy to explain to a non-technical audience. Instead of saying that the model uses raw latitude and longitude, we can say that it also uses an approximate distance from a well-known central point in New York.

15 Conclusions

The project satisfies the regression task requirements:

The best final model is the baseline Random Forest. It achieved the lowest independent test RMSE among the evaluated models. Its test MAE of about 48 dollars means that the average absolute prediction error is approximately 48 dollars per listing.

The extended tuning did not beat the baseline Random Forest. This is still an important result: more complex feature engineering and larger neural networks do not automatically improve generalization. The best model should be selected based on independent test performance, not on complexity.

If we were continuing this project, we would spend less time making the neural network larger and more time improving the feature set. The missing information about apartment standard, photos, exact dates, seasonality and local events is probably more important than adding another hidden layer.

For future work, the strongest directions are:

16 Reproducibility

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
## [3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=Polish_Poland.utf8    
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] xgboost_3.2.1.1      gbm_2.2.3            ranger_0.18.0       
##  [4] randomForest_4.7-1.2 rpart.plot_3.1.4     rpart_4.1.24        
##  [7] glmnet_4.1-10        Matrix_1.7-3         corrplot_0.95       
## [10] skimr_2.2.2          caret_7.0-1          lattice_0.22-7      
## [13] lubridate_1.9.4      forcats_1.0.1        stringr_1.5.2       
## [16] dplyr_1.2.1          purrr_1.1.0          readr_2.1.5         
## [19] tidyr_1.3.1          tibble_3.3.0         ggplot2_4.0.3       
## [22] tidyverse_2.0.0     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4051.111   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] pROC_1.19.0.1        digest_0.6.37        timechange_0.3.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.51           labeling_0.4.3       plyr_1.8.9          
## [22] repr_1.1.7           RColorBrewer_1.1-3   withr_3.0.2         
## [25] nnet_7.3-20          grid_4.5.1           stats4_4.5.1        
## [28] e1071_1.7-16         future_1.67.0        globals_0.18.0      
## [31] scales_1.4.0         iterators_1.0.14     MASS_7.3-65         
## [34] cli_3.6.5            rmarkdown_2.31       ragg_1.5.0          
## [37] generics_0.1.4       otel_0.2.0           rstudioapi_0.17.1   
## [40] future.apply_1.20.0  reshape2_1.4.4       tzdb_0.5.0          
## [43] proxy_0.4-27         cachem_1.1.0         splines_4.5.1       
## [46] parallel_4.5.1       base64enc_0.1-3      vctrs_0.7.1         
## [49] hardhat_1.4.3        jsonlite_2.0.0       hms_1.1.3           
## [52] listenv_0.9.1        systemfonts_1.3.1    foreach_1.5.2       
## [55] gower_1.0.2          jquerylib_0.1.4      recipes_1.3.1       
## [58] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [61] stringi_1.8.7        gtable_0.3.6         shape_1.4.6.1       
## [64] pillar_1.11.1        htmltools_0.5.8.1    ipred_0.9-15        
## [67] lava_1.8.2           R6_2.6.1             textshaping_1.0.3   
## [70] evaluate_1.0.5       bslib_0.9.0          class_7.3-23        
## [73] Rcpp_1.1.0           nlme_3.1-168         prodlim_2025.04.28  
## [76] xfun_0.53            pkgconfig_2.0.3      ModelMetrics_1.2.2.2