Predictive Model for Hackney

Preparing the Data for Analysis

First, let's clear the memory:

rm(list=ls())

Import libraries:

suppressMessages(library(caret))
suppressMessages(library(tidyverse))
suppressMessages(library(skimr))
## Warning: package 'skimr' was built under R version 3.5.2
suppressMessages(library(ggplot2))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))
suppressMessages(library(grid))
suppressMessages(library(lattice))
suppressMessages(library(glmnet))
suppressMessages(library(rattle))
suppressMessages(library(ranger))

Set the folder locations:

data_in <- "data/"
func <- "code/"

Call functions:

# Load ggplot theme function
source(paste0(func, "theme_bg.R"))
# Load a helper file with some useful functions
source(paste0(func, "da_helper_functions.R"))
source(paste0(func, "Ch14_airbnb_prediction_functions.R"))
options(digits = 3) 

Load data:

suppressMessages(
data <- read_csv(paste0(data_in, "airbnb_", "london", "_workfile_adj.csv")) %>% 
  mutate_if(is.character, factor)  %>% 
  filter(!is.na(price))
)

Where do we have missing values now?

count_missing_values <- function(data) {
  num_missing_values <- map_int(data, function(x) sum(is.na(x)))
  num_missing_values[num_missing_values > 0] 
}

count_missing_values(data)
##        usd_cleaning_fee    p_host_response_rate             n_bathrooms 
##                   20017                   12868                     237 
##  n_review_scores_rating     n_reviews_per_month                  n_beds 
##                   16501                   15741                     167 
##            n_days_since                 ln_beds              f_bathroom 
##                   15741                     168                     237 
##     f_number_of_reviews           ln_days_since          ln_days_since2 
##                       1                   15749                   15749 
##          ln_days_since3           n_days_since2           n_days_since3 
##                   15749                   15741                   15741 
## ln_review_scores_rating        f_minimum_nights 
##                   16501                       1

Sample definition and preparation:

# We focus on normal apartments
data <- data %>% 
  filter(n_accommodates < 8)

# Drop observations that have no review score rating
data <- data %>% 
  drop_na(n_review_scores_rating)

# Create ln(price) variable
data <- data %>%
  mutate(ln_price = ifelse(is.na(price), price, log(price)))

# Handle some missing variables
data <- data %>%
  mutate(n_bathrooms = ifelse(is.na(n_bathrooms), 
                              median(n_bathrooms, na.rm = TRUE), 
                              n_bathrooms), # impute with the median
         
         n_beds = ifelse(is.na(n_beds), 
                         n_accommodates, 
                         n_beds)) # assume n_beds = n_accommodates

count_missing_values(data)
##     usd_cleaning_fee p_host_response_rate              ln_beds 
##                11201                 5446                   46 
##           f_bathroom  f_number_of_reviews        ln_days_since 
##                   94                    1                    8 
##       ln_days_since2       ln_days_since3     f_minimum_nights 
##                    8                    8                    1
# get rid of unused columns and keep only complete cases
data <- data %>% 
  select(-one_of("usd_cleaning_fee", "p_host_response_rate")) %>% 
  drop_na()

# copy a variable - used later when we look at variable importance
data <- data %>% 
  mutate(n_accommodates_copy = n_accommodates)

# Use only Airbnb apartments in London
data <- data %>%
  filter(f_property_type == "Apartment")

Prepare Data for Analysis

Create train and test samples:

train_data <- data %>%
  filter(f_neighbourhood_cleansed != "Hackney")
test_data <- data %>%
  filter(f_neighbourhood_cleansed == "Hackney")

DiM <- rbind(dim(train_data), dim(test_data))
rownames(DiM) <- c("Train", "Test")
colnames(DiM) <- c("Observations", "Variables")
DiM
##       Observations Variables
## Train        23647        87
## Test          2384        87

Define models:

# Basic Variables
basic_vars <- c("n_accommodates", "n_beds", "f_room_type", "n_days_since") 
basic_log <- c("ln_accommodates", "ln_beds", "f_room_type","ln_days_since") 

# Factorized variables
basic_add <- c("f_bathroom", "f_cancellation_policy", "f_bed_type") 
reviews <- c("f_number_of_reviews", "n_review_scores_rating") 

# Higher orders
poly_lev <- c("n_accommodates2", "n_days_since2", "n_days_since3")
poly_log <- c("ln_accommodates2","ln_days_since2","ln_days_since3")

# Dummy variables
amenities <-  grep("^d_.*", names(data), value = TRUE)
# Models in levels
predictors_lev0 <- "n_accommodates"
predictors_lev1 <- c(basic_vars, basic_add)
predictors_lev2 <- c(basic_vars, basic_add, reviews)
predictors_lev3 <- c(basic_vars, basic_add, reviews, poly_lev)
predictors_lev4 <- c(basic_vars, basic_add, reviews, poly_lev, amenities)

# Models in logs
predictors_log0 <- "ln_accommodates"
predictors_log1 <- c(basic_log, basic_add)
predictors_log2 <- c(basic_log, basic_add, reviews)
predictors_log3 <- c(basic_log, basic_add, reviews, poly_log)
predictors_log4 <- c(basic_log, basic_add, reviews, poly_log, amenities)

Random Forest

train_control <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

Random Forest Model 1:

# 1
# Very simple model
set.seed(1915362)
rf_model_lev1 <- train(
  formula(paste0("price ~", paste0(predictors_lev0, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5 
## - Fold1: mtry=1, splitrule=variance, min.node.size=5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=5 
## - Fold2: mtry=1, splitrule=variance, min.node.size=5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=5 
## - Fold3: mtry=1, splitrule=variance, min.node.size=5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=5 
## - Fold4: mtry=1, splitrule=variance, min.node.size=5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=5 
## - Fold5: mtry=1, splitrule=variance, min.node.size=5 
## Aggregating results
## Fitting final model on full training set

Random Forest Model 2:

# 2
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
rf_model_lev2 <- train(
  formula(paste0("price ~", paste0(predictors_lev1, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set

Random Forest Model 3:

# 3
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
rf_model_lev3 <- train(
  formula(paste0("price ~", paste0(predictors_lev4, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set
## Computing permutation importance.. Progress: 77%. Estimated remaining time: 9 seconds.
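The forests are trained with importance = "permutation", but the importances themselves are never printed. A minimal sketch to inspect them, assuming the standard caret interface (this step is not part of the original run; the n_accommodates_copy created earlier could be added to a predictor set to see how a duplicated variable splits importance with its twin):

# Hedged sketch, not shown in the original output: plot the stored
# permutation importances of the richest level model.
plot(varImp(rf_model_lev3), top = 10)  # top 10 predictors by importance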

Random Forest Model 4:

# 4
set.seed(1915362)
rf_model_log1 <- train(
  formula(paste0("ln_price ~", paste0(predictors_log0, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5 
## - Fold1: mtry=1, splitrule=variance, min.node.size=5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=5 
## - Fold2: mtry=1, splitrule=variance, min.node.size=5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=5 
## - Fold3: mtry=1, splitrule=variance, min.node.size=5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=5 
## - Fold4: mtry=1, splitrule=variance, min.node.size=5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=5 
## - Fold5: mtry=1, splitrule=variance, min.node.size=5 
## Aggregating results
## Fitting final model on full training set

Random Forest Model 5:

# 5
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
rf_model_log2 <- train(
  formula(paste0("ln_price ~", paste0(predictors_log1, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set

Random Forest Model 6:

# 6
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
rf_model_log3 <- train(
  formula(paste0("ln_price ~", paste0(predictors_log4, collapse = " + "))),
  data = train_data,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set
## Computing permutation importance.. Progress: 89%. Estimated remaining time: 3 seconds.

Evaluate random forests:

results <- resamples(
  list(
    lev_1 = rf_model_lev1, 
    lev_2 = rf_model_lev2,
    lev_3 = rf_model_lev3,
    log_1 = rf_model_log1,
    log_2 = rf_model_log2,
    log_3 = rf_model_log3
  )
)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lev_1, lev_2, lev_3, log_1, log_2, log_3 
## Number of resamples: 5 
## 
## MAE 
##         Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## lev_1 33.200  33.243 33.262 33.382  33.357 33.846    0
## lev_2 30.139  30.657 30.661 30.679  30.669 31.268    0
## lev_3 33.988  34.268 34.449 34.582  34.686 35.520    0
## log_1  0.357   0.359  0.363  0.363   0.364  0.373    0
## log_2  0.320   0.326  0.329  0.328   0.331  0.333    0
## log_3  0.372   0.378  0.378  0.378   0.380  0.380    0
## 
## RMSE 
##         Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## lev_1 48.806  49.045 49.651 50.367  51.780 52.551    0
## lev_2 45.562  46.646 46.871 47.764  49.261 50.478    0
## lev_3 50.394  50.928 51.680 52.296  53.090 55.389    0
## log_1  0.452   0.454  0.459  0.458   0.459  0.466    0
## log_2  0.410   0.415  0.420  0.418   0.421  0.423    0
## log_3  0.471   0.479  0.479  0.478   0.479  0.481    0
## 
## Rsquared 
##        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
## lev_1 0.330   0.348  0.353 0.351   0.360 0.363    0
## lev_2 0.439   0.473  0.485 0.476   0.488 0.492    0
## lev_3 0.452   0.465  0.481 0.475   0.484 0.494    0
## log_1 0.407   0.425  0.429 0.429   0.435 0.449    0
## log_2 0.589   0.589  0.601 0.599   0.603 0.613    0
## log_3 0.572   0.585  0.587 0.589   0.598 0.605    0

Note that the log models' MAE and RMSE are measured on the log scale of price, so they are not directly comparable with the level models. Comparing within each scale and by R-squared, the model named log_2 is the best random forest: it has the lowest MAE and RMSE among the log models and the highest R-squared of all six models.
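A hedged sketch of how the log winner could be put on the price scale for a direct comparison with the level models, assuming the usual log-normal correction (the sourced helper functions may handle this differently):

# Hedged sketch: back-transform log_2's in-sample predictions to prices.
pred_log   <- predict(rf_model_log2, newdata = train_data)
sigma_hat  <- sd(train_data$ln_price - pred_log)    # residual SD, log scale
pred_level <- exp(pred_log) * exp(sigma_hat^2 / 2)  # log-normal correction
RMSE(pred_level, train_data$price)                  # comparable to lev_* RMSEs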

Linear Models

# Create models in levels, models 0-4
modellev0 <- " ~ n_accommodates"
modellev1 <- paste0(" ~ ", paste(predictors_lev1, collapse="+"))
modellev2 <- paste0(" ~ ", paste(predictors_lev2, collapse="+")) 
modellev3 <- paste0(" ~ ", paste(predictors_lev3, collapse="+"))
modellev4 <- paste0(" ~ ", paste(predictors_lev4, collapse="+"))

# Create models in logs, models 0-4
modellog0 <- " ~ ln_accommodates"
modellog1 <- paste0(" ~ ", paste(predictors_log1, collapse="+"))
modellog2 <- paste0(" ~ ", paste(predictors_log2, collapse="+")) 
modellog3 <- paste0(" ~ ", paste(predictors_log3, collapse="+"))
modellog4 <- paste0(" ~ ", paste(predictors_log4, collapse="+"))

Create list to save model results:

model_results <- list()

# For models in levels and in logs
for (type in c("lev","log")) {
  # for each model
  for (i in 0:4) {
    
    # Get the proper model names
    model_name <- paste0("model",type,i)
    # Get the proper target variable
    yvar <- ifelse(type=="lev","price","ln_price")
    # Get the right-hand-side variables
    xvars <- eval(parse(text = model_name))
    # Create the appropriate formula
    formula <- formula(paste0(yvar,xvars))
    # Estimate on the training data
    model <- lm(formula,data = train_data)
    # Predict on the training sample (in-sample)
    prediction_train <- predict(model, newdata = train_data)
    # Predict on the test sample (out-of-sample)
    prediction_test <- predict(model, newdata = test_data)
    
    # Estimate the appropriate Criteria
    if (type=="lev") {
      mse_train <- RMSE(prediction_train, train_data[[yvar]], na.rm = TRUE)^2
      mse_test <- RMSE(prediction_test, test_data[[yvar]], na.rm = TRUE)^2
    } else {
      rmselog <- RMSE(prediction_train, train_data[[yvar]], na.rm = TRUE)
      mse_train <- mse_log(prediction_train, train_data[[yvar]], rmselog)
      mse_test <- mse_log(prediction_test, test_data[[yvar]], rmselog)
    }
    # Bayesian Criteria
    BIC <- BIC(model)
    # Save into model results
    model_results[[model_name]] <- list(
      yvar=yvar,xvars=xvars,formula=formula,model=model,
      prediction_train = prediction_train,prediction_test = prediction_test,
      mse_train = mse_train,mse_test = mse_test,BIC = BIC
    )
  }
}
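The mse_log() helper comes from the sourced da_helper_functions.R and its code is not shown here. A hedged sketch of what a helper with this signature typically computes, assuming it back-transforms the log predictions with the log-normal correction and takes the MSE in levels:

# Hedged sketch of an mse_log()-style helper (the real implementation in
# da_helper_functions.R may differ): compare log-scale predictions with
# the observed values on the price scale.
mse_log_sketch <- function(prediction_log, y_log, rmse_log) {
  prediction_level <- exp(prediction_log) * exp(rmse_log^2 / 2)  # correction
  mean((prediction_level - exp(y_log))^2, na.rm = TRUE)          # MSE in levels
}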

Compare the models:

imap(model_results,  ~{
  results <- as.data.frame(.x[c("mse_train", "mse_test", "BIC")])
  results %>% mutate(model = .y)
}) %>% 
  bind_rows() 
##    mse_train mse_test    BIC     model
## 1       2537     1670 252498 modellev0
## 2       2070     1404 247771 modellev1
## 3       2060     1411 247672 modellev2
## 4       2050     1397 247591 modellev3
## 5       1869     1326 245905 modellev4
## 6       2539     1665  30260 modellog0
## 7       2043     1361  21039 modellog1
## 8       2040     1367  20770 modellog2
## 9       2035     1367  20727 modellog3
## 10      1814     1275  18059 modellog4

Because it has the lowest test MSE and BIC, the model named modellog4 is the best model here.


Lasso

Take model 10 (modellog4) and keep only observations with no missing data:

vars_model_10 <- c("ln_price", predictors_log4)
data_train_complete <- train_data %>% 
  select(one_of(vars_model_10)) %>% 
  drop_na()

data_test_complete <- test_data %>% 
  select(one_of(vars_model_10)) %>% 
  drop_na()
train_control <- trainControl(method = "cv", number = 10)
tune_grid <- expand.grid("alpha" = c(1), "lambda" = seq(0.05, 1, by = 0.05))
formula <- formula(paste0("ln_price ~ ", paste(setdiff(vars_model_10, "ln_price"), collapse = " + ")))

Lasso Model:

set.seed(1234)
lasso_model <- train(formula,
                     data = data_train_complete,
                     method = "glmnet",
                     preProcess = c("center", "scale"),
                     trControl = train_control,
                     tuneGrid = tune_grid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
print(lasso_model$bestTune$lambda)
## [1] 0.05
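Note that the selected lambda, 0.05, sits at the lower edge of the tuning grid, so the cross-validated optimum may lie below it. A hedged sketch of a finer follow-up search (the grid values are illustrative):

# Hedged sketch, not part of the original run: refine the search below
# the boundary value lambda = 0.05.
tune_grid_fine <- expand.grid("alpha" = 1, "lambda" = seq(0.005, 0.05, by = 0.005))
set.seed(1234)
lasso_model_fine <- train(formula,
                          data = data_train_complete,
                          method = "glmnet",
                          preProcess = c("center", "scale"),
                          trControl = train_control,
                          tuneGrid = tune_grid_fine)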
lasso_coeffs <- coef(lasso_model$finalModel, lasso_model$bestTune$lambda) %>%
  as.matrix() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "variable") %>% 
  rename(coefficient = `1`)  

print(lasso_coeffs)
##                         variable coefficient
## 1                    (Intercept)      4.3445
## 2                ln_accommodates      0.1321
## 3                        ln_beds      0.0000
## 4        f_room_typePrivate room     -0.2465
## 5         f_room_typeShared room     -0.0420
## 6                  ln_days_since      0.0000
## 7                     f_bathroom      0.0446
## 8  f_cancellation_policymoderate      0.0000
## 9    f_cancellation_policystrict      0.0000
## 10            f_bed_typeReal Bed      0.0000
## 11           f_number_of_reviews      0.0000
## 12        n_review_scores_rating      0.0000
## 13              ln_accommodates2      0.0442
## 14                ln_days_since2      0.0000
## 15                ln_days_since3      0.0000
## 16               d_24hourcheckin      0.0000
## 17             d_airconditioning      0.0000
## 18                   d_breakfast      0.0000
## 19      d_buzzerwirelessintercom      0.0000
## 20                     d_cabletv      0.0000
## 21      d_carbonmonoxidedetector      0.0000
## 22                        d_cats      0.0000
## 23                        d_dogs      0.0000
## 24                     d_doorman      0.0000
## 25                d_doormanentry      0.0000
## 26                       d_dryer      0.0243
## 27          d_elevatorinbuilding      0.0000
## 28                  d_essentials      0.0000
## 29           d_familykidfriendly      0.0000
## 30            d_fireextinguisher      0.0000
## 31                 d_firstaidkit      0.0000
## 32       d_freeparkingonpremises     -0.0252
## 33         d_freeparkingonstreet      0.0000
## 34                         d_gym      0.0000
## 35                   d_hairdryer      0.0000
## 36                     d_hangers      0.0000
## 37                     d_heating      0.0000
## 38                      d_hottub      0.0000
## 39             d_indoorfireplace      0.0000
## 40                    d_internet      0.0000
## 41                        d_iron      0.0000
## 42                      d_keypad      0.0000
## 43                     d_kitchen      0.0000
## 44     d_laptopfriendlyworkspace      0.0000
## 45           d_lockonbedroomdoor      0.0000
## 46                     d_lockbox      0.0000
## 47                   d_otherpets      0.0000
## 48      d_paidparkingoffpremises      0.0000
## 49                 d_petsallowed      0.0000
## 50      d_petsliveonthisproperty      0.0000
## 51                        d_pool      0.0000
## 52             d_privateentrance      0.0000
## 53           d_privatelivingroom      0.0000
## 54                  d_safetycard      0.0000
## 55                 d_selfcheckin      0.0000
## 56                     d_shampoo      0.0000
## 57                   d_smartlock      0.0000
## 58               d_smokedetector      0.0000
## 59              d_smokingallowed      0.0000
## 60           d_suitableforevents      0.0000
## 61                          d_tv      0.0266
## 62                      d_washer      0.0000
## 63                 d_washerdryer      0.0000
## 64        d_wheelchairaccessible      0.0000
## 65            d_wirelessinternet      0.0000

Evaluate the model. Cross-validated MSE:

lasso_cv_rmse <- lasso_model$results %>% 
  filter(lambda == lasso_model$bestTune$lambda) %>% 
  select(RMSE)
print(lasso_cv_rmse[1, 1]^2)
## [1] 0.143

Evaluate on test set:

lasso_test_rmse <- RMSE(predict(lasso_model, newdata = data_test_complete), data_test_complete$ln_price)
print(lasso_test_rmse^2)
## [1] 0.11

Post Lasso OLS

Keep the variables selected by the lasso and run OLS. Show the predicted MSE:

nonzero_lasso_vars <- lasso_coeffs %>% 
  filter(coefficient != 0) %>% 
  pull(variable)
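One caveat before the next step: coef() returns model-matrix dummy names such as f_room_typePrivate room, so the intersect() below keeps only names that exist as raw columns and silently drops selected factor dummies. A hedged sketch of a hypothetical helper (not from the original analysis) that would map dummies back to their source columns instead:

# Hedged sketch: map dummy names like "f_room_typePrivate room" back to
# the column they came from, so a selected dummy keeps its whole factor.
base_vars <- names(data_train_complete)
matched <- map_chr(nonzero_lasso_vars, function(v) {
  hit <- base_vars[startsWith(v, base_vars)]           # columns that prefix v
  if (length(hit) > 0) hit[which.max(nchar(hit))] else NA_character_
})
post_lasso_vars_full <- unique(na.omit(matched))       # "(Intercept)" drops out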

Keep only the selected variables that exist as raw columns (the factor dummies are dropped for now):

post_lasso_ols_vars <- intersect(nonzero_lasso_vars, names(data_train_complete))

fit_control <- trainControl(method = "cv", number = 10)

set.seed(1234)  
post_lasso_ols_model <- train(
  formula(paste("ln_price ~ ", paste(post_lasso_ols_vars, collapse = " + "))), 
  data = data_train_complete, 
  method = "lm", 
  trControl = fit_control
)
post_lasso_ols_rmse <- post_lasso_ols_model$results[["RMSE"]]
post_lasso_ols_rmse^2 
## [1] 0.18

Evaluate on test set:

post_lasso_ols_test_rmse <- 
  RMSE(predict(post_lasso_ols_model, 
               newdata = data_test_complete), 
               data_test_complete$ln_price)
print(post_lasso_ols_test_rmse^2)
## [1] 0.158

Conclusion

We have random forest, linear, lasso and post-lasso models. The best prediction was made by the lasso, with a test MSE of 0.11 (on log price).
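One caveat on this comparison: the random forests were ranked by cross-validation only. A hedged sketch that would also score the best forest on the Hackney test set, for a like-for-like number (not part of the original analysis):

# Hedged sketch: test-set MSE of the best random forest, on the log
# scale and therefore comparable to the lasso's 0.11.
rf_test_rmse <- RMSE(predict(rf_model_log2, newdata = test_data),
                     test_data$ln_price)
rf_test_rmse^2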


Predictive Model for Manchester

Preparing the Data for Analysis

Load data:

suppressMessages(
data2 <- read_csv(paste0(data_in, "airbnb_", "bristol_and_london", 
                         "_workfile_adj.csv")) %>% 
  mutate_if(is.character, factor)  %>% 
  filter(!is.na(price))
)

suppressMessages(
data3 <- read_csv(paste0(data_in, "airbnb_", "manchester", 
                         "_workfile_adj.csv")) %>% 
  mutate_if(is.character, factor)  %>% 
  filter(!is.na(price))
)

Sample definition and preparation:

# We focus on normal apartments
data2 <- data2 %>% 
  filter(n_accommodates < 8)
data3 <- data3 %>% 
  filter(n_accommodates < 8)

# Drop observations that have no review score rating
data2 <- data2 %>% 
  drop_na(n_review_scores_rating)
data3 <- data3 %>% 
  drop_na(n_review_scores_rating)


# Handle some missing variables
data2 <- data2 %>%
  mutate(f_bathrooms = ifelse(is.na(f_bathrooms), 
                              median(f_bathrooms, na.rm = T), 
                              f_bathrooms), 
         
         n_beds = ifelse(is.na(n_beds), 
                         n_accommodates, 
                         n_beds))
data3 <- data3 %>%
  mutate(f_bathrooms = ifelse(is.na(f_bathrooms), 
                              median(f_bathrooms, na.rm = T), 
                              f_bathrooms), 
         
         n_beds = ifelse(is.na(n_beds), 
                         n_accommodates, 
                         n_beds))

Prepare Data for Analysis

Create train and test samples:

train_data2 <- data2 
test_data2 <- data3

DiM <- rbind(dim(train_data2), dim(test_data2))
rownames(DiM) <- c("Train", "Test")
colnames(DiM) <- c("Observations", "Variables")
DiM
##       Observations Variables
## Train        35886        16
## Test           612        16

Define models:

# Basic Variables
basic_vars <- c("n_accommodates", "n_beds", "f_room_type") 
basic_log <- c("ln_accommodates", "ln_beds", "f_room_type") 

# Factorized variables
basic_add <- c("f_bathrooms", "f_cancellation_policy", "f_bed_type") 
reviews <- c("n_review_scores_rating") 

# Higher orders
poly_lev <- c("n_accommodates2")
poly_log <- c("ln_accommodates2")
# Models in levels
predictors_lev0 <- "n_accommodates"
predictors_lev1 <- c(basic_vars, basic_add)
predictors_lev2 <- c(basic_vars, basic_add, reviews)
predictors_lev3 <- c(basic_vars, basic_add, reviews, poly_lev)

Random Forest

train_control <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

Random Forest Model 1:

# 1
# Very simple model
set.seed(1915362)
mrf_model_lev1 <- train(
  formula(paste0("price ~", paste0(predictors_lev0, collapse = " + "))),
  data = train_data2,
  method = "ranger",
  trControl = train_control,
  tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5 
## - Fold1: mtry=1, splitrule=variance, min.node.size=5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=5 
## - Fold2: mtry=1, splitrule=variance, min.node.size=5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=5 
## - Fold3: mtry=1, splitrule=variance, min.node.size=5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=5 
## - Fold4: mtry=1, splitrule=variance, min.node.size=5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=5 
## - Fold5: mtry=1, splitrule=variance, min.node.size=5 
## Aggregating results
## Fitting final model on full training set

Random Forest Model 2:

# 2
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
mrf_model_lev2 <- train(
  formula(paste0("price ~", paste0(predictors_lev1, collapse = " + "))),
  data = train_data2,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set

Random Forest Model 3:

# 3
tune_grid <- expand.grid(
  .mtry = 1,
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)

set.seed(1915362)
mrf_model_lev3 <- train(
  formula(paste0("price ~", paste0(predictors_lev3, collapse = " + "))),
  data = train_data2,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold1: mtry=1, splitrule=variance, min.node.size=10 
## - Fold1: mtry=1, splitrule=variance, min.node.size=10 
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold2: mtry=1, splitrule=variance, min.node.size=10 
## - Fold2: mtry=1, splitrule=variance, min.node.size=10 
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold3: mtry=1, splitrule=variance, min.node.size=10 
## - Fold3: mtry=1, splitrule=variance, min.node.size=10 
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold4: mtry=1, splitrule=variance, min.node.size=10 
## - Fold4: mtry=1, splitrule=variance, min.node.size=10 
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5 
## + Fold5: mtry=1, splitrule=variance, min.node.size=10 
## - Fold5: mtry=1, splitrule=variance, min.node.size=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set

Evaluate random forests:

results <- resamples(
  list(
    lev_1 = mrf_model_lev1, 
    lev_2 = mrf_model_lev2,
    lev_3 = mrf_model_lev3
  )
)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lev_1, lev_2, lev_3 
## Number of resamples: 5 
## 
## MAE 
##       Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 31.4    31.8   31.8 31.9    31.8 32.4    0
## lev_2 31.5    32.4   32.7 32.5    32.9 33.0    0
## lev_3 31.3    31.3   31.4 31.7    32.2 32.2    0
## 
## RMSE 
##       Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 47.4    49.2   50.3 50.0    51.2 52.1    0
## lev_2 49.1    49.3   51.6 51.4    53.3 53.6    0
## lev_3 47.8    48.9   50.0 50.4    52.3 52.8    0
## 
## Rsquared 
##        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
## lev_1 0.380   0.381  0.385 0.392   0.391 0.422    0
## lev_2 0.468   0.479  0.488 0.488   0.492 0.514    0
## lev_3 0.464   0.473  0.486 0.484   0.488 0.510    0

Even though it does not have the highest R-squared, the model named lev_1 is the best random forest model here: it has the lowest mean RMSE, and its MAE is essentially tied with lev_3.

Linear Models

# Create models in levels, models 0-3
mmodellev0 <- " ~ n_accommodates"
mmodellev1 <- paste0(" ~ ", paste(predictors_lev1, collapse="+"))
mmodellev2 <- paste0(" ~ ", paste(predictors_lev2, collapse="+")) 
mmodellev3 <- paste0(" ~ ", paste(predictors_lev3, collapse="+"))

Create list to save model results:

model_results <- list()

# Levels only (no log models for Manchester)
for (type in c("lev")) {
  # for each model
  for (i in 0:3) {
    
    # Get the proper model names
    model_name <- paste0("mmodel",type,i)
    # Get the proper target variable
    yvar <- ifelse(type=="lev","price","ln_price")
    # Get the right-hand-side variables
    xvars <- eval(parse(text = model_name))
    # Create the appropriate formula
    formula <- formula(paste0(yvar,xvars))
    # Estimate on the training data
    model <- lm(formula,data = train_data2)
    # Predict on the training sample (in-sample)
    prediction_train <- predict(model, newdata = train_data2)
    # Predict on the test sample (out-of-sample)
    prediction_test <- predict(model, newdata = test_data2)
    
    # Estimate the appropriate Criteria
    if (type=="lev") {
      mse_train <- RMSE(prediction_train, train_data2[[yvar]], na.rm = TRUE)^2
      mse_test <- RMSE(prediction_test, test_data2[[yvar]], na.rm = TRUE)^2
    } else {
      rmselog <- RMSE(prediction_train, train_data2[[yvar]], na.rm = TRUE)
      mse_train <- mse_log(prediction_train, train_data2[[yvar]], rmselog)
      mse_test <- mse_log(prediction_test, test_data2[[yvar]], rmselog)
    }
    # Bayesian Criteria
    BIC <- BIC(model)
    # Save into model results
    model_results[[model_name]] <- list(
      yvar=yvar,xvars=xvars,formula=formula,model=model,
      prediction_train = prediction_train,prediction_test = prediction_test,
      mse_train = mse_train,mse_test = mse_test,BIC = BIC
    )
  }
}

Compare the models:

imap(model_results,  ~{
  results <- as.data.frame(.x[c("mse_train", "mse_test", "BIC")])
  results %>% mutate(model = .y)
}) %>% 
  bind_rows() 
##   mse_train mse_test    BIC      model
## 1      2505     1797 382721 mmodellev0
## 2      2060     1613 375830 mmodellev1
## 3      2052     1601 375697 mmodellev2
## 4      2036     1523 375433 mmodellev3

Because it has the lowest test MSE and BIC, the model named mmodellev3 is the best model here.


Lasso

Take model 4 (mmodellev3) and keep only observations with no missing data:

vars_model_4 <- c("price", predictors_lev3)
data_train_complete2 <- train_data2 %>% 
  select(one_of(vars_model_4)) %>% 
  drop_na()

data_test_complete2 <- test_data2 %>% 
  select(one_of(vars_model_4)) %>% 
  drop_na()
train_control <- trainControl(method = "cv", number = 10)
tune_grid <- expand.grid("alpha" = c(1), "lambda" = seq(0.05, 1, by = 0.05))
formula <- formula(paste0("price ~ ", paste(setdiff(vars_model_4, "price"), collapse = " + ")))

Lasso Model:

set.seed(1234)
lasso_model <- train(formula,
                     data = data_train_complete2,
                     method = "glmnet",
                     preProcess = c("center", "scale"),
                     trControl = train_control,
                     tuneGrid = tune_grid)

print(lasso_model$bestTune$lambda)
## [1] 0.2
lasso_coeffs <- coef(lasso_model$finalModel, lasso_model$bestTune$lambda) %>%
  as.matrix() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "variable") %>% 
  rename(coefficient = `1`)  

print(lasso_coeffs)
##                                            variable coefficient
## 1                                       (Intercept)     85.5307
## 2                                    n_accommodates      1.7888
## 3                                            n_beds      0.1507
## 4                           f_room_typePrivate room    -23.7480
## 5                            f_room_typeShared room     -5.7509
## 6                                       f_bathrooms     12.0970
## 7                     f_cancellation_policymoderate      0.0000
## 8                       f_cancellation_policystrict      4.2529
## 9  f_cancellation_policystrict_14_with_grace_period     -1.8300
## 10             f_cancellation_policysuper_strict_30      0.0703
## 11             f_cancellation_policysuper_strict_60      0.0534
## 12                                  f_bed_typeCouch      0.0000
## 13                                  f_bed_typeFuton      0.0000
## 14                          f_bed_typePull-out Sofa      0.0000
## 15                               f_bed_typeReal Bed      0.3337
## 16                           n_review_scores_rating      2.6051
## 17                                  n_accommodates2     19.3638

Evaluate the model. Cross-validated MSE:

lasso_cv_rmse <- lasso_model$results %>% 
  filter(lambda == lasso_model$bestTune$lambda) %>% 
  select(RMSE)
print(lasso_cv_rmse[1, 1]^2)
## [1] 2036

Evaluate on test set:

lasso_test_rmse <- RMSE(predict(lasso_model, newdata = data_test_complete2), data_test_complete2$price)
print(lasso_test_rmse^2) 
## [1] 1517

Post Lasso OLS

Keep the variables selected by the lasso and run OLS. Show the predicted MSE:

nonzero_lasso_vars <- lasso_coeffs %>% 
  filter(coefficient != 0) %>% 
  pull(variable)

Keep only the selected variables that exist as raw columns (the factor dummies are dropped for now):

post_lasso_ols_vars <- intersect(nonzero_lasso_vars, names(data_train_complete2))

fit_control <- trainControl(method = "cv", number = 10)

set.seed(1234)  
post_lasso_ols_model <- train(
  formula(paste("price ~ ", paste(post_lasso_ols_vars, collapse = " + "))), 
  data = data_train_complete2, 
  method = "lm", 
  trControl = fit_control
)
post_lasso_ols_rmse <- post_lasso_ols_model$results[["RMSE"]]
post_lasso_ols_rmse^2 
## [1] 2403

Evaluate on test set:

post_lasso_ols_test_rmse <- 
  RMSE(predict(post_lasso_ols_model, 
               newdata = data_test_complete2), 
               data_test_complete2$price)
print(post_lasso_ols_test_rmse^2)
## [1] 1873

Conclusion

We have random forest, linear, lasso and post-lasso models. The best prediction was made by the random forest.
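A hedged check that would support this conclusion (not part of the original analysis): the forests were ranked by cross-validation on the training cities only, so the chosen forest could also be scored on the Manchester test set.

# Hedged sketch: test-set MSE of the chosen random forest, comparable
# to the lasso's 1517 and the post-lasso OLS's 1873.
mrf_test_rmse <- RMSE(predict(mrf_model_lev1, newdata = test_data2),
                      test_data2$price)
mrf_test_rmse^2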