First, let's clear the memory:
rm(list=ls())
Import libraries:
suppressMessages(library(caret))
suppressMessages(library(tidyverse))
suppressMessages(library(skimr))
suppressMessages(library(ggplot2))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))
suppressMessages(library(grid))
suppressMessages(library(lattice))
suppressMessages(library(glmnet))
suppressMessages(library(rattle))
suppressMessages(library(ranger))
Set the folder locations:
data_in <- "data/"
func <- "code/"
Source the helper functions:
# Load ggplot theme function
source(paste0(func, "theme_bg.R"))
# Load helper functions with some useful utilities
source(paste0(func, "da_helper_functions.R"))
source(paste0(func, "Ch14_airbnb_prediction_functions.R"))
options(digits = 3)
Load data:
suppressMessages(
data <- read_csv(paste0(data_in, "airbnb_", "london", "_workfile_adj.csv")) %>%
mutate_if(is.character, factor) %>%
filter(!is.na(price))
)
Where do we have missing values now?
count_missing_values <- function(data) {
  # Count the NAs in each column and keep only columns with at least one
  num_missing_values <- map_int(data, function(x) sum(is.na(x)))
  num_missing_values[num_missing_values > 0]
}
count_missing_values(data)
## usd_cleaning_fee p_host_response_rate n_bathrooms
## 20017 12868 237
## n_review_scores_rating n_reviews_per_month n_beds
## 16501 15741 167
## n_days_since ln_beds f_bathroom
## 15741 168 237
## f_number_of_reviews ln_days_since ln_days_since2
## 1 15749 15749
## ln_days_since3 n_days_since2 n_days_since3
## 15749 15741 15741
## ln_review_scores_rating f_minimum_nights
## 16501 1
Sample definition and preparation:
# We focus on normal apartments
data <- data %>%
filter(n_accommodates < 8)
# Drop observations that have no review score rating
data <- data %>%
drop_na(n_review_scores_rating)
# Create the ln(price) variable
data <- data %>%
  mutate(ln_price = log(price)) # price has no NAs after the earlier filter
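An optional refinement, not used below: before imputing, flag which rows were missing, so that missingness itself is available as a predictor. A minimal sketch with hypothetical flag names:
# Create the flags BEFORE the imputation below, while the NAs are still visible
data <- data %>%
  mutate(flag_bathrooms_missing = as.integer(is.na(n_bathrooms)),
         flag_beds_missing = as.integer(is.na(n_beds)))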
# Handle some missing values
data <- data %>%
  mutate(n_bathrooms = ifelse(is.na(n_bathrooms),
                              median(n_bathrooms, na.rm = TRUE),
                              n_bathrooms), # impute the median number of bathrooms
         n_beds = ifelse(is.na(n_beds),
                         n_accommodates,
                         n_beds)) # assume n_beds = n_accommodates
count_missing_values(data)
## usd_cleaning_fee p_host_response_rate ln_beds
## 11201 5446 46
## f_bathroom f_number_of_reviews ln_days_since
## 94 1 8
## ln_days_since2 ln_days_since3 f_minimum_nights
## 8 8 1
# get rid of unused columns and keep only complete cases
data <- data %>%
select(-one_of("usd_cleaning_fee", "p_host_response_rate")) %>%
drop_na()
# Copy a variable; we will use it later, when looking at variable importance
data <- data %>%
mutate(n_accommodates_copy = n_accommodates)
# Keep only Airbnb apartments in London
data <- data %>%
filter(f_property_type == "Apartment")
Create train and test samples:
train_data <- data %>%
filter(f_neighbourhood_cleansed != "Hackney")
test_data <- data %>%
filter(f_neighbourhood_cleansed == "Hackney")
DiM <- rbind(dim(train_data), dim(test_data))
rownames(DiM) <- c("Train", "Test")
colnames(DiM) <- c("Observations", "Variables")
DiM
## Observations Variables
## Train 23647 87
## Test 2384 87
Define models:
# Basic Variables
basic_vars <- c("n_accommodates", "n_beds", "f_room_type", "n_days_since")
basic_log <- c("ln_accommodates", "ln_beds", "f_room_type","ln_days_since")
# Factorized variables
basic_add <- c("f_bathroom", "f_cancellation_policy", "f_bed_type")
reviews <- c("f_number_of_reviews", "n_review_scores_rating")
# Higher orders
poly_lev <- c("n_accommodates2", "n_days_since2", "n_days_since3")
poly_log <- c("ln_accommodates2","ln_days_since2","ln_days_since3")
# Dummy variables
amenities <- grep("^d_.*", names(data), value = TRUE)
# Models in levels
predictors_lev0 <- "n_accommodates"
predictors_lev1 <- c(basic_vars, basic_add)
predictors_lev2 <- c(basic_vars, basic_add, reviews)
predictors_lev3 <- c(basic_vars, basic_add, reviews, poly_lev)
predictors_lev4 <- c(basic_vars, basic_add, reviews, poly_lev, amenities)
# Models in logs
predictors_log0 <- "ln_accommodates"
predictors_log1 <- c(basic_log, basic_add)
predictors_log2 <- c(basic_log, basic_add, reviews)
predictors_log3 <- c(basic_log, basic_add, reviews, poly_log)
predictors_log4 <- c(basic_log, basic_add, reviews, poly_log, amenities)
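Before fitting, it is worth verifying that every hand-listed predictor actually exists in the data; a typo in these vectors would otherwise only surface later as a cryptic train() error. An optional check:
# Sanity check: every predictor must be a column of the data
stopifnot(all(c(predictors_lev4, predictors_log4) %in% names(data)))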
train_control <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
Random Forest Model 1:
# 1
# Very simple model
set.seed(1915362)
rf_model_lev1 <- train(
formula(paste0("price ~", paste0(predictors_lev0, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5
## - Fold1: mtry=1, splitrule=variance, min.node.size=5
## + Fold2: mtry=1, splitrule=variance, min.node.size=5
## - Fold2: mtry=1, splitrule=variance, min.node.size=5
## + Fold3: mtry=1, splitrule=variance, min.node.size=5
## - Fold3: mtry=1, splitrule=variance, min.node.size=5
## + Fold4: mtry=1, splitrule=variance, min.node.size=5
## - Fold4: mtry=1, splitrule=variance, min.node.size=5
## + Fold5: mtry=1, splitrule=variance, min.node.size=5
## - Fold5: mtry=1, splitrule=variance, min.node.size=5
## Aggregating results
## Fitting final model on full training set
Random Forest Model 2:
# 2
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
rf_model_lev2 <- train(
formula(paste0("price ~", paste0(predictors_lev1, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set
Random Forest Model 3:
# 3
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
rf_model_lev3 <- train(
formula(paste0("price ~", paste0(predictors_lev4, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set
## Computing permutation importance.. Progress: 77%. Estimated remaining time: 9 seconds.
Random Forest Model 4:
# 4
set.seed(1915362)
rf_model_log1 <- train(
formula(paste0("ln_price ~", paste0(predictors_log0, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5
## - Fold1: mtry=1, splitrule=variance, min.node.size=5
## + Fold2: mtry=1, splitrule=variance, min.node.size=5
## - Fold2: mtry=1, splitrule=variance, min.node.size=5
## + Fold3: mtry=1, splitrule=variance, min.node.size=5
## - Fold3: mtry=1, splitrule=variance, min.node.size=5
## + Fold4: mtry=1, splitrule=variance, min.node.size=5
## - Fold4: mtry=1, splitrule=variance, min.node.size=5
## + Fold5: mtry=1, splitrule=variance, min.node.size=5
## - Fold5: mtry=1, splitrule=variance, min.node.size=5
## Aggregating results
## Fitting final model on full training set
Random Forest Model 5:
# 5
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
rf_model_log2 <- train(
formula(paste0("ln_price ~", paste0(predictors_log1, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set
Random Forest Model 6:
# 6
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
rf_model_log3 <- train(
formula(paste0("ln_price ~", paste0(predictors_log4, collapse = " + "))),
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 5 on full training set
## Computing permutation importance.. Progress: 89%. Estimated remaining time: 3 seconds.
Evaluate random forests:
results <- resamples(
list(
lev_1 = rf_model_lev1,
lev_2 = rf_model_lev2,
lev_3 = rf_model_lev3,
log_1 = rf_model_log1,
log_2 = rf_model_log2,
log_3 = rf_model_log3
)
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lev_1, lev_2, lev_3, log_1, log_2, log_3
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 33.200 33.243 33.262 33.382 33.357 33.846 0
## lev_2 30.139 30.657 30.661 30.679 30.669 31.268 0
## lev_3 33.988 34.268 34.449 34.582 34.686 35.520 0
## log_1 0.357 0.359 0.363 0.363 0.364 0.373 0
## log_2 0.320 0.326 0.329 0.328 0.331 0.333 0
## log_3 0.372 0.378 0.378 0.378 0.380 0.380 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 48.806 49.045 49.651 50.367 51.780 52.551 0
## lev_2 45.562 46.646 46.871 47.764 49.261 50.478 0
## lev_3 50.394 50.928 51.680 52.296 53.090 55.389 0
## log_1 0.452 0.454 0.459 0.458 0.459 0.466 0
## log_2 0.410 0.415 0.420 0.418 0.421 0.423 0
## log_3 0.471 0.479 0.479 0.478 0.479 0.481 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 0.330 0.348 0.353 0.351 0.360 0.363 0
## lev_2 0.439 0.473 0.485 0.476 0.488 0.492 0
## lev_3 0.452 0.465 0.481 0.475 0.484 0.494 0
## log_1 0.407 0.425 0.429 0.429 0.435 0.449 0
## log_2 0.589 0.589 0.601 0.599 0.603 0.613 0
## log_3 0.572 0.585 0.587 0.589 0.598 0.605 0
Note that the MAE and RMSE of the log models are in log units, so they are not directly comparable to the level models. Within each group, lev_2 and log_2 have the lowest errors, and log_2 also has the highest R-squared, so log_2 is the best random forest specification here.
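If we wanted to put a log model on the same scale as the level models, we could back-transform its predictions to price units with the standard lognormal correction exp(yhat + sigma^2/2). A minimal in-sample sketch (a proper comparison would use held-out predictions):
log_pred <- predict(rf_model_log2, newdata = train_data) # predicted ln(price)
sigma_hat <- RMSE(log_pred, train_data$ln_price) # residual SD in logs
level_pred <- exp(log_pred) * exp(sigma_hat^2 / 2) # corrected back-transform
RMSE(level_pred, train_data$price) # RMSE in price units, comparable to the level models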
# Create models in levels, models 0-4
modellev0 <- " ~ n_accommodates"
modellev1 <- paste0(" ~ ", paste(predictors_lev1, collapse="+"))
modellev2 <- paste0(" ~ ", paste(predictors_lev2, collapse="+"))
modellev3 <- paste0(" ~ ", paste(predictors_lev3, collapse="+"))
modellev4 <- paste0(" ~ ", paste(predictors_lev4, collapse="+"))
# Create models in logs, models 0-4
modellog0 <- " ~ ln_accommodates"
modellog1 <- paste0(" ~ ", paste(predictors_log1, collapse="+"))
modellog2 <- paste0(" ~ ", paste(predictors_log2, collapse="+"))
modellog3 <- paste0(" ~ ", paste(predictors_log3, collapse="+"))
modellog4 <- paste0(" ~ ", paste(predictors_log4, collapse="+"))
Create list to save model results:
model_results <- list()
# For both levels and logs
for (type in c("lev","log")) {
# for each model
for (i in 0:4) {
# Get the proper model names
model_name <- paste0("model",type,i)
# Get the proper target variable
yvar <- ifelse(type=="lev","price","ln_price")
# Get the explanatory (right-hand-side) variables
xvars <- eval(parse(text = model_name))
# Create the appropriate formula
formula <- formula(paste0(yvar,xvars))
# Estimate on the training data
model <- lm(formula,data = train_data)
# Predict on the training sample (in-sample)
prediction_train <- predict(model, newdata = train_data)
# Predict on the testing sample (out-of-sample)
prediction_test <- predict(model, newdata = test_data)
# Estimate the appropriate Criteria
if (type=="lev") {
mse_train <- RMSE(prediction_train, train_data[[yvar]], na.rm = TRUE)^2
mse_test <- RMSE(prediction_test, test_data[[yvar]], na.rm = TRUE)^2
} else {
rmselog <- RMSE(prediction_train, train_data[[yvar]], na.rm = TRUE)
mse_train <- mse_log(prediction_train, train_data[[yvar]], rmselog)
mse_test <- mse_log(prediction_test, test_data[[yvar]], rmselog)
}
# Bayesian Criteria
BIC <- BIC(model)
# Save into model results
model_results[[model_name]] <- list(
yvar=yvar,xvars=xvars,formula=formula,model=model,
prediction_train = prediction_train,prediction_test = prediction_test,
mse_train = mse_train,mse_test = mse_test,BIC = BIC
)
}
}
Compare the models:
imap(model_results, ~{
results <- as.data.frame(.x[c("mse_train", "mse_test", "BIC")])
results %>% mutate(model = .y)
}) %>%
bind_rows()
## mse_train mse_test BIC model
## 1 2537 1670 252498 modellev0
## 2 2070 1404 247771 modellev1
## 3 2060 1411 247672 modellev2
## 4 2050 1397 247591 modellev3
## 5 1869 1326 245905 modellev4
## 6 2539 1665 30260 modellog0
## 7 2043 1361 21039 modellog1
## 8 2040 1367 20770 modellog2
## 9 2035 1367 20727 modellog3
## 10 1814 1275 18059 modellog4
Since it has the lowest test MSE, and also the lowest BIC among the log models (BIC values are not comparable across different dependent variables), the model named modellog4 is the best model here.
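Rather than reading the winner off the table, it can be picked programmatically from the results list; given the output above, this returns modellog4:
# Name of the specification with the lowest test MSE
names(which.min(map_dbl(model_results, "mse_test")))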
Take model 10 (modellog4) and keep the observations with no missing data:
vars_model_10 <- c("ln_price", predictors_log4)
data_train_complete <- train_data %>%
  select(one_of(vars_model_10)) %>%
  drop_na()
data_test_complete <- test_data %>%
  select(one_of(vars_model_10)) %>%
  drop_na()
train_control <- trainControl(method = "cv", number = 10)
tune_grid <- expand.grid("alpha" = c(1), "lambda" = seq(0.05, 1, by = 0.05))
formula <- formula(paste0("ln_price ~ ", paste(setdiff(vars_model_10, "ln_price"), collapse = " + ")))
Lasso Model:
set.seed(1234)
lasso_model <- train(formula,
data = data_train_complete,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = train_control,
tuneGrid = tune_grid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
print(lasso_model$bestTune$lambda)
## [1] 0.05
lasso_coeffs <- coef(lasso_model$finalModel, lasso_model$bestTune$lambda) %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>%
rename(coefficient = `1`)
print(lasso_coeffs)
## variable coefficient
## 1 (Intercept) 4.3445
## 2 ln_accommodates 0.1321
## 3 ln_beds 0.0000
## 4 f_room_typePrivate room -0.2465
## 5 f_room_typeShared room -0.0420
## 6 ln_days_since 0.0000
## 7 f_bathroom 0.0446
## 8 f_cancellation_policymoderate 0.0000
## 9 f_cancellation_policystrict 0.0000
## 10 f_bed_typeReal Bed 0.0000
## 11 f_number_of_reviews 0.0000
## 12 n_review_scores_rating 0.0000
## 13 ln_accommodates2 0.0442
## 14 ln_days_since2 0.0000
## 15 ln_days_since3 0.0000
## 16 d_24hourcheckin 0.0000
## 17 d_airconditioning 0.0000
## 18 d_breakfast 0.0000
## 19 d_buzzerwirelessintercom 0.0000
## 20 d_cabletv 0.0000
## 21 d_carbonmonoxidedetector 0.0000
## 22 d_cats 0.0000
## 23 d_dogs 0.0000
## 24 d_doorman 0.0000
## 25 d_doormanentry 0.0000
## 26 d_dryer 0.0243
## 27 d_elevatorinbuilding 0.0000
## 28 d_essentials 0.0000
## 29 d_familykidfriendly 0.0000
## 30 d_fireextinguisher 0.0000
## 31 d_firstaidkit 0.0000
## 32 d_freeparkingonpremises -0.0252
## 33 d_freeparkingonstreet 0.0000
## 34 d_gym 0.0000
## 35 d_hairdryer 0.0000
## 36 d_hangers 0.0000
## 37 d_heating 0.0000
## 38 d_hottub 0.0000
## 39 d_indoorfireplace 0.0000
## 40 d_internet 0.0000
## 41 d_iron 0.0000
## 42 d_keypad 0.0000
## 43 d_kitchen 0.0000
## 44 d_laptopfriendlyworkspace 0.0000
## 45 d_lockonbedroomdoor 0.0000
## 46 d_lockbox 0.0000
## 47 d_otherpets 0.0000
## 48 d_paidparkingoffpremises 0.0000
## 49 d_petsallowed 0.0000
## 50 d_petsliveonthisproperty 0.0000
## 51 d_pool 0.0000
## 52 d_privateentrance 0.0000
## 53 d_privatelivingroom 0.0000
## 54 d_safetycard 0.0000
## 55 d_selfcheckin 0.0000
## 56 d_shampoo 0.0000
## 57 d_smartlock 0.0000
## 58 d_smokedetector 0.0000
## 59 d_smokingallowed 0.0000
## 60 d_suitableforevents 0.0000
## 61 d_tv 0.0266
## 62 d_washer 0.0000
## 63 d_washerdryer 0.0000
## 64 d_wheelchairaccessible 0.0000
## 65 d_wirelessinternet 0.0000
Evaluate model. CV error:
lasso_cv_rmse <- lasso_model$results %>%
filter(lambda == lasso_model$bestTune$lambda) %>%
select(RMSE)
print(lasso_cv_rmse[1, 1]^2)
## [1] 0.143
Evaluate on test set:
lasso_test_rmse <- RMSE(predict(lasso_model, newdata = data_test_complete), data_test_complete$ln_price)
print(lasso_test_rmse^2)
## [1] 0.11
Keep the variables selected by the lasso and fit OLS. Show the predicted MSE:
nonzero_lasso_vars <- lasso_coeffs %>%
filter(coefficient != 0) %>%
pull(variable)
Keep only the selected variables that are actual columns of the data (expanded factor-level dummies are dropped for now):
post_lasso_ols_vars <- intersect(nonzero_lasso_vars, names(data_train_complete))
fit_control <- trainControl(method = "cv", number = 10)
set.seed(1234)
post_lasso_ols_model <- train(
formula(paste("ln_price ~ ", paste(post_lasso_ols_vars, collapse = " + "))),
data = data_train_complete,
method = "lm",
trControl = fit_control
)
post_lasso_ols_rmse <- post_lasso_ols_model$results[["RMSE"]]
post_lasso_ols_rmse^2
## [1] 0.18
Evaluate on test set:
post_lasso_ols_test_rmse <-
RMSE(predict(post_lasso_ols_model,
newdata = data_test_complete),
data_test_complete$ln_price)
print(post_lasso_ols_test_rmse^2)
## [1] 0.158
We have random forest, linear, lasso and post-lasso models. The best prediction was made by the lasso, with a test MSE of 0.11 (RMSE of about 0.33) in log-price units.
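For reference, the two log-scale test errors computed above can be collected into one small table (both are in log-price units, so they are comparable with each other but not with the level models):
tibble(
  model    = c("lasso", "post-lasso OLS"),
  test_mse = c(lasso_test_rmse^2, post_lasso_ols_test_rmse^2)
) %>%
  arrange(test_mse)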
Load data:
suppressMessages(
data2 <- read_csv(paste0(data_in, "airbnb_", "bristol_and_london",
"_workfile_adj.csv")) %>%
mutate_if(is.character, factor) %>%
filter(!is.na(price))
)
suppressMessages(
data3 <- read_csv(paste0(data_in, "airbnb_", "manchester",
"_workfile_adj.csv")) %>%
mutate_if(is.character, factor) %>%
filter(!is.na(price))
)
Sample definition and preparation:
# We focus on normal apartments
data2 <- data2 %>%
filter(n_accommodates < 8)
data3 <- data3 %>%
filter(n_accommodates < 8)
# Drop observations that have no review score rating
data2 <- data2 %>%
drop_na(n_review_scores_rating)
data3 <- data3 %>%
drop_na(n_review_scores_rating)
# Handle some missing values
data2 <- data2 %>%
  mutate(f_bathrooms = ifelse(is.na(f_bathrooms),
                              median(f_bathrooms, na.rm = TRUE),
                              f_bathrooms),
         n_beds = ifelse(is.na(n_beds),
                         n_accommodates,
                         n_beds))
data3 <- data3 %>%
  mutate(f_bathrooms = ifelse(is.na(f_bathrooms),
                              median(f_bathrooms, na.rm = TRUE),
                              f_bathrooms),
         n_beds = ifelse(is.na(n_beds),
                         n_accommodates,
                         n_beds))
Create train and test samples:
train_data2 <- data2
test_data2 <- data3
DiM <- rbind(dim(train_data2), dim(test_data2))
rownames(DiM) <- c("Train", "Test")
colnames(DiM) <- c("Observations", "Variables")
DiM
## Observations Variables
## Train 35886 16
## Test 612 16
Define models:
# Basic Variables
basic_vars <- c("n_accommodates", "n_beds", "f_room_type")
basic_log <- c("ln_accommodates", "ln_beds", "f_room_type")
# Factorized variables
basic_add <- c("f_bathrooms", "f_cancellation_policy", "f_bed_type")
reviews <- c("n_review_scores_rating")
# Higher orders
poly_lev <- c("n_accommodates2")
poly_log <- c("ln_accommodates2")
# Models in levels
predictors_lev0 <- "n_accommodates"
predictors_lev1 <- c(basic_vars, basic_add)
predictors_lev2 <- c(basic_vars, basic_add, reviews)
predictors_lev3 <- c(basic_vars, basic_add, reviews, poly_lev)
train_control <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
Random Forest Model 1:
# 1
# Very simple model
set.seed(1915362)
mrf_model_lev1 <- train(
formula(paste0("price ~", paste0(predictors_lev0, collapse = " + "))),
data = train_data2,
method = "ranger",
trControl = train_control,
tuneGrid = data.frame(.mtry = 1, .splitrule = "variance", .min.node.size = 5),
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size=5
## - Fold1: mtry=1, splitrule=variance, min.node.size=5
## + Fold2: mtry=1, splitrule=variance, min.node.size=5
## - Fold2: mtry=1, splitrule=variance, min.node.size=5
## + Fold3: mtry=1, splitrule=variance, min.node.size=5
## - Fold3: mtry=1, splitrule=variance, min.node.size=5
## + Fold4: mtry=1, splitrule=variance, min.node.size=5
## - Fold4: mtry=1, splitrule=variance, min.node.size=5
## + Fold5: mtry=1, splitrule=variance, min.node.size=5
## - Fold5: mtry=1, splitrule=variance, min.node.size=5
## Aggregating results
## Fitting final model on full training set
Random Forest Model 2:
# 2
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
mrf_model_lev2 <- train(
formula(paste0("price ~", paste0(predictors_lev1, collapse = " + "))),
data = train_data2,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set
Random Forest Model 3:
# 3
tune_grid <- expand.grid(
.mtry = 1,
.splitrule = "variance",
.min.node.size = c(5, 10)
)
set.seed(1915362)
mrf_model_lev3 <- train(
formula(paste0("price ~", paste0(predictors_lev3, collapse = " + "))),
data = train_data2,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
importance = "permutation"
)
## + Fold1: mtry=1, splitrule=variance, min.node.size= 5
## - Fold1: mtry=1, splitrule=variance, min.node.size= 5
## + Fold1: mtry=1, splitrule=variance, min.node.size=10
## - Fold1: mtry=1, splitrule=variance, min.node.size=10
## + Fold2: mtry=1, splitrule=variance, min.node.size= 5
## - Fold2: mtry=1, splitrule=variance, min.node.size= 5
## + Fold2: mtry=1, splitrule=variance, min.node.size=10
## - Fold2: mtry=1, splitrule=variance, min.node.size=10
## + Fold3: mtry=1, splitrule=variance, min.node.size= 5
## - Fold3: mtry=1, splitrule=variance, min.node.size= 5
## + Fold3: mtry=1, splitrule=variance, min.node.size=10
## - Fold3: mtry=1, splitrule=variance, min.node.size=10
## + Fold4: mtry=1, splitrule=variance, min.node.size= 5
## - Fold4: mtry=1, splitrule=variance, min.node.size= 5
## + Fold4: mtry=1, splitrule=variance, min.node.size=10
## - Fold4: mtry=1, splitrule=variance, min.node.size=10
## + Fold5: mtry=1, splitrule=variance, min.node.size= 5
## - Fold5: mtry=1, splitrule=variance, min.node.size= 5
## + Fold5: mtry=1, splitrule=variance, min.node.size=10
## - Fold5: mtry=1, splitrule=variance, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = variance, min.node.size = 10 on full training set
Evaluate random forests:
results <- resamples(
list(
lev_1 = mrf_model_lev1,
lev_2 = mrf_model_lev2,
lev_3 = mrf_model_lev3
)
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lev_1, lev_2, lev_3
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 31.4 31.8 31.8 31.9 31.8 32.4 0
## lev_2 31.5 32.4 32.7 32.5 32.9 33.0 0
## lev_3 31.3 31.3 31.4 31.7 32.2 32.2 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 47.4 49.2 50.3 50.0 51.2 52.1 0
## lev_2 49.1 49.3 51.6 51.4 53.3 53.6 0
## lev_3 47.8 48.9 50.0 50.4 52.3 52.8 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lev_1 0.380 0.381 0.385 0.392 0.391 0.422 0
## lev_2 0.468 0.479 0.488 0.488 0.492 0.514 0
## lev_3 0.464 0.473 0.486 0.484 0.488 0.510 0
Even though it does not have the highest R-squared, the model named lev_1 is arguably the best random forest here: it has the lowest mean RMSE, and its MAE is essentially tied with lev_3.
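The per-model mean CV errors can also be pulled straight from the resamples object instead of being read off the summary (caret names these columns "model~metric"):
results$values %>%
  select(ends_with("~RMSE")) %>%
  summarise_all(mean)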
# Create models in levels, models 0-3
mmodellev0 <- " ~ n_accommodates"
mmodellev1 <- paste0(" ~ ", paste(predictors_lev1, collapse="+"))
mmodellev2 <- paste0(" ~ ", paste(predictors_lev2, collapse="+"))
mmodellev3 <- paste0(" ~ ", paste(predictors_lev3, collapse="+"))
Create list to save model results:
model_results <- list()
# Levels only in this exercise
for (type in c("lev")) {
# for each model
for (i in 0:3) {
# Get the proper model names
model_name <- paste0("mmodel",type,i)
# Get the proper target variable
yvar <- ifelse(type=="lev","price","ln_price")
# Get the explanatory (right-hand-side) variables
xvars <- eval(parse(text = model_name))
# Create the appropriate formula
formula <- formula(paste0(yvar,xvars))
# Estimate on the training data
model <- lm(formula,data = train_data2)
# Predict on the training sample (in-sample)
prediction_train <- predict(model, newdata = train_data2)
# Predict on the testing sample (out-of-sample)
prediction_test <- predict(model, newdata = test_data2)
# Estimate the appropriate Criteria
if (type=="lev") {
mse_train <- RMSE(prediction_train, train_data2[[yvar]], na.rm = TRUE)^2
mse_test <- RMSE(prediction_test, test_data2[[yvar]], na.rm = TRUE)^2
} else {
rmselog <- RMSE(prediction_train, train_data2[[yvar]], na.rm = TRUE)
mse_train <- mse_log(prediction_train, train_data2[[yvar]], rmselog)
mse_test <- mse_log(prediction_test, test_data2[[yvar]], rmselog)
}
# Bayesian Criteria
BIC <- BIC(model)
# Save into model results
model_results[[model_name]] <- list(
yvar=yvar,xvars=xvars,formula=formula,model=model,
prediction_train = prediction_train,prediction_test = prediction_test,
mse_train = mse_train,mse_test = mse_test,BIC = BIC
)
}
}
Compare the models:
imap(model_results, ~{
results <- as.data.frame(.x[c("mse_train", "mse_test", "BIC")])
results %>% mutate(model = .y)
}) %>%
bind_rows()
## mse_train mse_test BIC model
## 1 2505 1797 382721 mmodellev0
## 2 2060 1613 375830 mmodellev1
## 3 2052 1601 375697 mmodellev2
## 4 2036 1523 375433 mmodellev3
Since it has the lowest MSE and BIC, the model named mmodellev3 is the best model here.
Take model 4 (mmodellev3) and keep the observations with no missing data:
vars_model_4 <- c("price", predictors_lev3)
data_train_complete2 <- train_data2 %>%
  select(one_of(vars_model_4)) %>%
  drop_na()
data_test_complete2 <- test_data2 %>%
  select(one_of(vars_model_4)) %>%
  drop_na()
train_control <- trainControl(method = "cv", number = 10)
tune_grid <- expand.grid("alpha" = c(1), "lambda" = seq(0.05, 1, by = 0.05))
formula <- formula(paste0("price ~ ", paste(setdiff(vars_model_4, "price"), collapse = " + ")))
Lasso Model:
set.seed(1234)
lasso_model <- train(formula,
data = data_train_complete2,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = train_control,
tuneGrid = tune_grid)
print(lasso_model$bestTune$lambda)
## [1] 0.2
lasso_coeffs <- coef(lasso_model$finalModel, lasso_model$bestTune$lambda) %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>%
rename(coefficient = `1`)
print(lasso_coeffs)
## variable coefficient
## 1 (Intercept) 85.5307
## 2 n_accommodates 1.7888
## 3 n_beds 0.1507
## 4 f_room_typePrivate room -23.7480
## 5 f_room_typeShared room -5.7509
## 6 f_bathrooms 12.0970
## 7 f_cancellation_policymoderate 0.0000
## 8 f_cancellation_policystrict 4.2529
## 9 f_cancellation_policystrict_14_with_grace_period -1.8300
## 10 f_cancellation_policysuper_strict_30 0.0703
## 11 f_cancellation_policysuper_strict_60 0.0534
## 12 f_bed_typeCouch 0.0000
## 13 f_bed_typeFuton 0.0000
## 14 f_bed_typePull-out Sofa 0.0000
## 15 f_bed_typeReal Bed 0.3337
## 16 n_review_scores_rating 2.6051
## 17 n_accommodates2 19.3638
Evaluate model. CV error:
lasso_cv_rmse <- lasso_model$results %>%
filter(lambda == lasso_model$bestTune$lambda) %>%
select(RMSE)
print(lasso_cv_rmse[1, 1]^2)
## [1] 2036
Evaluate on test set:
lasso_test_rmse <- RMSE(predict(lasso_model, newdata = data_test_complete2), data_test_complete2$price)
print(lasso_test_rmse^2)
## [1] 1517
Keep the variables selected by the lasso and fit OLS. Show the predicted MSE:
nonzero_lasso_vars <- lasso_coeffs %>%
filter(coefficient != 0) %>%
pull(variable)
Keep only the selected variables that are actual columns of the data (expanded factor-level dummies are dropped for now):
post_lasso_ols_vars <- intersect(nonzero_lasso_vars, names(data_train_complete2))
fit_control <- trainControl(method = "cv", number = 10)
set.seed(1234)
post_lasso_ols_model <- train(
formula(paste("price ~ ", paste(post_lasso_ols_vars, collapse = " + "))),
data = data_train_complete2,
method = "lm",
trControl = fit_control
)
post_lasso_ols_rmse <- post_lasso_ols_model$results[["RMSE"]]
post_lasso_ols_rmse^2
## [1] 2403
Evaluate on test set:
post_lasso_ols_test_rmse <-
RMSE(predict(post_lasso_ols_model,
newdata = data_test_complete2),
data_test_complete2$price)
print(post_lasso_ols_test_rmse^2)
## [1] 1873
We have random forest, linear, lasso and post-lasso models. Judged by the errors reported above, the lasso in fact gives the best prediction here (test MSE 1517, just ahead of the linear model at 1523 and post-lasso OLS at 1873); the random forests, evaluated only by cross-validation, had noticeably larger errors (CV MSE around 2500).
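As a closing check, the test-set MSEs computed above (all on the price scale, hence directly comparable) can be lined up in one table:
tibble(
  model    = c("linear mmodellev3", "lasso", "post-lasso OLS"),
  test_mse = c(model_results$mmodellev3$mse_test,
               lasso_test_rmse^2,
               post_lasso_ols_test_rmse^2)
) %>%
  arrange(test_mse)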