Load Dataset

Firstly we will load dataset from DHS survey

# Reading DHS survey data from SIAP's website
df <- read.csv("https://www.unsiap.or.jp/on_line/ML/M6-clean_data.csv")

Summary Statistics

Variables of dataset is categorized as numerical and categorical and it is shown with datasummary_skim function

Numerical Variables

library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.5.1
datasummary_skim(df, type = "numeric")
Unique Missing Pct. Mean SD Min Median Max Histogram
price 649 0 163.7 421.8 0.0 105.0 10000.0
latitude 18964 0 40.7 0.1 40.5 40.7 40.9
longitude 14790 0 -74.0 0.0 -74.2 -74.0 -73.7
accommodates 19 0 2.9 1.9 1.0 2.0 22.0
bedrooms 12 0 1.2 0.8 0.0 1.0 21.0
beds 17 0 1.5 1.1 0.0 1.0 16.0
cleaning_fee 214 0 56.2 66.3 0.0 40.0 1200.0
minimum_nights 110 0 7.9 21.6 1.0 3.0 1250.0
availability_365 366 0 120.7 140.4 0.0 63.0 365.0
review_scores_rating 62 0 94.2 8.0 20.0 96.0 100.0

Categorical Variables

datasummary_skim(df, type = "categorical")
N %
host_is_superhost False 39055 80.1
True 9696 19.9
neighborhood A-Zone 19604 40.2
B-Zone 1149 2.4
M-Zone 21778 44.7
Q-Zone 5859 12.0
S-Zone 361 0.7
property_type Appartment 41746 85.6
House 5702 11.7
PrivateRoom 1303 2.7
cancellation_policy flexible 15540 31.9
moderate 11323 23.2
strict 21888 44.9

Correlations between Variables

Correlation evaluation is good practice whether there is multicollinearity before modelling

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.1
## Zorunlu paket yükleniyor: ggplot2
numeric <- select_if(df, is.numeric)



corr_coef <- cor(numeric, use = "p")

ggcorrplot(corr_coef, 
           type = "lower", 
           outline.col = "white",
           lab = T) + ggtitle("Corr. between numeric variables")

As you can see accomodates variable and beds variable have high correlation so it can be cause multicollinearity problem for modelling.

Penalization Method and LASSO and Preparation of Data

This part dataset is divided by createDataPartition which creates training and validation data for analysis

library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Zorunlu paket yükleniyor: lattice
set.seed(777)
trainIndex <- createDataPartition(df$price, p = 0.5, 
                                  list = F)
train_data <- df[trainIndex, ]
validation_data <- df[-trainIndex, ]

LASSO can be affected by scale of variables , so variables scaled before LASSO regression.

ScalingValues <- preProcess(train_data, method = c("center", "scale"))
train_data <- predict(ScalingValues, train_data)
validation_data <- predict(ScalingValues
                          , validation_data)
numbers <- 5
repeats <- 15
rcvTunes <- 10
seed <- 123






rcvControl <- trainControl(method = "repeatedcv",
                           number = numbers, repeats = repeats)

set.seed(123)

lasso_fit <- train(price ~ .,
                   data = train_data, 
                   method = "glmnet",
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0,0.005,length = 10)),
                   trControl = rcvControl)

LASSO Viz

LASSO model is prepared and it’s viz time to show best regularization parameter

ggplot(lasso_fit) +
  ggtitle("Lasso Penalization") + 
  labs(x = "Lambda") + 
  theme_minimal()

Feature Importance after LASSO regression

theme_models <-  theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
                legend.position = "none") 

lasso_varImp <- data.frame(variables = row.names(varImp(lasso_fit)$importance), varImp(lasso_fit)$importance)
# Below we set that we only show feature importance with a value larger than 3
# You can lower this if you want to see more variables, or increase it if you want to see fewer.
threshold = 2
lasso_varImp <- lasso_varImp[lasso_varImp$Overall > threshold,]
ggplot(data = lasso_varImp, mapping = aes(x=reorder(variables, Overall),
                                        y=Overall,
                                        fill=variables)) +
  coord_flip() + geom_bar(stat = "identity", position = "dodge") +
  theme_models +
  labs(x = "", y = "") +
  ggtitle("Feature Importance Lasso Regression") 

LASSO RMSE Values

library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.1
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
lasso_preds <- predict(lasso_fit, validation_data)
rmse(actual = validation_data$price, predicted = lasso_preds)
## [1] 0.9991005

Feature Selection

MyRegressors <- "property_type + neighborhood + accommodates +
bedrooms +cancellation_policy + cleaning_fee"

Training Random Forest

Ntree <- 10

MyFormula <- as.formula(paste("price ~", MyRegressors))


rf_fit <- train(MyFormula,
                data = train_data, 
                method = "rf",
                ntree = Ntree)


rf_fit
## Random Forest 
## 
## 24376 samples
##     6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 24376, 24376, 24376, 24376, 24376, 24376, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    0.9541075  0.1482332  0.2087995
##    6    0.9427561  0.1600841  0.2107752
##   11    0.9495708  0.1528282  0.2168409
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.

Prediction of Random Forest Model

rf_preds <- predict(rf_fit , validation_data)
rmse(actual = validation_data$price, predicted = rf_preds)
## [1] 0.9865642

Variable Importance of Random Forest Model

theme_models <-  theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
                legend.position = "none") 

rf_varImp <- data.frame(variables = row.names(varImp(rf_fit)$importance), varImp(rf_fit)$importance)

ggplot(data = rf_varImp, mapping = aes(x=reorder(variables, Overall),
                                        y=Overall,
                                        fill=variables)) +
  coord_flip() + geom_bar(stat = "identity", position = "dodge") +
  theme_models +
  labs(x = "", y = "") +
  ggtitle("Feature Importance Random Forest") 

Comparison of Models

library(grid)
# This graphics allows the comparison of results (on the SAME scale)
# even if number of CV samples are different

lasso <- as.data.frame(lasso_fit$resample$RMSE)
rf <- as.data.frame(rf_fit$resample$RMSE)

y.min <- min(lasso, rf)
y.max <- max(lasso, rf)

p.lasso <- ggplot(lasso) +
          aes(x = "", y = `lasso_fit$resample$RMSE`) +
          geom_boxplot(fill = "blue") +
          labs( x = "LASSO", y= "RMSE") +
          coord_cartesian(ylim = c(y.min, y.max))+
          theme_minimal()

rf <- as.data.frame(rf_fit$resample$RMSE)
p.rf <- ggplot(rf) +
          aes(x = "", y = `rf_fit$resample$RMSE`) +
          geom_boxplot(fill = "orange") +
          coord_cartesian(ylim = c(y.min, y.max))+
          labs( x = "Random Forest", y ="")+
          theme_minimal()


grid.newpage()
grid.draw(cbind(ggplotGrob(p.lasso), ggplotGrob(p.rf), size = "last"))

Random forest is better LASSO because RMSE value is lower than LASSO.So for this project which random forest will be selected.