Firstly we will load dataset from DHS survey
# Reading DHS survey data from SIAP's website
df <- read.csv("https://www.unsiap.or.jp/on_line/ML/M6-clean_data.csv")
Variables of dataset is categorized as numerical and categorical and it is shown with datasummary_skim function
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.5.1
datasummary_skim(df, type = "numeric")
| Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
|---|---|---|---|---|---|---|---|---|
| price | 649 | 0 | 163.7 | 421.8 | 0.0 | 105.0 | 10000.0 | |
| latitude | 18964 | 0 | 40.7 | 0.1 | 40.5 | 40.7 | 40.9 | |
| longitude | 14790 | 0 | -74.0 | 0.0 | -74.2 | -74.0 | -73.7 | |
| accommodates | 19 | 0 | 2.9 | 1.9 | 1.0 | 2.0 | 22.0 | |
| bedrooms | 12 | 0 | 1.2 | 0.8 | 0.0 | 1.0 | 21.0 | |
| beds | 17 | 0 | 1.5 | 1.1 | 0.0 | 1.0 | 16.0 | |
| cleaning_fee | 214 | 0 | 56.2 | 66.3 | 0.0 | 40.0 | 1200.0 | |
| minimum_nights | 110 | 0 | 7.9 | 21.6 | 1.0 | 3.0 | 1250.0 | |
| availability_365 | 366 | 0 | 120.7 | 140.4 | 0.0 | 63.0 | 365.0 | |
| review_scores_rating | 62 | 0 | 94.2 | 8.0 | 20.0 | 96.0 | 100.0 |
datasummary_skim(df, type = "categorical")
| N | % | ||
|---|---|---|---|
| host_is_superhost | False | 39055 | 80.1 |
| True | 9696 | 19.9 | |
| neighborhood | A-Zone | 19604 | 40.2 |
| B-Zone | 1149 | 2.4 | |
| M-Zone | 21778 | 44.7 | |
| Q-Zone | 5859 | 12.0 | |
| S-Zone | 361 | 0.7 | |
| property_type | Appartment | 41746 | 85.6 |
| House | 5702 | 11.7 | |
| PrivateRoom | 1303 | 2.7 | |
| cancellation_policy | flexible | 15540 | 31.9 |
| moderate | 11323 | 23.2 | |
| strict | 21888 | 44.9 |
Correlation evaluation is good practice whether there is multicollinearity before modelling
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.1
## Zorunlu paket yükleniyor: ggplot2
numeric <- select_if(df, is.numeric)
corr_coef <- cor(numeric, use = "p")
ggcorrplot(corr_coef,
type = "lower",
outline.col = "white",
lab = T) + ggtitle("Corr. between numeric variables")
As you can see accomodates variable and beds variable have high correlation so it can be cause multicollinearity problem for modelling.
This part dataset is divided by createDataPartition which creates training and validation data for analysis
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Zorunlu paket yükleniyor: lattice
set.seed(777)
trainIndex <- createDataPartition(df$price, p = 0.5,
list = F)
train_data <- df[trainIndex, ]
validation_data <- df[-trainIndex, ]
LASSO can be affected by scale of variables , so variables scaled before LASSO regression.
ScalingValues <- preProcess(train_data, method = c("center", "scale"))
train_data <- predict(ScalingValues, train_data)
validation_data <- predict(ScalingValues
, validation_data)
numbers <- 5
repeats <- 15
rcvTunes <- 10
seed <- 123
rcvControl <- trainControl(method = "repeatedcv",
number = numbers, repeats = repeats)
set.seed(123)
lasso_fit <- train(price ~ .,
data = train_data,
method = "glmnet",
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0,0.005,length = 10)),
trControl = rcvControl)
LASSO model is prepared and it’s viz time to show best regularization parameter
ggplot(lasso_fit) +
ggtitle("Lasso Penalization") +
labs(x = "Lambda") +
theme_minimal()
theme_models <- theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
lasso_varImp <- data.frame(variables = row.names(varImp(lasso_fit)$importance), varImp(lasso_fit)$importance)
# Below we set that we only show feature importance with a value larger than 3
# You can lower this if you want to see more variables, or increase it if you want to see fewer.
threshold = 2
lasso_varImp <- lasso_varImp[lasso_varImp$Overall > threshold,]
ggplot(data = lasso_varImp, mapping = aes(x=reorder(variables, Overall),
y=Overall,
fill=variables)) +
coord_flip() + geom_bar(stat = "identity", position = "dodge") +
theme_models +
labs(x = "", y = "") +
ggtitle("Feature Importance Lasso Regression")
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.1
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
lasso_preds <- predict(lasso_fit, validation_data)
rmse(actual = validation_data$price, predicted = lasso_preds)
## [1] 0.9991005
MyRegressors <- "property_type + neighborhood + accommodates +
bedrooms +cancellation_policy + cleaning_fee"
Ntree <- 10
MyFormula <- as.formula(paste("price ~", MyRegressors))
rf_fit <- train(MyFormula,
data = train_data,
method = "rf",
ntree = Ntree)
rf_fit
## Random Forest
##
## 24376 samples
## 6 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 24376, 24376, 24376, 24376, 24376, 24376, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.9541075 0.1482332 0.2087995
## 6 0.9427561 0.1600841 0.2107752
## 11 0.9495708 0.1528282 0.2168409
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.
Prediction of Random Forest Model
rf_preds <- predict(rf_fit , validation_data)
rmse(actual = validation_data$price, predicted = rf_preds)
## [1] 0.9865642
theme_models <- theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
rf_varImp <- data.frame(variables = row.names(varImp(rf_fit)$importance), varImp(rf_fit)$importance)
ggplot(data = rf_varImp, mapping = aes(x=reorder(variables, Overall),
y=Overall,
fill=variables)) +
coord_flip() + geom_bar(stat = "identity", position = "dodge") +
theme_models +
labs(x = "", y = "") +
ggtitle("Feature Importance Random Forest")
library(grid)
# This graphics allows the comparison of results (on the SAME scale)
# even if number of CV samples are different
lasso <- as.data.frame(lasso_fit$resample$RMSE)
rf <- as.data.frame(rf_fit$resample$RMSE)
y.min <- min(lasso, rf)
y.max <- max(lasso, rf)
p.lasso <- ggplot(lasso) +
aes(x = "", y = `lasso_fit$resample$RMSE`) +
geom_boxplot(fill = "blue") +
labs( x = "LASSO", y= "RMSE") +
coord_cartesian(ylim = c(y.min, y.max))+
theme_minimal()
rf <- as.data.frame(rf_fit$resample$RMSE)
p.rf <- ggplot(rf) +
aes(x = "", y = `rf_fit$resample$RMSE`) +
geom_boxplot(fill = "orange") +
coord_cartesian(ylim = c(y.min, y.max))+
labs( x = "Random Forest", y ="")+
theme_minimal()
grid.newpage()
grid.draw(cbind(ggplotGrob(p.lasso), ggplotGrob(p.rf), size = "last"))
Random forest is better LASSO because RMSE value is lower than LASSO.So for this project which random forest will be selected.