Load Dataset

Firstly we will load dataset from DHS survey

# Reading DHS survey data from SIAP's website
df <- read.csv("https://www.unsiap.or.jp/on_line/ML/M6-clean_data.csv")

Summary Statistics

Variables of dataset is categorized as numerical and categorical and it is shown with datasummary_skim function

Numerical Variables

library(modelsummary)

## Warning: package 'modelsummary' was built under R version 4.5.1

datasummary_skim(df, type = "numeric")

	Unique	Mean	SD	Min	Median	Max
price	649	163.7	421.8	0.0	105.0	10000.0
latitude	18964	40.7	0.1	40.5	40.7	40.9
longitude	14790	-74.0	0.0	-74.2	-74.0	-73.7
accommodates	19	2.9	1.9	1.0	2.0	22.0
bedrooms	12	1.2	0.8	0.0	1.0	21.0
beds	17	1.5	1.1	0.0	1.0	16.0
cleaning_fee	214	56.2	66.3	0.0	40.0	1200.0
minimum_nights	110	7.9	21.6	1.0	3.0	1250.0
availability_365	366	120.7	140.4	0.0	63.0	365.0
review_scores_rating	62	94.2	8.0	20.0	96.0	100.0

Categorical Variables

datasummary_skim(df, type = "categorical")

		N	%
host_is_superhost	False	39055	80.1
	True	9696	19.9
neighborhood	A-Zone	19604	40.2
	B-Zone	1149	2.4
	M-Zone	21778	44.7
	Q-Zone	5859	12.0
	S-Zone	361	0.7
property_type	Appartment	41746	85.6
	House	5702	11.7
	PrivateRoom	1303	2.7
cancellation_policy	flexible	15540	31.9
	moderate	11323	23.2
	strict	21888	44.9

Correlations between Variables

Correlation evaluation is good practice whether there is multicollinearity before modelling

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.5.1

## Zorunlu paket yükleniyor: ggplot2

numeric <- select_if(df, is.numeric)



corr_coef <- cor(numeric, use = "p")

ggcorrplot(corr_coef, 
           type = "lower", 
           outline.col = "white",
           lab = T) + ggtitle("Corr. between numeric variables")

As you can see accomodates variable and beds variable have high correlation so it can be cause multicollinearity problem for modelling.

Penalization Method and LASSO and Preparation of Data

This part dataset is divided by createDataPartition which creates training and validation data for analysis

library(caret)

## Warning: package 'caret' was built under R version 4.5.1

## Zorunlu paket yükleniyor: lattice

set.seed(777)
trainIndex <- createDataPartition(df$price, p = 0.5, 
                                  list = F)

train_data <- df[trainIndex, ]
validation_data <- df[-trainIndex, ]

LASSO can be affected by scale of variables , so variables scaled before LASSO regression.

ScalingValues <- preProcess(train_data, method = c("center", "scale"))
train_data <- predict(ScalingValues, train_data)
validation_data <- predict(ScalingValues
                          , validation_data)

numbers <- 5
repeats <- 15
rcvTunes <- 10
seed <- 123






rcvControl <- trainControl(method = "repeatedcv",
                           number = numbers, repeats = repeats)

set.seed(123)

lasso_fit <- train(price ~ .,
                   data = train_data, 
                   method = "glmnet",
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0,0.005,length = 10)),
                   trControl = rcvControl)

LASSO Viz

LASSO model is prepared and it’s viz time to show best regularization parameter

ggplot(lasso_fit) +
  ggtitle("Lasso Penalization") + 
  labs(x = "Lambda") + 
  theme_minimal()

Feature Importance after LASSO regression

theme_models <-  theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
                legend.position = "none") 

lasso_varImp <- data.frame(variables = row.names(varImp(lasso_fit)$importance), varImp(lasso_fit)$importance)
# Below we set that we only show feature importance with a value larger than 3
# You can lower this if you want to see more variables, or increase it if you want to see fewer.
threshold = 2
lasso_varImp <- lasso_varImp[lasso_varImp$Overall > threshold,]
ggplot(data = lasso_varImp, mapping = aes(x=reorder(variables, Overall),
                                        y=Overall,
                                        fill=variables)) +
  coord_flip() + geom_bar(stat = "identity", position = "dodge") +
  theme_models +
  labs(x = "", y = "") +
  ggtitle("Feature Importance Lasso Regression")

LASSO RMSE Values

library(Metrics)

## Warning: package 'Metrics' was built under R version 4.5.1

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

lasso_preds <- predict(lasso_fit, validation_data)
rmse(actual = validation_data$price, predicted = lasso_preds)

## [1] 0.9991005

Feature Selection

MyRegressors <- "property_type + neighborhood + accommodates +
bedrooms +cancellation_policy + cleaning_fee"

Training Random Forest

Ntree <- 10

MyFormula <- as.formula(paste("price ~", MyRegressors))


rf_fit <- train(MyFormula,
                data = train_data, 
                method = "rf",
                ntree = Ntree)


rf_fit

## Random Forest 
## 
## 24376 samples
##     6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 24376, 24376, 24376, 24376, 24376, 24376, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    0.9541075  0.1482332  0.2087995
##    6    0.9427561  0.1600841  0.2107752
##   11    0.9495708  0.1528282  0.2168409
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.

Prediction of Random Forest Model

rf_preds <- predict(rf_fit , validation_data)
rmse(actual = validation_data$price, predicted = rf_preds)

## [1] 0.9865642

Variable Importance of Random Forest Model

theme_models <-  theme_minimal()+ theme(plot.title = element_text(hjust = 0.5),
                legend.position = "none") 

rf_varImp <- data.frame(variables = row.names(varImp(rf_fit)$importance), varImp(rf_fit)$importance)

ggplot(data = rf_varImp, mapping = aes(x=reorder(variables, Overall),
                                        y=Overall,
                                        fill=variables)) +
  coord_flip() + geom_bar(stat = "identity", position = "dodge") +
  theme_models +
  labs(x = "", y = "") +
  ggtitle("Feature Importance Random Forest")

Comparison of Models

library(grid)
# This graphics allows the comparison of results (on the SAME scale)
# even if number of CV samples are different

lasso <- as.data.frame(lasso_fit$resample$RMSE)
rf <- as.data.frame(rf_fit$resample$RMSE)

y.min <- min(lasso, rf)
y.max <- max(lasso, rf)

p.lasso <- ggplot(lasso) +
          aes(x = "", y = `lasso_fit$resample$RMSE`) +
          geom_boxplot(fill = "blue") +
          labs( x = "LASSO", y= "RMSE") +
          coord_cartesian(ylim = c(y.min, y.max))+
          theme_minimal()

rf <- as.data.frame(rf_fit$resample$RMSE)
p.rf <- ggplot(rf) +
          aes(x = "", y = `rf_fit$resample$RMSE`) +
          geom_boxplot(fill = "orange") +
          coord_cartesian(ylim = c(y.min, y.max))+
          labs( x = "Random Forest", y ="")+
          theme_minimal()


grid.newpage()
grid.draw(cbind(ggplotGrob(p.lasso), ggplotGrob(p.rf), size = "last"))

Random forest is better LASSO because RMSE value is lower than LASSO.So for this project which random forest will be selected.

Project Escap

Emre Cirak

2025-08-22