Data description

The data comes from Kaggle. It contains information about reservations at a hotel. The aim is to find which customers are likely to cancel their reservations.

library(FactoMineR)
library(corrplot)
library(smacof)
library(clusterSim)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(factoextra)
library(Rtsne)

Used libraries:

FactoMineR: A library for multivariate exploratory data analysis and factor analysis.
corrplot: A library for visualizing and analyzing correlations between variables.
smacof: A library for multidimensional scaling.
clusterSim: A library for searching for an optimal clustering procedure; used here for its data normalization function (data.Normalization).
dplyr: A library for data manipulation and analysis.
ggplot2: A library for creating visualizations based on the grammar of graphics.
tidyverse: A meta-library for data manipulation, analysis, and visualization.
factoextra: A library for factorial analysis and clustering with additional functionalities.
Rtsne: A library for fast and efficient implementation of t-SNE in R

EDA

Read data:

# Load the reservations table; the first line of the CSV holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
data <- read.csv("reservations.csv", header = TRUE)

Names of the columns:

# Print the names of all columns in the loaded data frame.
colnames(data)
##  [1] "Booking_ID"                          
##  [2] "no_of_adults"                        
##  [3] "no_of_children"                      
##  [4] "no_of_weekend_nights"                
##  [5] "no_of_week_nights"                   
##  [6] "type_of_meal_plan"                   
##  [7] "required_car_parking_space"          
##  [8] "room_type_reserved"                  
##  [9] "lead_time"                           
## [10] "arrival_year"                        
## [11] "arrival_month"                       
## [12] "arrival_date"                        
## [13] "market_segment_type"                 
## [14] "repeated_guest"                      
## [15] "no_of_previous_cancellations"        
## [16] "no_of_previous_bookings_not_canceled"
## [17] "avg_price_per_room"                  
## [18] "no_of_special_requests"              
## [19] "booking_status"

Data types:

# Show the storage type and the first few values of every column.
str(data)
## 'data.frame':    36275 obs. of  19 variables:
##  $ Booking_ID                          : chr  "INN00001" "INN00002" "INN00003" "INN00004" ...
##  $ no_of_adults                        : int  2 2 1 2 2 2 2 2 3 2 ...
##  $ no_of_children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_weekend_nights                : int  1 2 2 0 1 0 1 1 0 0 ...
##  $ no_of_week_nights                   : int  2 3 1 2 1 2 3 3 4 5 ...
##  $ type_of_meal_plan                   : chr  "Meal Plan 1" "Not Selected" "Meal Plan 1" "Meal Plan 1" ...
##  $ required_car_parking_space          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ room_type_reserved                  : chr  "Room_Type 1" "Room_Type 1" "Room_Type 1" "Room_Type 1" ...
##  $ lead_time                           : int  224 5 1 211 48 346 34 83 121 44 ...
##  $ arrival_year                        : int  2017 2018 2018 2018 2018 2018 2017 2018 2018 2018 ...
##  $ arrival_month                       : int  10 11 2 5 4 9 10 12 7 10 ...
##  $ arrival_date                        : int  2 6 28 20 11 13 15 26 6 18 ...
##  $ market_segment_type                 : chr  "Offline" "Online" "Online" "Online" ...
##  $ repeated_guest                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ avg_price_per_room                  : num  65 106.7 60 100 94.5 ...
##  $ no_of_special_requests              : int  0 1 0 0 0 1 1 1 1 3 ...
##  $ booking_status                      : chr  "Not_Canceled" "Not_Canceled" "Canceled" "Canceled" ...

Data summary:

# Print per-column summaries (quartiles and mean for the numeric columns).
summary(data)
##   Booking_ID         no_of_adults   no_of_children    no_of_weekend_nights
##  Length:36275       Min.   :0.000   Min.   : 0.0000   Min.   :0.0000      
##  Class :character   1st Qu.:2.000   1st Qu.: 0.0000   1st Qu.:0.0000      
##  Mode  :character   Median :2.000   Median : 0.0000   Median :1.0000      
##                     Mean   :1.845   Mean   : 0.1053   Mean   :0.8107      
##                     3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.:2.0000      
##                     Max.   :4.000   Max.   :10.0000   Max.   :7.0000      
##  no_of_week_nights type_of_meal_plan  required_car_parking_space
##  Min.   : 0.000    Length:36275       Min.   :0.00000           
##  1st Qu.: 1.000    Class :character   1st Qu.:0.00000           
##  Median : 2.000    Mode  :character   Median :0.00000           
##  Mean   : 2.204                       Mean   :0.03099           
##  3rd Qu.: 3.000                       3rd Qu.:0.00000           
##  Max.   :17.000                       Max.   :1.00000           
##  room_type_reserved   lead_time       arrival_year  arrival_month   
##  Length:36275       Min.   :  0.00   Min.   :2017   Min.   : 1.000  
##  Class :character   1st Qu.: 17.00   1st Qu.:2018   1st Qu.: 5.000  
##  Mode  :character   Median : 57.00   Median :2018   Median : 8.000  
##                     Mean   : 85.23   Mean   :2018   Mean   : 7.424  
##                     3rd Qu.:126.00   3rd Qu.:2018   3rd Qu.:10.000  
##                     Max.   :443.00   Max.   :2018   Max.   :12.000  
##   arrival_date  market_segment_type repeated_guest   
##  Min.   : 1.0   Length:36275        Min.   :0.00000  
##  1st Qu.: 8.0   Class :character    1st Qu.:0.00000  
##  Median :16.0   Mode  :character    Median :0.00000  
##  Mean   :15.6                       Mean   :0.02564  
##  3rd Qu.:23.0                       3rd Qu.:0.00000  
##  Max.   :31.0                       Max.   :1.00000  
##  no_of_previous_cancellations no_of_previous_bookings_not_canceled
##  Min.   : 0.00000             Min.   : 0.0000                     
##  1st Qu.: 0.00000             1st Qu.: 0.0000                     
##  Median : 0.00000             Median : 0.0000                     
##  Mean   : 0.02335             Mean   : 0.1534                     
##  3rd Qu.: 0.00000             3rd Qu.: 0.0000                     
##  Max.   :13.00000             Max.   :58.0000                     
##  avg_price_per_room no_of_special_requests booking_status    
##  Min.   :  0.00     Min.   :0.0000         Length:36275      
##  1st Qu.: 80.30     1st Qu.:0.0000         Class :character  
##  Median : 99.45     Median :0.0000         Mode  :character  
##  Mean   :103.42     Mean   :0.6197                           
##  3rd Qu.:120.00     3rd Qu.:1.0000                           
##  Max.   :540.00     Max.   :5.0000

Fix data types:

# Convert the categorical text columns to factors in a single pass.
data <- data %>%
  mutate(
    across(
      c(
        type_of_meal_plan,
        room_type_reserved,
        market_segment_type,
        booking_status
      ),
      as.factor
    )
  )

Select numeric columns for PCA:

# Names of the numeric columns used for the dimensionality-reduction steps.
# NOTE(review): `lead_time` is numeric in the data but is not selected here;
# confirm that leaving it out is intentional.
pca_cols <- c(
  "no_of_adults", "no_of_children",
  "no_of_weekend_nights", "no_of_week_nights",
  "required_car_parking_space",
  "arrival_year", "arrival_month", "arrival_date",
  "repeated_guest",
  "no_of_previous_cancellations", "no_of_previous_bookings_not_canceled",
  "avg_price_per_room",
  "no_of_special_requests"
)

# Keep only those columns, in the order listed above.
data.pca <- data[pca_cols]

Show correlations:

# Pairwise Pearson correlations between the selected numeric columns,
# drawn with the variables in alphabetical order and smaller axis labels.
corr.df <- cor(data.pca, method = "pearson")
corrplot(corr.df, order = "alphabet", tl.cex = 0.6)

Scale the data

# Standardize every column (clusterSim type "n1": (x - mean) / sd).
data.pca <- data.Normalization(data.pca, type = "n1", normalization = "column")

# Drop any row that contains a missing value after normalization.
data.pca <- data.pca[complete.cases(data.pca), ]

PCA

Explained variances

# Fit the PCA on the normalized columns, then draw the scree plot of the
# variance explained by each component.
sample.pca <- prcomp(data.pca)
fviz_screeplot(sample.pca)

Contributions of the individual variables to the first principal component (the default axis of fviz_contrib)

# Bar plot of each variable's contribution; `axes` is not passed, so
# factoextra's default (the first component) applies.
fviz_contrib(sample.pca, choice = "var")

MDS

Calculate dissimilarities – distance matrix between the variables (the data is transposed, so each variable is treated as one object)

# Transpose so that each variable becomes a row, then compute the pairwise
# distances between variables (dist() default metric).
dist.sub <- dist(t(data.pca))

Run MDS

# Two-dimensional ratio MDS on the variable-to-variable distances.
mds.object <- mds(dist.sub, ndim = 2, type = "ratio")

Stress Decomposition Chart

# Stress decomposition chart for the fitted MDS solution.
plot(mds.object, plot.type = "stressplot")

MDS for selected variables

# Configuration plot of the variables; point size follows each variable's
# stress-per-point value (`spp`).
plot(
  mds.object,
  pch = 21,
  cex = as.numeric(mds.object$spp),
  bg = "coral2",
  main = "MDS for selected variables"
)

t-SNE

Perform t-SNE

# t-SNE is stochastic: fix the RNG seed so the embedding (and the plot built
# from it) is the same on every run.
set.seed(42)

# check_duplicates = FALSE skips Rtsne's check for identical input rows.
tsne.obj <- Rtsne(data.pca, check_duplicates = FALSE)

Prepare data for visualization

# The join below pairs reservations with t-SNE coordinates purely by row
# position. That is only valid when the matrix given to Rtsne() has exactly
# one row per row of `data`; if rows were dropped earlier (e.g. by the
# complete.cases() filter), the labels would silently be attached to the
# wrong points, so fail loudly instead.
stopifnot(nrow(tsne.obj$Y) == nrow(data))

# Tag every reservation with its row position.
tnse.data <- data %>%
  mutate(id = row_number())

# Turn the embedding matrix into a data frame (columns V1, V2) with the
# same positional id.
tsne_results <- tsne.obj$Y %>%
  as.data.frame() %>%
  mutate(id = row_number())

# Attach the t-SNE coordinates to the original columns.
tnse.data <- tnse.data %>%
  inner_join(tsne_results, by = "id")

Visualize results

# Scatter plot of the two t-SNE dimensions, colored by booking status.
ggplot(tnse.data, aes(x = V1, y = V2, color = booking_status)) +
  geom_point()

Conclusions

In conclusion, dimensionality reduction techniques were used to explore the Hotel Reservations Classification Dataset from Kaggle. The goal was to determine which customers are likely to cancel their reservations. To achieve this, several R libraries were utilized, including FactoMineR, corrplot, smacof, clusterSim, tidyverse, ggplot2, dplyr, factoextra, and Rtsne. Exploratory data analysis was performed by reading and summarizing the data, fixing the data types, and selecting numeric columns for principal component analysis (PCA). The PCA was then performed to determine the explained variances and contributions of certain variables, and multidimensional scaling (MDS) was performed to calculate dissimilarities and prepare visualizations. Finally, t-SNE was performed to further reduce the dimensions and prepare the data for visualization.