K-Means clustering of hotel guests

Loading the dataset (a hotel's customers dataset)

Creating the necessary fields for clustering

library(readxl); library(tidyverse)
# Read the raw customer table from the Excel workbook in the working directory.
cust <- read_excel(path = 'HotelCustomersDataset.xlsx')

# Build the table used for clustering:
#   1. coerce Age, LodgingRevenue and OtherRevenue to numeric;
#   2. keep only rows that pass every range check below (filter() also
#      discards rows where a check evaluates to NA);
#   3. derive ALR = LodgingRevenue / RoomNights.
# NOTE(review): RoomNights is never checked for zero, which would make ALR
# infinite -- confirm the data rules this out. No extra filter is added here
# because later steps drop rows by position within cust_req.
cust_req <- cust %>%
  mutate(
    across(all_of(c('Age', 'LodgingRevenue', 'OtherRevenue')), as.numeric)
  ) %>%
  filter(
    Age > 0, LodgingRevenue > 0, OtherRevenue >= 0,
    AverageLeadTime >= 0, DaysSinceLastStay >= 0, DaysSinceFirstStay >= 0
  ) %>%
  mutate(ALR = LodgingRevenue / RoomNights)

Standardising the fields in the dataframe

# Row positions within cust_req that are excluded as outliers.
outliers <- c(1181, 9036, 10472, 17279, 48990, 54834)
# Columns dropped before scaling.
fields <- c('Nationality', 'NameHash', 'DocIDHash', 'DistributionChannel', 'MarketSegment')
# Standardise every remaining column (centre and scale).
# all_of() makes it explicit that `fields` is an external character vector,
# which avoids tidyselect's "ambiguous external vector" note.
# scale() returns a one-column matrix; c() flattens it back to a plain vector.
# NOTE(review): ID is not removed here (it is only dropped in the later
# per-cluster summary), so it is scaled and becomes a clustering feature --
# confirm whether that is intended.
cust_scaled <- cust_req %>%
  select(-all_of(fields)) %>%
  slice(-outliers) %>%
  mutate(across(everything(), ~ c(scale(.x))))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(fields)` instead of `fields` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Fixing the final number of clusters (2)

library(cluster); library(factoextra); library(gridExtra)
# kmeans() draws its starting centres at random, so fix the RNG seed to make
# the partition (and which group is labelled 1 vs 2) repeatable across runs.
set.seed(123)
# Two clusters; nstart = 25 tries 25 random starts and keeps the best fit.
final <- kmeans(cust_scaled, centers = 2, nstart = 25)
# Number of observations assigned to each cluster.
print(final$size)
## [1] 30709 29640
# Plot the two clusters as points, with almost fully transparent ellipses.
fviz_cluster(
  object = final,
  data = cust_scaled,
  geom = 'point',
  ellipse.alpha = 0.005
)

Analyzing the characteristics for each cluster

# Print every column of a tibble instead of truncating to the console width.
options(dplyr.width = Inf)
# Mean and standard deviation of each numeric field per cluster, computed on
# the unscaled values. The same outlier rows are removed as for cust_scaled so
# that final$cluster lines up row-for-row with the data.
# all_of() marks `fields` as an external character vector (no tidyselect note).
# summarise_all() is kept on purpose: it yields every *_mean column followed by
# every *_sd column, which is the layout of the printed result.
cust_req %>%
  select(-all_of(fields), -ID) %>%
  slice(-outliers) %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise_all(list(mean = mean, sd = sd))
## # A tibble: 2 x 53
##   Cluster Age_mean DaysSinceCreation_mean AverageLeadTime_mean
##     <int>    <dbl>                  <dbl>                <dbl>
## 1       1     48.1                   771.                 87.0
## 2       2     46.5                   256.                 89.1
##   LodgingRevenue_mean OtherRevenue_mean BookingsCanceled_mean
##                 <dbl>             <dbl>                 <dbl>
## 1                341.              90.3               0.00221
## 2                460.              89.0               0.00172
##   BookingsNoShowed_mean BookingsCheckedIn_mean PersonsNights_mean
##                   <dbl>                  <dbl>              <dbl>
## 1              0.000749                   1.04               6.20
## 2              0.000506                   1.04               6.17
##   RoomNights_mean DaysSinceLastStay_mean DaysSinceFirstStay_mean
##             <dbl>                  <dbl>                   <dbl>
## 1            3.06                   771.                    774.
## 2            3.18                   257.                    259.
##   SRHighFloor_mean SRLowFloor_mean SRAccessibleRoom_mean SRMediumFloor_mean
##              <dbl>           <dbl>                 <dbl>              <dbl>
## 1           0.0432         0.00111              0.000326           0.00117 
## 2           0.0483         0.00155              0.000236           0.000641
##   SRBathtub_mean SRShower_mean SRCrib_mean SRKingSizeBed_mean SRTwinBed_mean
##            <dbl>         <dbl>       <dbl>              <dbl>          <dbl>
## 1        0.00231       0.00153     0.00804              0.349          0.139
## 2        0.00321       0.00213     0.0109               0.356          0.135
##   SRNearElevator_mean SRAwayFromElevator_mean SRNoAlcoholInMiniBar_mean
##                 <dbl>                   <dbl>                     <dbl>
## 1            0.000195                 0.00501                  0       
## 2            0.000574                 0.00250                  0.000169
##   SRQuietRoom_mean ALR_mean Age_sd DaysSinceCreation_sd AverageLeadTime_sd
##              <dbl>    <dbl>  <dbl>                <dbl>              <dbl>
## 1           0.0953     115.   15.1                 161.               91.5
## 2           0.0853     147.   14.8                 149.               90.9
##   LodgingRevenue_sd OtherRevenue_sd BookingsCanceled_sd BookingsNoShowed_sd
##               <dbl>           <dbl>               <dbl>               <dbl>
## 1              276.            117.              0.0655              0.0307
## 2              439.            125.              0.0529              0.0239
##   BookingsCheckedIn_sd PersonsNights_sd RoomNights_sd DaysSinceLastStay_sd
##                  <dbl>            <dbl>         <dbl>                <dbl>
## 1                0.510             4.14          1.85                 165.
## 2                0.335             4.36          1.90                 149.
##   DaysSinceFirstStay_sd SRHighFloor_sd SRLowFloor_sd SRAccessibleRoom_sd
##                   <dbl>          <dbl>         <dbl>               <dbl>
## 1                  161.          0.203        0.0333              0.0180
## 2                  149.          0.214        0.0394              0.0154
##   SRMediumFloor_sd SRBathtub_sd SRShower_sd SRCrib_sd SRKingSizeBed_sd
##              <dbl>        <dbl>       <dbl>     <dbl>            <dbl>
## 1           0.0342       0.0480      0.0391    0.0893            0.477
## 2           0.0253       0.0565      0.0461    0.104             0.479
##   SRTwinBed_sd SRNearElevator_sd SRAwayFromElevator_sd SRNoAlcoholInMiniBar_sd
##          <dbl>             <dbl>                 <dbl>                   <dbl>
## 1        0.346            0.0140                0.0706                  0     
## 2        0.342            0.0239                0.0499                  0.0130
##   SRQuietRoom_sd ALR_sd
##            <dbl>  <dbl>
## 1          0.294   72.4
## 2          0.279  105.
# For each cluster, count every Nationality / DistributionChannel /
# MarketSegment combination and keep the two most frequent ones.
cust_req %>%
  slice(-outliers) %>%
  mutate(Cluster = final$cluster) %>%
  count(Cluster, Nationality, DistributionChannel, MarketSegment) %>%
  group_by(Cluster) %>%
  slice_max(order_by = n, n = 2)
## # A tibble: 4 x 5
## # Groups:   Cluster [2]
##   Cluster Nationality DistributionChannel   MarketSegment     n
##     <int> <chr>       <chr>                 <chr>         <int>
## 1       1 FRA         Travel Agent/Operator Other          3093
## 2       1 GBR         Travel Agent/Operator Other          2127
## 3       2 FRA         Travel Agent/Operator Other          2120
## 4       2 DEU         Travel Agent/Operator Other          2064