K-Means clustering of hotel guests
Loading the dataset (a hotel's customer dataset)
Creating the necessary fields for clustering
library(readxl); library(tidyverse)
# Load the raw guest records from the Excel workbook.
cust <- read_excel("HotelCustomersDataset.xlsx")

# Build the analysis table:
#   1. coerce Age and the two revenue columns to numeric (values that cannot be
#      parsed become NA),
#   2. keep only rows whose age, revenue, lead-time and stay-recency values are
#      in a valid range -- filter() also drops rows where a condition is NA,
#   3. add ALR = LodgingRevenue / RoomNights (lodging revenue per room night).
cust_req <- cust %>%
  mutate(across(c(Age, LodgingRevenue, OtherRevenue), as.numeric)) %>%
  filter(
    Age > 0 & LodgingRevenue > 0 & OtherRevenue >= 0 & AverageLeadTime >= 0 &
      DaysSinceLastStay >= 0 & DaysSinceFirstStay >= 0
  ) %>%
  # NOTE(review): RoomNights is not range-checked above; a zero would make ALR
  # infinite -- confirm no such rows remain after filtering.
  mutate(ALR = LodgingRevenue / RoomNights)
Standardising the fields in the dataframe
# Row positions within cust_req (i.e. after filtering) that are excluded as
# outliers. These are positional, so they are only valid for this exact
# filtered table.
outliers <- c(1181, 9036, 10472, 17279, 48990, 54834)

# Columns excluded from the feature set before scaling.
fields <- c(
  "Nationality", "NameHash", "DocIDHash", "DistributionChannel", "MarketSegment"
)

# Drop the excluded columns and the outlier rows, then z-score every remaining
# column. all_of() makes the selection by an external character vector
# explicit; c() strips the matrix attributes returned by scale() so that each
# column stays a plain numeric vector.
# NOTE(review): ID appears to remain among the columns at this point and is
# therefore scaled and used as a clustering feature -- confirm this is intended.
cust_scaled <- cust_req %>%
  select(-all_of(fields)) %>%
  slice(-outliers) %>%
  mutate(across(everything(), ~ c(scale(.x))))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(fields)` instead of `fields` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Fixing the final number of clusters (2)
library(cluster); library(factoextra); library(gridExtra)
# kmeans() chooses its starting centres at random, so fix the RNG seed to make
# the partition -- and which cluster is labelled 1 vs 2 -- reproducible.
set.seed(123)

# Partition the scaled guest table into 2 clusters; nstart = 25 runs 25 random
# initialisations and keeps the best one.
final <- kmeans(cust_scaled, centers = 2, nstart = 25)

# Number of guests assigned to each cluster.
print(final$size)
## [1] 30709 29640
# Plot the fitted clusters over the scaled data, drawing observations as points
# with almost fully transparent cluster ellipses.
fviz_cluster(
  final,
  data = cust_scaled,
  geom = 'point',
  ellipse.alpha = 0.005
)

Analyzing the characteristics for each cluster
# Let the summary tibble print all of its columns instead of truncating them.
options(dplyr.width = Inf)

# Per-cluster mean and standard deviation of every remaining column.
# The same excluded columns and outlier rows are removed as for the clustering
# input, so final$cluster lines up row-for-row with this table; ID is also
# dropped from the summary. all_of() makes the selection by an external
# character vector explicit.
# summarise_all() is kept deliberately: it yields all *_mean columns followed
# by all *_sd columns.
cust_req %>%
  select(-all_of(fields), -ID) %>%
  slice(-outliers) %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise_all(list(mean = mean, sd = sd))
## # A tibble: 2 x 53
## Cluster Age_mean DaysSinceCreation_mean AverageLeadTime_mean
## <int> <dbl> <dbl> <dbl>
## 1 1 48.1 771. 87.0
## 2 2 46.5 256. 89.1
## LodgingRevenue_mean OtherRevenue_mean BookingsCanceled_mean
## <dbl> <dbl> <dbl>
## 1 341. 90.3 0.00221
## 2 460. 89.0 0.00172
## BookingsNoShowed_mean BookingsCheckedIn_mean PersonsNights_mean
## <dbl> <dbl> <dbl>
## 1 0.000749 1.04 6.20
## 2 0.000506 1.04 6.17
## RoomNights_mean DaysSinceLastStay_mean DaysSinceFirstStay_mean
## <dbl> <dbl> <dbl>
## 1 3.06 771. 774.
## 2 3.18 257. 259.
## SRHighFloor_mean SRLowFloor_mean SRAccessibleRoom_mean SRMediumFloor_mean
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0432 0.00111 0.000326 0.00117
## 2 0.0483 0.00155 0.000236 0.000641
## SRBathtub_mean SRShower_mean SRCrib_mean SRKingSizeBed_mean SRTwinBed_mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00231 0.00153 0.00804 0.349 0.139
## 2 0.00321 0.00213 0.0109 0.356 0.135
## SRNearElevator_mean SRAwayFromElevator_mean SRNoAlcoholInMiniBar_mean
## <dbl> <dbl> <dbl>
## 1 0.000195 0.00501 0
## 2 0.000574 0.00250 0.000169
## SRQuietRoom_mean ALR_mean Age_sd DaysSinceCreation_sd AverageLeadTime_sd
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0953 115. 15.1 161. 91.5
## 2 0.0853 147. 14.8 149. 90.9
## LodgingRevenue_sd OtherRevenue_sd BookingsCanceled_sd BookingsNoShowed_sd
## <dbl> <dbl> <dbl> <dbl>
## 1 276. 117. 0.0655 0.0307
## 2 439. 125. 0.0529 0.0239
## BookingsCheckedIn_sd PersonsNights_sd RoomNights_sd DaysSinceLastStay_sd
## <dbl> <dbl> <dbl> <dbl>
## 1 0.510 4.14 1.85 165.
## 2 0.335 4.36 1.90 149.
## DaysSinceFirstStay_sd SRHighFloor_sd SRLowFloor_sd SRAccessibleRoom_sd
## <dbl> <dbl> <dbl> <dbl>
## 1 161. 0.203 0.0333 0.0180
## 2 149. 0.214 0.0394 0.0154
## SRMediumFloor_sd SRBathtub_sd SRShower_sd SRCrib_sd SRKingSizeBed_sd
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0342 0.0480 0.0391 0.0893 0.477
## 2 0.0253 0.0565 0.0461 0.104 0.479
## SRTwinBed_sd SRNearElevator_sd SRAwayFromElevator_sd SRNoAlcoholInMiniBar_sd
## <dbl> <dbl> <dbl> <dbl>
## 1 0.346 0.0140 0.0706 0
## 2 0.342 0.0239 0.0499 0.0130
## SRQuietRoom_sd ALR_sd
## <dbl> <dbl>
## 1 0.294 72.4
## 2 0.279 105.
# For each cluster, list the most frequent combinations of Nationality,
# DistributionChannel and MarketSegment (top 2 by count).
cust_req %>%
  slice(-outliers) %>%
  mutate(Cluster = final$cluster) %>%
  count(Cluster, Nationality, DistributionChannel, MarketSegment) %>%
  group_by(Cluster) %>%
  slice_max(order_by = n, n = 2)
## # A tibble: 4 x 5
## # Groups: Cluster [2]
## Cluster Nationality DistributionChannel MarketSegment n
## <int> <chr> <chr> <chr> <int>
## 1 1 FRA Travel Agent/Operator Other 3093
## 2 1 GBR Travel Agent/Operator Other 2127
## 3 2 FRA Travel Agent/Operator Other 2120
## 4 2 DEU Travel Agent/Operator Other 2064