1 Summary

I have tested and explored several clustering methods (algorithms) to create relevant markets of hotels in Houston, Texas (Houston is chosen as a sample city).

I have found that Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is valid for the Houston hotel market (details of DBSCAN are below).

As a result of clustering by DBSCAN, there are 37 clusters from 471 hotels. 19 of the 471 hotels are not part of any cluster and will be considered monopolies in their local markets.

2 Update (April 9, 2019)

2.0.1 Load data

load("hdata6.new.Rdata")

2.0.2 Load packages

library(ggmap)
library(dplyr)

2.1 Descriptive Analysis

At this point, I am focusing on the analysis of hotels in Houston, Texas, the city with the largest number of hotels in Texas.

2.1.1 Hotel Geographical Distribution by MSA in Texas

plots.msa.each <- list()
map.msa.list <- list()
for (i in 1:25){
  # bounding box for MSA i: median center, min/max extent
  lon.m <- median(hdata6[hdata6$id.msa==i, c("lon")])
  lon.s <- min(hdata6[hdata6$id.msa==i, c("lon")])
  lon.h <- max(hdata6[hdata6$id.msa==i, c("lon")])
  lat.m <- median(hdata6[hdata6$id.msa==i, c("lat")])
  lat.s <- min(hdata6[hdata6$id.msa==i, c("lat")])
  lat.h <- max(hdata6[hdata6$id.msa==i, c("lat")])
  center.m <- c(lon.m, lat.m)
  box.m <- c(lon.s, lat.s, lon.h, lat.h)
  msa <- hdata6$msa[hdata6$id.msa==i][1]
  #map.msa <- get_googlemap(center.m, zoom=8, maptype = "roadmap")
  map.msa  <- get_stamenmap(bbox = box.m, zoom=10, maptype = "terrain-lines", messaging = F)
  map.msa.list[[i]] <- map.msa
  plot.msa <- ggmap(map.msa, extent="normal") + 
                geom_point(aes(x=lon, y=lat, color=city), alpha=0.8, data = hdata6[hdata6$id.msa==i,], size=1, na.rm=TRUE) +
    xlab("Longitude") + ylab("Latitude") + ggtitle(msa) + theme()
    
  plots.msa.each[[i]] <- plot.msa
}

2.1.1.1 Hotel Distribution in the Houston MSA

plots.msa.each[[11]]

2.1.1.2 Hotel Distribution in Houston (at the city level)

library('ggsn') # package to create scale bars on maps (companion to ggmap)
library('ggmap')
# create a map for Houston
lon.m <- median(hdata6[hdata6$city=="Houston", c("lon")])
lon.s <- min(hdata6[hdata6$city=="Houston", c("lon")])
lon.h <- max(hdata6[hdata6$city=="Houston", c("lon")])
lat.m <- median(hdata6[hdata6$city=="Houston", c("lat")])
lat.s <- min(hdata6[hdata6$city=="Houston", c("lat")])
lat.h <- max(hdata6[hdata6$city=="Houston", c("lat")])
center.m <- c(lon.m, lat.m)
box.houston <- c(lon.s-0.01, lat.s-0.01, lon.h+0.01, lat.h+0.01) # pad the bounding box slightly on all sides
map.houston <- get_stamenmap(bbox = box.houston, zoom=10, maptype = "terrain-lines")


map.houston.all <- ggmap(map.houston, extent="normal") +
                      geom_point(aes(x=lon, y=lat, color=chain_name), alpha=0.8, data = hdata6[which(hdata6$city=="Houston"),], size=1.5, na.rm=TRUE) +
                      scalebar(x.min = lon.s, x.max = lon.h,
                               y.min = lat.s, y.max = lat.h,
                               dist = 2.5, dist_unit = "mi",
                               st.bottom = F, st.color = "black", height = 0.01, st.dist = 0.02, st.size=2,
                               transform = T, model = "WGS84") + 
                      xlab("Longitude") + ylab("Latitude") + ggtitle("Hotels in Houston, TX") + labs(color="Chain") + theme(legend.key.size = unit(0.1, "cm"), legend.position = "bottom", legend.text = element_text(size=8), legend.title = element_text(size=8))


map.houston.brand <- ggmap(map.houston, extent="normal") +
  geom_point(aes(x=lon, y=lat, color=chain_name), alpha=0.8, data = hdata6[which(hdata6$city=="Houston" & hdata6$chain>0),], size=1.5, na.rm=TRUE) +
  blank() +
  scalebar(x.min = lon.s, x.max = lon.h,
           y.min = lat.s, y.max = lat.h,
           dist = 2.5, dist_unit = "mi",
           st.bottom = F, st.color = "black", height = 0.01, st.dist = 0.02, st.size=2,
           transform = T, model = "WGS84") + 
  xlab("Longitude") + ylab("Latitude") + ggtitle("Chain Hotels in Houston, TX") + 
  labs(color="Chain")  + theme(legend.key.size = unit(0.1, "cm"), legend.position = "bottom", legend.text = element_text(size=8), legend.title = element_text(size=8))


map.houston.all

map.houston.brand

2.1.1.3 Hotel Brands and Chains in Houston, TX (2014 Q1)

The following table summarizes hotels in Houston, TX by hotel brand and hotel chain. There are 75 hotel brands, and these brands belong to 24 hotel chains.

library('dplyr')
library('DT')
tab.chain.hotel.houston <- hdata6[which(hdata6$city=="Houston" & hdata6$qtr==1),] %>% group_by(chain_name, brand, rating) %>% tally()
datatable(tab.chain.hotel.houston)

3 Clustering Analysis

I explain DBSCAN first and the other clustering methods later.

3.1 Density-Based Spatial Clustering of Applications with Noise (DBSCAN)

This approach aims to find dense regions, where density is measured by the number of points close to a given point.

DBSCAN requires two parameters: a distance (\(\epsilon\), or eps) and the minimum number of points (minPts). Given a point \(i\), a neighborhood is defined by the radius \(\epsilon\). If this neighborhood contains at least minPts points, the point is considered a core point and forms a cluster. Other points in the cluster are considered border points. If any border point has at least minPts points within its own neighborhood, it is also a core point and forms its own cluster; since these clusters are close enough, they are merged into one. Any remaining points (neither core nor border) are considered noise.

Figure: Core, border, and noise points in DBSCAN.
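To make the roles of eps and minPts concrete, here is a minimal sketch on made-up coordinates (not the hotel data): five tight points are core points forming a cluster, a sixth nearby point joins it as a border point, and a distant point is left as noise.

library('dbscan')

# toy 2-D points: a tight group of five, one nearby point, one far-away point
pts <- cbind(x = c(0, 0.1, 0, 0.1, 0.05, 0.3, 2),
             y = c(0, 0, 0.1, 0.1, 0.05, 0.1, 2))

# eps sets the neighborhood radius; minPts sets the density threshold
db.toy <- dbscan::dbscan(pts, eps = 0.25, minPts = 5)
db.toy$cluster # 0 marks noise; the other six points share cluster 1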

3.1.1 DBSCAN for the Houston Hotel Market (First Quarter of 2014)

I conduct DBSCAN twice to capture potential clusters with smaller numbers of hotels (points).

  1. DBSCAN with eps = 0.02 and minPts = 5. As a result of this clustering, 21 clusters are formed, and 58 out of 471 hotels do not belong to any cluster.

  2. DBSCAN with eps = 0.02 and minPts = 2. To deal with these unclustered points, I conduct a second DBSCAN with the same eps but a smaller minPts = 2 than in the first run. By doing this, I can allocate hotels to clusters even when they are far from the existing clusters and the number of hotels in a small area is not great.

3.1.1.1 Final Cluster of DBSCAN

library("fpc")
library("dbscan")
library("factoextra")
library("plotly")

key.var.fld <- c("lat", "lon")  # variables used for measure similarity between pairs
dat.1 <- hdata6[which(hdata6$city=="Houston" & hdata6$qtr==1), ] # choose hotels in Houston, TX
w.raw <- dat.1[, key.var.fld] # raw data
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)

distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over all hotels


# eps determination using the k-nearest-neighbor distance plot

dbscan::kNNdistplot(w.scale, k=4)

db.hous <- fpc::dbscan(w.scale, eps=0.2, MinPts=5)


fviz_cluster(db.hous, data = w.scale, stand = FALSE,
             ellipse = FALSE, show.clust.cent = FALSE,
             geom = "point",palette = "jco", ggtheme = theme_classic())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 14.
## Warning: Removed 46 rows containing missing values (geom_point).

db.hous2 <- dbscan::dbscan(x=w.scale, eps=0.2, minPts = 4)
w.scale1 <- as.data.frame(cbind(w.scale, db.hous2$cluster))
names(w.scale1) <- c("lat", "lon", "cluster")

# use raw data (lat and lon)
dbscan::kNNdistplot(w.raw, k=4)

db.hous3 <- dbscan::dbscan(x=w.raw, eps=0.02, minPts = 4)
summary(db.hous3)
##         Length Class  Mode   
## cluster 471    -none- numeric
## eps       1    -none- numeric
## minPts    1    -none- numeric
names(db.hous3)
## [1] "cluster" "eps"     "minPts"
w.scale2 <- as.data.frame(cbind(w.raw, db.hous3$cluster))
names(w.scale2) <- c("lat", "lon", "cluster")

w.scale2.original <- w.scale2

ggplot(w.scale2.original[w.scale2.original$cluster>0,]) + geom_point(aes(x=lat, y=lon, color=cluster)) + scale_color_gradientn(colours = rainbow(5))

ggplot(w.scale2.original) + geom_point(aes(x=lat, y=lon, color=cluster)) + 
  scale_color_gradientn(colours = rainbow(14))

# re-run DBSCAN for points that are not in any cluster
dim(w.scale2)
## [1] 471   3
dim(w.scale2[w.scale2$cluster==0,])
## [1] 58  3
w.scale2.unclu <- w.scale2[w.scale2$cluster==0, c("lat", "lon")]
db.hous2.unclu <- dbscan::dbscan(x=w.scale2.unclu, eps=0.02, minPts = 2)
db.hous2.unclu$cluster
##  [1]  1  2  1  2  3  3  4  4  0  0  0  0  0  5  0  6  6  6  7  7  0  8  8
## [24]  0  9  9  5  5  0  0  0 10 11 11 10  8  0  0  0  0 12 12  6  0 13 13
## [47] 13 14 14 14 15 14  0  0 15  0 16 16
# bind w.scale2.unclu with the second-pass cluster labels
w.scale2.unclu2 <- as.data.frame(cbind(w.scale2.unclu, db.hous2.unclu$cluster))
names(w.scale2.unclu2) <- c("lat", "lon", "cluster")
w.scale2.unclu2$cluster <- w.scale2.unclu2$cluster + 21   # shift labels past the 21 first-pass clusters
w.scale2.unclu2$cluster[w.scale2.unclu2$cluster==21] <- 0 # second-pass noise (0 + 21) stays noise
w.scale2[w.scale2$cluster==0, c("cluster")] <-  w.scale2.unclu2$cluster

table(w.scale2$cluster)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##  19 115  56  15  29   6  17  39  14  34   4   5   5  14   8   4   4  15 
##  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35 
##   4  14   4   7   2   2   2   2   3   4   2   3   2   2   2   2   3   4 
##  36  37 
##   2   2
houst.cluster <- ggplot(w.scale2) + geom_point(aes(x=lat, y=lon, color=cluster)) + 
                    scale_color_gradientn(colours = rainbow(14))
ggplotly(houst.cluster)
save(hdata6, dat.1, w.scale2, file="cluster.Rdata")
# clustering done


# OPTICS orders points by reachability distance instead of fixing a single eps
opt.hous <- optics(x= w.scale, eps = 0.2,  minPts = 5)
#opt.hous


# extract a DBSCAN-like clustering from the OPTICS ordering
opt.hous.extra <- extractDBSCAN(opt.hous, eps_cl =  0.2)
#plot(opt.hous.extra)
#hullplot(w.scale, opt.hous.extra)

# extractXi: extract clusters of varying density from the ordering
xi.hous.extra <- extractXi(opt.hous, xi=0.05)
#xi.hous.extra
#plot(xi.hous.extra)
#hullplot(w.scale, xi.hous.extra )

The following sections are an exercise with various other clustering methods (FFR).

3.2 Partition Analysis

3.2.1 K-Means Cluster Analysis

K-means assigns each observation to one of \(K\) clusters so as to minimize the within-cluster sum of squares. The objective function is the following:

\[ \min \; \sum_{j=1}^{K} \sum_{i=1}^{n_j} \left( X_{ij} - C_j \right)^2 \] where \(j = 1, \cdots, K\) indicates cluster \(j\), \(i = 1, \cdots, n_j\) indicates observation \(i\) in cluster \(j\), \(X_{ij}\) is the characteristics of observation \(i\) in cluster \(j\), and \(C_j\) represents the center of cluster \(j\).
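As a quick numerical check of this objective, base R's kmeans() reports the minimized total within-cluster sum of squares as tot.withinss; a minimal sketch on simulated data (not the hotel data):

set.seed(1)
toy <- matrix(rnorm(100), ncol = 2) # 50 simulated points in 2-D
km <- kmeans(toy, centers = 3, nstart = 25)

# recompute the objective by hand: squared distances from points to their centers
wcss <- sum((toy - km$centers[km$cluster, ])^2)
all.equal(wcss, km$tot.withinss) # expect TRUE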

Selecting the key variables that describe similarity between pairs of observations plays a decisive role in defining the clusters.

library('class')
library('factoextra')

# W
# create matrix with variables used to calculate Euclidean distance
# 
key.var.fld <- c("lat", "lon")
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon          
##  Min.   :-2.2092   Min.   :-2.21383  
##  1st Qu.:-0.7312   1st Qu.:-0.88523  
##  Median :-0.1747   Median : 0.06784  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164  
##  Max.   : 2.3375   Max.   : 2.76351
distance.h <- dist(w.scale)

# visualization of W
fviz_dist(distance.h, show_labels = F, gradient = list(low = "red", mid = "white", high = "blue"))

# find out optimal  K (# of clusters)
library('factoextra')
fviz_nbclust(x = w.scale, kmeans, method = "wss") # find out the optimal K (# of clusters)

Based on the above graph, the total within-cluster sum of squares (WCSS) is roughly constant after six clusters, so I use K = 6.

kmean.hou<- kmeans(w.scale, 6, nstart =25) # K = 6 
fviz_cluster(kmean.hou, data = w.scale,
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1,
             ggtheme = theme_minimal())
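For reference, the elbow curve produced by fviz_nbclust(..., method = "wss") can be reproduced by hand; a minimal sketch, assuming w.scale as defined above:

# total WCSS for K = 1, ..., 10; the "elbow" is where the curve flattens
wss <- sapply(1:10, function(k) kmeans(w.scale, centers = k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "Number of clusters K", ylab = "Total within-cluster sum of squares")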

However, if other variables are added to the measure of similarity between pairs of observations, the resulting clusters will overlap when drawn on a map.

key.var.fld <- c("lat", "lon","adr") # key variables are xy coordinates and price(ADR)
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon                adr         
##  Min.   :-2.2092   Min.   :-2.21383   Min.   :-1.2852  
##  1st Qu.:-0.7312   1st Qu.:-0.88523   1st Qu.:-0.7995  
##  Median :-0.1747   Median : 0.06784   Median :-0.3480  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164   3rd Qu.: 0.6536  
##  Max.   : 2.3375   Max.   : 2.76351   Max.   : 5.2228
kmean.hou<- kmeans(w.scale, 6, nstart =25) # K = 6 
fviz_cluster(kmean.hou, data = w.scale,
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1,
             ggtheme = theme_minimal())

3.2.2 Partitioning Around Medoids (PAM) Cluster Analysis

Partitioning Around Medoids is similar to K-means cluster analysis, but PAM relies on a medoid (an actual data point) in each cluster. K-means uses the centroid of the points within a cluster, while PAM uses an actual medoid, the observation whose average dissimilarity to all other points in the cluster is minimal.
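A toy illustration of the difference (the numbers are made up): the centroid need not be an observed point, but the medoid always is.

x <- c(0, 1, 10)
mean(x) # centroid: 3.67, not an observed point
# medoid: the observed point with the smallest total dissimilarity to the others
d <- as.matrix(dist(x))
x[which.min(rowSums(d))] # 1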

library('cluster')

key.var.fld <- c("lat", "lon") 
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon          
##  Min.   :-2.2092   Min.   :-2.21383  
##  1st Qu.:-0.7312   1st Qu.:-0.88523  
##  Median :-0.1747   Median : 0.06784  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164  
##  Max.   : 2.3375   Max.   : 2.76351
pam.hou <- pam(w.scale, 6)
pam.hou$medoids
##             lat        lon
## [1,] -0.8017927  0.3045097
## [2,] -0.4110922 -0.1933747
## [3,] -1.1423417  1.4828063
## [4,]  1.3205145  0.4082601
## [5,] -0.5188778 -1.1371846
## [6,]  0.9939465 -0.9650722
fviz_cluster(pam.hou,       
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1,
             ggtheme = theme_minimal())

key.var.fld <- c("lat", "lon", "adr") # key variables are xy coordinates and price(ADR)
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon                adr         
##  Min.   :-2.2092   Min.   :-2.21383   Min.   :-1.2852  
##  1st Qu.:-0.7312   1st Qu.:-0.88523   1st Qu.:-0.7995  
##  Median :-0.1747   Median : 0.06784   Median :-0.3480  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164   3rd Qu.: 0.6536  
##  Max.   : 2.3375   Max.   : 2.76351   Max.   : 5.2228
pam.hou <- pam(w.scale, 6)
pam.hou$medoids
##             lat        lon        adr
## [1,] -0.3890006 -0.1478687  1.3402753
## [2,] -0.9628511  0.7413767 -0.6668548
## [3,]  0.3170396  0.8581243 -0.7808984
## [4,] -0.5195013 -1.0272327 -0.1392102
## [5,]  0.6517094 -0.6780180 -0.6853484
## [6,]  1.4711685  0.1480888 -0.3480333
fviz_cluster(pam.hou,       
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1,
             ggtheme = theme_minimal())

3.3 Hierarchical Cluster Analysis

In general, hierarchical cluster analysis groups points into clusters based on the similarity between pairs of observations. At the beginning, the analysis treats each observation as its own cluster and then successively merges the clusters that are most similar.
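A minimal sketch of this agglomeration on made-up one-dimensional data:

# five points; hclust starts from singletons and merges the closest pair first
x <- c(1, 2, 9, 10, 25)
hc <- hclust(dist(x), method = "ward.D2")
hc$merge # each row records one merge; negative entries are original observations
plot(hc) # dendrogram: (1,2) and (9,10) merge first, 25 joins last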

key.var.fld <- c("lat", "lon") 
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon          
##  Min.   :-2.2092   Min.   :-2.21383  
##  1st Qu.:-0.7312   1st Qu.:-0.88523  
##  Median :-0.1747   Median : 0.06784  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164  
##  Max.   : 2.3375   Max.   : 2.76351
distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over all hotels
hc.hous <- hclust(d = distance.hous, method = "ward.D2")

fviz_dend(hc.hous, cex = 0.5)

# K = 6 
# cut tree into 6
hc.hous.cut <- cutree(hc.hous, k = 6)
fviz_dend(hc.hous, k=6, cex = 0.5)

fviz_cluster(list(data= w.scale, cluster =hc.hous.cut), ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1
             )

I change the key variables to include price (ADR):

key.var.fld <- c("lat", "lon", "adr") 
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()

# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
##       lat               lon                adr         
##  Min.   :-2.2092   Min.   :-2.21383   Min.   :-1.2852  
##  1st Qu.:-0.7312   1st Qu.:-0.88523   1st Qu.:-0.7995  
##  Median :-0.1747   Median : 0.06784   Median :-0.3480  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6883   3rd Qu.: 0.60164   3rd Qu.: 0.6536  
##  Max.   : 2.3375   Max.   : 2.76351   Max.   : 5.2228
distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over all hotels
hc.hous <- hclust(d = distance.hous, method = "ward.D2")

fviz_dend(hc.hous, cex = 0.5)

# K = 6 
# cut tree into 6
hc.hous.cut <- cutree(hc.hous, k = 6)
fviz_dend(hc.hous, k=6, cex = 0.5)

fviz_cluster(list(data= w.scale, cluster =hc.hous.cut), ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             geom = c("point"),
             labelsize = 5, pointsize = 1
             )

3.4 Advanced Clustering Method

3.4.1 Model-based clustering
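Model-based clustering fits a finite Gaussian mixture model by the EM algorithm; Mclust() selects both the covariance structure and the number of components by BIC.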

library(mclust)
mclust.hous <- Mclust(w.scale)
summary(mclust.hous)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EVV (ellipsoidal, equal volume) model with 9 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -1519.468 471 81 -3537.479 -3633.649
## 
## Clustering table:
##  1  2  3  4  5  6  7  8  9 
## 67 40 44 61 38 92 50 25 54
library(factoextra)
fviz_mclust(mclust.hous, "classification", geom = "point", 
            pointsize = 1.5, palette = "jco")

fviz_mclust(mclust.hous, "uncertainty", palette = "jco")

3.5 Fuzzy clustering
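Fuzzy clustering assigns each observation a degree of membership in every cluster rather than a hard label. A minimal sketch using fanny() from the cluster package, assuming w.scale as defined above:

library('cluster')
fanny.hous <- fanny(w.scale, k = 6, metric = "euclidean", stand = FALSE)
head(fanny.hous$membership) # membership degrees sum to 1 across clusters
fviz_cluster(fanny.hous, ellipse.type = "convex", palette = "jco",
             repel = TRUE, geom = c("point"), labelsize = 5, pointsize = 1,
             ggtheme = theme_minimal())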

3.6 Hierarchical K-Means Clustering

library('factoextra')
hkmeans.hous <- hkmeans(w.scale, k=6)
fviz_dend(hkmeans.hous, cex = 0.6, palette = "jco", 
          rect = TRUE, rect_border = "jco", rect_fill = TRUE)

fviz_cluster(hkmeans.hous, palette = "jco", repel = TRUE, geom =c("point"),
             ggtheme = theme_classic())