I have tested and explored several clustering methods (algorithms) to define relevant markets for hotels in Houston, Texas (Houston is chosen as a sample city).
I have found that Density-Based Spatial Clustering of Applications with Noise (DBSCAN) works well for the Houston hotel market (details of DBSCAN are below).
As a result of clustering by DBSCAN, there are 37 clusters from 474 hotels. 19 of the 474 hotels are not part of any cluster and will be considered monopolies in their local markets.
load("hdata6.new.Rdata")
library(ggmap)
library(dplyr)
At this point, I focus on the analysis of hotels in Houston, Texas, the city with the largest number of hotels in Texas.
plots.msa.each <- list()
map.msa.list <- list()
for (i in 1:25){
  lon.m <- median(hdata6[hdata6$id.msa==i, c("lon")])
  lon.s <- min(hdata6[hdata6$id.msa==i, c("lon")])
  lon.h <- max(hdata6[hdata6$id.msa==i, c("lon")])
  lat.m <- median(hdata6[hdata6$id.msa==i, c("lat")])
  lat.s <- min(hdata6[hdata6$id.msa==i, c("lat")])
  lat.h <- max(hdata6[hdata6$id.msa==i, c("lat")])
  center.m <- c(lon.m, lat.m)
  box.m <- c(lon.s, lat.s, lon.h, lat.h)
  msa <- hdata6$msa[hdata6$id.msa==i][1]
  #map.msa <- get_googlemap(center.m, zoom=8, maptype = "roadmap")
  map.msa <- get_stamenmap(bbox = box.m, zoom=10, maptype = "terrain-lines", messaging = F)
  map.msa.list[[i]] <- map.msa
  plot.msa <- ggmap(map.msa, extent="normal") +
    geom_point(aes(x=lon, y=lat, color=city), alpha=0.8, data = hdata6[hdata6$id.msa==i,], size=1, na.rm=TRUE) +
    xlab("Longitude") + ylab("Latitude") + ggtitle(msa) + theme()
  plots.msa.each[[i]] <- plot.msa
}
plots.msa.each[[11]]
library('ggsn') # package to add a scale bar to maps created with ggmap
library('ggmap')
# create a map for Houston
lon.m <- median(hdata6[hdata6$city=="Houston", c("lon")])
lon.s <- min(hdata6[hdata6$city=="Houston", c("lon")])
lon.h <- max(hdata6[hdata6$city=="Houston", c("lon")])
lat.m <- median(hdata6[hdata6$city=="Houston", c("lat")])
lat.s <- min(hdata6[hdata6$city=="Houston", c("lat")])
lat.h <- max(hdata6[hdata6$city=="Houston", c("lat")])
center.m <- c(lon.m, lat.m)
box.houston <- c(lon.s-0.01, lat.s-0.01, lon.h+0.01, lat.h+0.01) # pad the bounding box slightly on every side
map.houston <- get_stamenmap(bbox = box.houston, zoom=10, maptype = "terrain-lines")
map.houston.all <- ggmap(map.houston, extent="normal") +
geom_point(aes(x=lon, y=lat, color=chain_name), alpha=0.8, data = hdata6[which(hdata6$city=="Houston"),], size=1.5, na.rm=TRUE) +
scalebar(x.min = lon.s, x.max = lon.h,
y.min = lat.s, y.max = lat.h,
dist = 2.5, dist_unit = "mi",
st.bottom = F, st.color = "black", height = 0.01, st.dist = 0.02, st.size=2,
transform = T, model = "WGS84") +
xlab("Longitude") + ylab("Latitude") + ggtitle("Hotels in Houston, TX") + labs(color="Chain") + theme(legend.key.size = unit(0.1, "cm"), legend.position = "bottom", legend.text = element_text(size=8), legend.title = element_text(size=8))
map.houston.brand <- ggmap(map.houston, extent="normal") +
geom_point(aes(x=lon, y=lat, color=chain_name), alpha=0.8, data = hdata6[which(hdata6$city=="Houston" & hdata6$chain>0),], size=1.5, na.rm=TRUE) +
blank() +
scalebar(x.min = lon.s, x.max = lon.h,
y.min = lat.s, y.max = lat.h,
dist = 2.5, dist_unit = "mi",
st.bottom = F, st.color = "black", height = 0.01, st.dist = 0.02, st.size=2,
transform = T, model = "WGS84") +
xlab("Longitude") + ylab("Latitude") + ggtitle("Chain Hotels in Houston, TX") +
labs(color="Chain") + theme(legend.key.size = unit(0.1, "cm"), legend.position = "bottom", legend.text = element_text(size=8), legend.title = element_text(size=8))
map.houston.all
map.houston.brand
The following table summarizes hotels in Houston, TX by hotel brand and hotel chain. There are 75 hotel brands, and these brands belong to 24 hotel chains.
library('dplyr')
library('DT')
tab.chain.hotel.houston <- hdata6[which(hdata6$city=="Houston" & hdata6$qtr==1),] %>% group_by(chain_name, brand, rating) %>% tally()
datatable(tab.chain.hotel.houston)
I explain the DBSCAN method first and the other clustering methods later.
DBSCAN aims to identify dense regions, where density is measured by the number of points close to a given point.
DBSCAN requires two parameters: a distance (\(\epsilon\), or eps) and a minimum number of points (minPts). For a given point i, a neighborhood is defined by the radius \(\epsilon\). If this neighborhood contains at least minPts points, the point is considered a core point and forms a cluster. Other points in that cluster are considered border points. If a border point has more than minPts points within its own neighborhood, it becomes a core point and forms its own cluster; since such clusters are close enough, they are merged into one. Any remaining points (neither core nor border) are treated as noise.
Core, Border, and Noise points in DBSCAN
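To make the roles of eps and minPts concrete, the following is a minimal sketch on simulated data; the toy points and the parameter values (eps=0.5, minPts=5) are illustrative assumptions, not taken from the hotel sample. Cluster label 0 returned by dbscan::dbscan() marks the noise points.
library(dbscan)
set.seed(1)
# two dense blobs plus a few scattered points that should end up as noise (illustrative data)
toy <- rbind(matrix(rnorm(100, mean = 0, sd = 0.2), ncol = 2),
  matrix(rnorm(100, mean = 3, sd = 0.2), ncol = 2),
  matrix(runif(10, min = -2, max = 5), ncol = 2))
db.toy <- dbscan::dbscan(toy, eps = 0.5, minPts = 5) # eps = neighborhood radius, minPts = minimum neighborhood size
table(db.toy$cluster) # cluster 0 collects the noise points
plot(toy, col = db.toy$cluster + 1, pch = 19) # noise in black, clusters in color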
I conduct DBSCAN twice to cover potential clusters with a smaller number of hotels (points).
DBSCAN with eps=0.02 and minPts = 5: As a result of this first clustering, 21 clusters are formed, and 58 out of 471 hotels do not belong to any cluster.
DBSCAN with eps=0.02 and minPts = 2: To deal with these unclustered points, I conduct a second DBSCAN with the same eps but a smaller minPts=2 than the first. By doing this, I can allocate hotels to clusters even when they are far from the existing clusters and the number of hotels in their local area is small.
library("fpc")
library("dbscan")
library("factoextra")
library("plotly")
key.var.fld <- c("lat", "lon") # variables used for measure similarity between pairs
dat.1 <- hdata6[which(hdata6$city=="Houston" & hdata6$qtr==1), ] # choose hotels in Houston, TX
w.raw <- dat.1[, key.var.fld] # raw data
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over the scaled coordinates
# eps determination using the k-nearest-neighbor distance plot
dbscan::kNNdistplot(w.scale, k=4)
db.hous <- fpc::dbscan(w.scale, eps=0.2, MinPts=5)
fviz_cluster(db.hous, data = w.scale, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_classic())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 14.
## Warning: Removed 46 rows containing missing values (geom_point).
db.hous2 <- dbscan::dbscan(x=w.scale, eps=0.2, minPts = 4)
w.scale1 <- as.data.frame(cbind(w.scale, db.hous2$cluster))
names(w.scale1) <- c("lat", "lon", "cluster")
# use raw data (lat and long)
dbscan::kNNdistplot(w.raw, k=4)
db.hous3 <- dbscan::dbscan(x=w.raw, eps=0.02, minPts = 4)
summary(db.hous3)
## Length Class Mode
## cluster 471 -none- numeric
## eps 1 -none- numeric
## minPts 1 -none- numeric
names(db.hous3)
## [1] "cluster" "eps" "minPts"
w.scale2 <- as.data.frame(cbind(w.raw, db.hous3$cluster))
names(w.scale2) <- c("lat", "lon", "cluster")
w.scale2.original <- w.scale2
ggplot(w.scale2.original[w.scale2.original$cluster>0,]) + geom_point(aes(x=lat, y=lon, color=cluster)) + scale_color_gradientn(colours = rainbow(5))
ggplot(w.scale2.original) + geom_point(aes(x=lat, y=lon, color=cluster)) +
  scale_color_gradientn(colours = rainbow(14))
# re DBSCAN for points that are not in clusters
dim(w.scale2)
## [1] 471 3
dim(w.scale2[w.scale2$cluster==0,])
## [1] 58 3
w.scale2.unclu <- w.scale2[w.scale2$cluster==0, c("lat", "lon")]
db.hous2.unclu <- dbscan::dbscan(x=w.scale2.unclu, eps=0.02, minPts = 2)
db.hous2.unclu$cluster
## [1] 1 2 1 2 3 3 4 4 0 0 0 0 0 5 0 6 6 6 7 7 0 8 8
## [24] 0 9 9 5 5 0 0 0 10 11 11 10 8 0 0 0 0 12 12 6 0 13 13
## [47] 13 14 14 14 15 14 0 0 15 0 16 16
# bind w.scale2.unclu with the second-pass cluster labels
w.scale2.unclu2 <- as.data.frame(cbind(w.scale2.unclu, db.hous2.unclu$cluster))
names(w.scale2.unclu2) <- c("lat", "lon", "cluster")
w.scale2.unclu2$cluster <- w.scale2.unclu2$cluster + 21   # shift second-pass labels past the 21 first-pass clusters
w.scale2.unclu2$cluster[w.scale2.unclu2$cluster==21] <- 0 # points still unclustered (noise) remain labeled 0
w.scale2[w.scale2$cluster==0, c("cluster")] <- w.scale2.unclu2$cluster
table(w.scale2$cluster)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 19 115 56 15 29 6 17 39 14 34 4 5 5 14 8 4 4 15
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 4 14 4 7 2 2 2 2 3 4 2 3 2 2 2 2 3 4
## 36 37
## 2 2
houst.cluster <- ggplot(w.scale2) + geom_point(aes(x=lat, y=lon, color=cluster)) +
scale_color_gradientn(colours = rainbow(14))
ggplotly(houst.cluster)
save(hdata6, dat.1, w.scale2, file="cluster.Rdata")
# clustering done
opt.hous <- optics(x= w.scale, eps = 0.2, minPts = 5)
#opt.hous
opt.hous.extra <- extractDBSCAN(opt.hous, eps_cl = 0.2)
#plot(opt.hous.extra)
#hullplot(w.scale, opt.hous.extra)
# extraXi
xi.hous.extra <- extractXi(opt.hous, xi=0.05)
#xi.hous.extra
#plot(xi.hous.extra)
#hullplot(w.scale, xi.hous.extra )
The following is an exercise with various other clustering methods (FFR).
Assign observations into clusters. The objective function is the following:
\[ \min \; \sum_{j=1}^{K} \sum_{i=1}^{n} \left( X_{ij} - C_j \right)^2 \] where \(j=1, \cdots, K\) indexes cluster \(j\), \(i=1, \cdots, n\) indexes observation \(i\), \(X_{ij}\) is the characteristics of observation \(i\) in cluster \(j\), and \(C_j\) represents the center of cluster \(j\).
Selecting the key variables that describe the similarity between pairs of observations plays a decisive role in defining the clusters.
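As a quick illustration of this objective, the following sketch computes the total within-cluster sum of squares by hand and compares it with the tot.withinss value reported by kmeans(); the helper name wcss and the use of the built-in iris data are illustrative assumptions, not part of the hotel analysis.
# Sketch: total within-cluster sum of squares for a given assignment (illustrative helper)
wcss <- function(x, cluster, centers) {
  sum(sapply(seq_len(nrow(centers)), function(j) {
    d <- sweep(x[cluster == j, , drop = FALSE], 2, centers[j, ]) # deviations from the cluster center
    sum(d^2)
  }))
}
x.toy <- scale(as.matrix(iris[, 1:2])) # toy data, standardized as in the hotel analysis
km.toy <- kmeans(x.toy, centers = 3, nstart = 25)
wcss(x.toy, km.toy$cluster, km.toy$centers) # should match km.toy$tot.withinss
km.toy$tot.withinss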
library('class')
library('factoextra')
# W
# create matrix with variables used to calculate Euclidean distance
#
key.var.fld <- c("lat", "lon")
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon
## Min. :-2.2092 Min. :-2.21383
## 1st Qu.:-0.7312 1st Qu.:-0.88523
## Median :-0.1747 Median : 0.06784
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164
## Max. : 2.3375 Max. : 2.76351
distance.h <- dist(w.scale)
# visualization of W
fviz_dist(distance.h, show_labels = F, gradient = list(low = "red", mid = "white", high = "blue"))
# find out optimal K (# of clusters)
library('factoextra')
fviz_nbclust(x = w.scale, kmeans, method = "wss") # find out the optimal K (# of clusters)
Based on the above graph, the total within-cluster sum of squares (WCSS) is roughly constant after six clusters, so I use K = 6.
kmean.hou<- kmeans(w.scale, 6, nstart =25) # K = 6
fviz_cluster(kmean.hou, data = w.scale,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1,
ggtheme = theme_minimal())
However, if other variables are added to the measure of similarity between pairs of observations, the resulting clusters overlap when plotted on a map.
key.var.fld <- c("lat", "lon","adr") # key variables are xy coordinates and price(ADR)
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon adr
## Min. :-2.2092 Min. :-2.21383 Min. :-1.2852
## 1st Qu.:-0.7312 1st Qu.:-0.88523 1st Qu.:-0.7995
## Median :-0.1747 Median : 0.06784 Median :-0.3480
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164 3rd Qu.: 0.6536
## Max. : 2.3375 Max. : 2.76351 Max. : 5.2228
kmean.hou<- kmeans(w.scale, 6, nstart =25) # K = 6
fviz_cluster(kmean.hou, data = w.scale,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1,
ggtheme = theme_minimal())
Partitioning Around Medoids (PAM) is similar to K-means cluster analysis, but PAM relies on a medoid (an actual data point) in each cluster. K-means uses the centroid of the points within a cluster, while PAM uses a medoid, the observation that is most similar, on average, to the other points within its cluster.
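A minimal sketch of the medoid idea on simulated points rather than the hotel data: within a set of points, the medoid is the actual observation with the smallest total dissimilarity to all the others, whereas the centroid used by K-means is generally not an observation at all. The toy data and k=2 are illustrative assumptions.
library(cluster)
set.seed(2)
toy <- matrix(rnorm(40), ncol = 2) # 20 illustrative points
d.toy <- as.matrix(dist(toy))
toy[which.min(rowSums(d.toy)), ] # the medoid: an actual row of the data
colMeans(toy) # the centroid: usually not an actual observation
pam.toy <- pam(toy, k = 2) # PAM picks one medoid per cluster
pam.toy$medoids # medoids are rows taken from the input data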
library('cluster')
key.var.fld <- c("lat", "lon")
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon
## Min. :-2.2092 Min. :-2.21383
## 1st Qu.:-0.7312 1st Qu.:-0.88523
## Median :-0.1747 Median : 0.06784
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164
## Max. : 2.3375 Max. : 2.76351
pam.hou <- pam(w.scale, 6)
pam.hou$medoids
## lat lon
## [1,] -0.8017927 0.3045097
## [2,] -0.4110922 -0.1933747
## [3,] -1.1423417 1.4828063
## [4,] 1.3205145 0.4082601
## [5,] -0.5188778 -1.1371846
## [6,] 0.9939465 -0.9650722
fviz_cluster(pam.hou,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1,
ggtheme = theme_minimal())
key.var.fld <- c("lat", "lon", "adr") # key variables are xy coordinates and price(ADR)
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon adr
## Min. :-2.2092 Min. :-2.21383 Min. :-1.2852
## 1st Qu.:-0.7312 1st Qu.:-0.88523 1st Qu.:-0.7995
## Median :-0.1747 Median : 0.06784 Median :-0.3480
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164 3rd Qu.: 0.6536
## Max. : 2.3375 Max. : 2.76351 Max. : 5.2228
pam.hou <- pam(w.scale, 6)
pam.hou$medoids
## lat lon adr
## [1,] -0.3890006 -0.1478687 1.3402753
## [2,] -0.9628511 0.7413767 -0.6668548
## [3,] 0.3170396 0.8581243 -0.7808984
## [4,] -0.5195013 -1.0272327 -0.1392102
## [5,] 0.6517094 -0.6780180 -0.6853484
## [6,] 1.4711685 0.1480888 -0.3480333
fviz_cluster(pam.hou,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1,
ggtheme = theme_minimal())
In general, hierarchical cluster analysis groups points into clusters based on the similarity between pairs of observations. At the beginning, the analysis treats each observation as its own cluster and then repeatedly merges the clusters that are most similar.
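A small sketch of this agglomerative idea on simulated data (the toy points and k=3 are illustrative): hclust() records the sequence of merges, and cutree() stops the merging at a chosen number of clusters.
set.seed(3)
toy <- matrix(rnorm(20), ncol = 2) # 10 illustrative points
hc.toy <- hclust(dist(toy, method = "euclidean"), method = "ward.D2")
hc.toy$merge # merge history: from 10 singleton clusters down to one
cutree(hc.toy, k = 3) # cut the tree to keep 3 clusters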
key.var.fld <- c("lat", "lon")
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon
## Min. :-2.2092 Min. :-2.21383
## 1st Qu.:-0.7312 1st Qu.:-0.88523
## Median :-0.1747 Median : 0.06784
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164
## Max. : 2.3375 Max. : 2.76351
distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over the scaled variables
hc.hous <- hclust(d = distance.hous, method = "ward.D2")
fviz_dend(hc.hous, cex = 0.5)
# K = 6
# cut tree into 6
hc.hous.cut <- cutree(hc.hous, k = 6)
fviz_dend(hc.hous, k=6, cex = 0.5)
fviz_cluster(list(data= w.scale, cluster =hc.hous.cut), ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1
)
I then change the key variables to include price (ADR).
key.var.fld <- c("lat", "lon", "adr")
w.raw <- dat.1[, key.var.fld]
row.names(w.raw) <- c()
# normalization of w
mean.w <- apply(w.raw, 2, mean)
sd.w <- apply(w.raw, 2, sd)
w.scale <- scale(w.raw, mean.w, sd.w)
summary(w.scale)
## lat lon adr
## Min. :-2.2092 Min. :-2.21383 Min. :-1.2852
## 1st Qu.:-0.7312 1st Qu.:-0.88523 1st Qu.:-0.7995
## Median :-0.1747 Median : 0.06784 Median :-0.3480
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6883 3rd Qu.: 0.60164 3rd Qu.: 0.6536
## Max. : 2.3375 Max. : 2.76351 Max. : 5.2228
distance.hous <- dist(w.scale, method="euclidean") # Euclidean distance matrix over the scaled variables
hc.hous <- hclust(d = distance.hous, method = "ward.D2")
fviz_dend(hc.hous, cex = 0.5)
# K = 6
# cut tree into 6
hc.hous.cut <- cutree(hc.hous, k = 6)
fviz_dend(hc.hous, k=6, cex = 0.5)
fviz_cluster(list(data= w.scale, cluster =hc.hous.cut), ellipse.type = "convex",
palette = "jco",
repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1
)
library(mclust)
mclust.hous <- Mclust(w.scale)
summary(mclust.hous)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EVV (ellipsoidal, equal volume) model with 9 components:
##
## log-likelihood n df BIC ICL
## -1519.468 471 81 -3537.479 -3633.649
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9
## 67 40 44 61 38 92 50 25 54
library(factoextra)
fviz_mclust(mclust.hous, "classification", geom = "point",
pointsize = 1.5, palette = "jco")
fviz_mclust(mclust.hous, "uncertainty", palette = "jco")
library('factoextra')
hkmeans.hous <- hkmeans(w.scale, k=6)
fviz_dend(hkmeans.hous, cex = 0.6, palette = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
fviz_cluster(hkmeans.hous, palette = "jco", repel = TRUE, geom =c("point"),
ggtheme = theme_classic())