library(cluster)
library(factoextra)
library(class)
library(dplyr)
library(stargazer)
library(ggplot2)
library(gridExtra)

Objective / Introduction

This project provides practical advices of how to use mcahine learning alogrithms to define markets or groups in which firms, or products are highly similar, and to evaluate the outcome of the ML analysis by using econometric models.

Among various machine learning (ML) alogrithms, I use clustering alogrithms, a unsupervised ML to group observations that are closed related.

The following clustering alogrthms are used: K-Means Clustering, Partitioning Around Medoids Clustering (PAM), Hierarchical Clustering, Hierarchical K-Means Clustering, and Density Based Spitial With Noise.

More details of each clustering alogrthm will be discussed later.

After conducting each clustering, we will have a cluster for each observation. Each cluster represents a market.

Given the estimated cluster, we will evaluate each clustering alogrithms(Post-Hoc Analysis). Rather than using crieteria from the literature of machine learning, we will use an econometric model to evaluate the outcome of the clustering alogrithms: distance metric approach (a coeffient of the weighted average prices of rivals) and discrete choice model(own/cross price elasticities in nested logit models).

To use these approach, we make sure that firms produce some levels of differentiated products, which is Bertrand competition with differentiated products. This should be added and fixed

For the distance metric approach, we assume that prices are a function of the rivals’ prices, own product characteristics and market characteristics. Using Pinkse et.al (2002), we reduce the number of coefficeints to one from n coefficients (for n firms in the market), the one of the weighted average prices of rivals in the market (wp, weighted by distances between firms. The distance meausre is Euclidean distance. Depending on the researcher’s choics, the number of factors to calculate the distance varies. Here, we use the xy coordinates.). We will estimate this coefficient from all the clustering alogrithms.

Example of the use of the clustering alogorithms to define markets of hotels in Houston TX.

The following analysis are done with this example.

Example of the use of the clustering alogorithms to define nests in cereal industry.

The example of discrete choice models will be conducted later.

Cluster Analysis

K-Mean Clustering

# sort data.frame 
data.houston <- data.houston[order(data.houston$qtr, data.houston$hotel),]

# W: a matrix of sdfs
factor.fld <- c("lon", "lat")

cl.kmeans <- list()
w.list <- list()
ncl.plot.list <- list()
kcl.plot.list <- list()
for (t in 1:4){
  w.raw <- data.houston[ data.houston$qtr==t, factor.fld]
  row.names(w.raw) <- c()
  w.scale <- scale(w.raw)
  # summary(w.scale)
  dist.w <- dist(w.scale)
  # find out the optimal K (# of clusters)
  ncluster <- fviz_nbclust(x = w.scale, kmeans, method = "wss") + 
                geom_vline(xintercept = 6, linetype = 2) + 
                labs(title = paste0("Optimal Clusters in 2014Q",t))
  
  # print(ncluster)
  
  kmeans.cl <- kmeans(w.scale, 6, nstart =25) # K = 6 
  kcluster <- fviz_cluster(kmeans.cl, data = w.scale,
                           ellipse.type = "convex",palette = "jco",
                           repel = TRUE,geom = c("point"),labelsize = 5,
                           pointsize = 1,ggtheme = theme_minimal()) +
                labs(title=paste0("K Means: Cluster Plot in 2014Q", t))
  # print(kcluster)
  cl.kmeans[[t]] <- kmeans.cl
  w.list[[t]] <- w.scale
  ncl.plot.list[[t]] <- ncluster
  kcl.plot.list[[t]] <- kcluster
}

do.call(grid.arrange, c(ncl.plot.list, ncol=2))

do.call(grid.arrange, c(kcl.plot.list, ncol=2))

# cl.kmeans[[1]]$cluster
kcl.list <- lapply(cl.kmeans, function(x) x$cluster)
# unlist(kcl.list)

# all combined
w.raw <- data.houston[,factor.fld]
row.names(w.raw) <- c()
w.scale <- scale(w.raw)
# summary(w.scale)
dist.w <- dist(w.scale)
# find out the optimal K (# of clusters)
ncluster <- fviz_nbclust(x = w.scale, kmeans, method = "wss") + 
              geom_vline(xintercept = 6, linetype = 2) + 
              labs(title = paste0("Optimal Clusters in 2014Q"))

# print(ncluster)

kmeans.cl <- kmeans(w.scale, 6, nstart =25) # K = 6 
kcluster <- fviz_cluster(kmeans.cl, data = w.scale,
                         ellipse.type = "convex",palette = "jco",
                         repel = TRUE,geom = c("point"),labelsize = 5,
                         pointsize = 1,ggtheme = theme_minimal()) +
              labs(title=paste0("K Means: Cluster Plot in 2014"))

ncluster

kcluster

data.houston$km.cluster <-  kmeans.cl$cluster

get.wp.cl <- function(mkt.id.fld, price.fld, q.fld, dat,...){
  # objective: get weighted price within a cluster
  # input:1) mkt.id.fld (market id variable),  
  #       2) factor.fld (factors that used to calculate distances between
  #                      points) 
  #       3) price.fld (price variable)
  #       4) q.fld (quantity variable)
  #       4) dat (data frames including all variables defined above)
  # output: weighted prices(a weighted average of prices with the distance, for each observation) 
  
  
  # loading packages
  require(dplyr)

  # mkt.id fld
  dat <- dat[order(dat[,mkt.id.fld]),]
  if (length(mkt.id.fld)>1){
    mkt.id <- cumsum(!duplicated(dat[, mkt.id.fld]))
  } else {
    mkt.id <- dat[, mkt.id.fld, drop=F ] 
  }
  
  dat <- dat[order(mkt.id),]
  price <-  dat[, price.fld, drop=F]
  q <- dat[, q.fld, drop=F]
  sales <- price * q
  factor <- dat[, factor.fld, drop=F]
  
  
  # loop dist 
  wp.list <- list()
  hi.list <- list()
  for(m in 1:max(mkt.id)){
    factor.m <- factor[mkt.id==m,]
    price.m <- as.matrix(price[mkt.id==m, ,drop=F])
    sales.m <- as.matrix(sales[mkt.id==m, ,drop=F])
    hi.m <- rep(sum((sales.m/sum(sales.m) * 100)^2)/10000, nrow(factor.m))
    dist.m <- as.matrix(dist(factor.m, diag=TRUE, upper =TRUE,
                             method="euclidean"))
    inv.dist.m <- 1/dist.m
    diag(inv.dist.m) <- 0
    inv.dist.m[is.infinite(inv.dist.m)]<-0
    wp.mat <- 1/nrow(factor.m) * inv.dist.m %*% price.m
    wp.list[[m]] <- wp.mat
    hi.list[[m]] <- hi.m
  }
  
  wp <- unlist(wp.list)
  wp <- as.matrix(wp)
  hi <- as.matrix(unlist(hi.list))
  dat$wp <- wp
  dat$hi <- hi
return(list(data=dat, wp=wp, hi=hi))
}

table(data.houston$km.cluster)
## 
##   1   2   3   4   5   6 
## 274 292 287 258 436 333
# create mkt.id 
data.houston$mkt.id.km <- data.houston %>% group_indices(qtr, km.cluster)
# table(data.houston$mkt.id.km)
# order data based on "mkt.id.km"

wp.km <- get.wp.cl(mkt.id.fld = "mkt.id.km", price.fld="adr", 
                   q.fld = "room_sold", dat = data.houston)

ols.wp.km <- lm(adr ~ wp + rating + room + hi +factor(qtr), data=wp.km$data)

stargazer(ols.wp.km, omit = "qtr", type="text",
          add.lines = list(c("QTR Fixed Effect", "YES")),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
## 
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                              adr            
## --------------------------------------------
## wp                        0.0002***         
##                           (0.0001)          
## rating                    18.986***         
##                            (0.618)          
## room                      0.075***          
##                            (0.009)          
## hi                         23.002           
##                           (41.848)          
## Constant                  40.924***         
##                            (2.595)          
## --------------------------------------------
## QTR Fixed Effect             YES            
## Observations                1,880           
## R2                          0.527           
## Adjusted R2                 0.525           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

Partitioning Around Medoids(PAM) Cluster Analysis

Partitioning Around Medoids is similar to K-mean cluster analysis, but PAM replys on a medoids(a point) in cluster. K-mean cluster uses a centroid among points within a cluster, while PAM use an actual meiod, represents an average similarity among points within a cluster.

library(cluster)
fviz_nbclust(w.scale, pam, method="silhouette")

cluster.pam <- pam(w.scale, 10, metric="euclidean")
pam.cl.plot <- fviz_cluster(cluster.pam,ellipse.type = "convex",
                            palette = "jco",repel = TRUE,
                            geom = c("point"),
                            labelsize = 5, pointsize = 1,
                            ggtheme = theme_minimal()) +
                  labs(title = "PAM: Cluster Plot")


data.houston$cl.pam <- cluster.pam$clustering

# wp in pam
## create mkt.id 
data.houston$mkt.id.pam <- data.houston %>% group_indices(qtr, cl.pam)

table(data.houston$mkt.id.pam)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 20 43 53 55 54 50 78 35 42 41 19 43 52 55 54 50 79 34 41 40 19 44 53 56 53 50 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 
## 77 34 42 41 19 44 54 56 53 50 76 34 45 42
pam.wp <- get.wp.cl(mkt.id.fld = "mkt.id.pam", price.fld ="adr", 
                    q.fld ="room_sold", dat = data.houston)

ols.wp.pam <- lm(adr ~ wp + rating + room + hi +factor(qtr),
                 data=pam.wp$data)

stargazer(ols.wp.pam, omit = "qtr", type="text",
          add.lines = list(c("QTR Fixed Effect", "YES")),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
## 
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                              adr            
## --------------------------------------------
## wp                        0.0002***         
##                           (0.00004)         
## rating                    19.518***         
##                            (0.623)          
## room                      0.073***          
##                            (0.009)          
## hi                       183.366***         
##                           (41.946)          
## Constant                  30.447***         
##                            (3.241)          
## --------------------------------------------
## QTR Fixed Effect             YES            
## Observations                1,880           
## R2                          0.532           
## Adjusted R2                 0.530           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

Hierical Clustering

# Caluate distance 
dist.hous <- dist(w.scale, method="euclidean")
hc.cl <- hclust(d=dist.hous, method="ward.D2")
fviz_dend(hc.cl, cex=0.5)

fviz_dend(hc.cl, k=6, cex=0.5)

hc.cl6 <- cutree(hc.cl, k=6)
hc.cl.plot <- fviz_cluster(list(data=w.scale, cluster =hc.cl6), 
                           ellipse.type = "convex", 
                           palette ="jco", repel = TRUE, geom = c("point"), 
                           labelsize = 5, pointsize = 1) + 
                labs(title="Hierarchical Cluster")

hc.cl.plot

# wp 
data.houston$cluster.hc <- hc.cl6
data.houston$mkt.id.hc <- data.houston %>% group_indices(qtr, cluster.hc)

hc.wp <- get.wp.cl(mkt.id.fld = "mkt.id.hc", price.fld ="adr", 
                    q.fld ="room_sold", dat = data.houston)

ols.wp.hc <- lm(adr ~ wp + rating + room + hi +factor(qtr),
                 data=hc.wp$data)

stargazer(ols.wp.hc, omit = "qtr", type="text",
          add.lines = list(c("QTR Fixed Effect", "YES")),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
## 
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                              adr            
## --------------------------------------------
## wp                        0.0002***         
##                           (0.0001)          
## rating                    18.923***         
##                            (0.586)          
## room                      0.065***          
##                            (0.008)          
## hi                       -948.804***        
##                           (72.174)          
## Constant                  73.756***         
##                            (3.034)          
## --------------------------------------------
## QTR Fixed Effect             YES            
## Observations                1,880           
## R2                          0.565           
## Adjusted R2                 0.563           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

Hierarchical K-mean cluster

hk.cl <- hkmeans(w.scale, k=6)
hk.cl.deno.plot <- fviz_dend(hk.cl)
hk.cl.plot <- fviz_cluster(hk.cl, palette = "jco", repel = TRUE, 
                           geom =c("point"),ggtheme = theme_classic()) + 
                labs(title="Hierarchical K-mean cluster")
hk.cl.deno.plot

hk.cl.plot

# create cluster under the HK cluster
data.houston$cluster.hk <- hk.cl$cluster

data.houston$mkt.id.hk <- data.houston %>% group_indices(qtr, cluster.hk)

hk.wp <- get.wp.cl(mkt.id.fld = "mkt.id.hk", price.fld ="adr", 
                    q.fld ="room_sold", dat = data.houston)

ols.wp.hk <- lm(adr ~ wp + rating + room + hi +factor(qtr),
                 data=hk.wp$data)

stargazer(ols.wp.hk, omit = "qtr", type="text",
          add.lines = list(c("QTR Fixed Effect", "YES")),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
## 
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                              adr            
## --------------------------------------------
## wp                        0.0002***         
##                           (0.0001)          
## rating                    18.889***         
##                            (0.620)          
## room                      0.076***          
##                            (0.009)          
## hi                         -24.914          
##                           (36.401)          
## Constant                  43.019***         
##                            (2.507)          
## --------------------------------------------
## QTR Fixed Effect             YES            
## Observations                1,880           
## R2                          0.526           
## Adjusted R2                 0.524           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

Density Based Spital Clustering With Noises (DBSCAN)

library(dbscan)
library(ggalt)

nn.plot <- kNNdistplot(dist.hous, k=4)

w.scale1 <- w.scale[data.houston$qtr==1,]
db.cl <- dbscan(w.scale1, eps=0.2, minPts = 4)
# db.cl.plot <- fviz_cluster(db.cl, data=w.scale, stand = FALSE,
#                            ellipse = FALSE, show.clust.cent = FALSE,
#                            geom = "point",palette = "jco", 
#                            ggtheme =theme_classic())


db.cl2 <- as.data.frame(cbind(w.scale1, db.cl$cluster))
colnames(db.cl2) <- c(colnames(w.scale1), "cluster.db")

ggplot(db.cl2[db.cl2$cluster.db>0,], aes(x=lon, y=lat)) + 
  geom_point(aes(color=factor(cluster.db))) +
  geom_encircle(aes(fill=factor(cluster.db)), s_shape=1, expand=0, 
                alpha =0.2, show.legend = FALSE) +
  labs(title ="DBSCAN Cluster (2014Q1)", color="Cluster")

# dbscan
data.dbscan <- data.houston[data.houston$cluster>0,] # drop noise points
data.dbscan$mkt.id <- data.dbscan %>% group_indices(qtr, cluster)
data.dbscan <- data.dbscan[order(data.dbscan$mkt.id),]
dbscan.cl <-get.wp.cl(mkt.id.fld = c("mkt.id"), 
                      price.fld =c("adr"), q.fld = c("room_sold"), 
                      dat = data.dbscan)

ols.dbscan <- lm(adr ~ wp + rating + room + hi + factor(qtr), 
                 data = dbscan.cl$data)
stargazer(ols.dbscan, type="text", 
          omit = "qtr", add.lines = c("QTR Fixed Effect", "YES"),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
## 
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                              adr            
## --------------------------------------------
## wp                       0.00005***         
##                           (0.00001)         
## rating                    19.067***         
##                            (0.653)          
## room                      0.063***          
##                            (0.009)          
## hi                       -127.945***        
##                           (14.130)          
## Constant                  55.745***         
##                            (2.487)          
## --------------------------------------------
## QTR Fixed Effect                            
## YES                                         
## Observations                1,647           
## R2                          0.535           
## Adjusted R2                 0.533           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

Summary of Post Hoc Analysis of All Clustering Algorithms

The following table summerizes all estimated results of the distance metric approach. Coefficients of the weighted average of price

stargazer(ols.wp.km, ols.wp.pam, ols.wp.hc, ols.wp.hk, ols.dbscan,
          type="html", omit = "qtr", 
          add.lines = list(c("QTR Fixed Effect", rep("YES", 5))),
          column.labels = c("K-Means", "PAM", "HC", "H-K", "DBSCAN"),
          no.space = TRUE, suppress.errors = TRUE, 
          omit.stat=c("LL","ser","f"))
Dependent variable:
adr
K-Means PAM HC H-K DBSCAN
(1) (2) (3) (4) (5)
wp 0.0002*** 0.0002*** 0.0002*** 0.0002*** 0.00005***
(0.0001) (0.00004) (0.0001) (0.0001) (0.00001)
rating 18.986*** 19.518*** 18.923*** 18.889*** 19.067***
(0.618) (0.623) (0.586) (0.620) (0.653)
room 0.075*** 0.073*** 0.065*** 0.076*** 0.063***
(0.009) (0.009) (0.008) (0.009) (0.009)
hi 23.002 183.366*** -948.804*** -24.914 -127.945***
(41.848) (41.946) (72.174) (36.401) (14.130)
Constant 40.924*** 30.447*** 73.756*** 43.019*** 55.745***
(2.595) (3.241) (3.034) (2.507) (2.487)
QTR Fixed Effect YES YES YES YES YES
Observations 1,880 1,880 1,880 1,880 1,647
R2 0.527 0.532 0.565 0.526 0.535
Adjusted R2 0.525 0.530 0.563 0.524 0.533
Note: p<0.1; p<0.05; p<0.01