library(cluster)
library(factoextra)
library(class)
library(dplyr)
library(stargazer)
library(ggplot2)
library(gridExtra)
This project provides practical advices of how to use mcahine learning alogrithms to define markets or groups in which firms, or products are highly similar, and to evaluate the outcome of the ML analysis by using econometric models.
Among various machine learning (ML) alogrithms, I use clustering alogrithms, a unsupervised ML to group observations that are closed related.
The following clustering alogrthms are used: K-Means Clustering, Partitioning Around Medoids Clustering (PAM), Hierarchical Clustering, Hierarchical K-Means Clustering, and Density Based Spitial With Noise.
More details of each clustering alogrthm will be discussed later.
After conducting each clustering, we will have a cluster for each observation. Each cluster represents a market.
Given the estimated cluster, we will evaluate each clustering alogrithms(Post-Hoc Analysis). Rather than using crieteria from the literature of machine learning, we will use an econometric model to evaluate the outcome of the clustering alogrithms: distance metric approach (a coeffient of the weighted average prices of rivals) and discrete choice model(own/cross price elasticities in nested logit models).
To use these approach, we make sure that firms produce some levels of differentiated products, which is Bertrand competition with differentiated products. This should be added and fixed
For the distance metric approach, we assume that prices are a function of the rivals’ prices, own product characteristics and market characteristics. Using Pinkse et.al (2002), we reduce the number of coefficeints to one from n coefficients (for n firms in the market), the one of the weighted average prices of rivals in the market (wp, weighted by distances between firms. The distance meausre is Euclidean distance. Depending on the researcher’s choics, the number of factors to calculate the distance varies. Here, we use the xy coordinates.). We will estimate this coefficient from all the clustering alogrithms.
The following analysis are done with this example.
The example of discrete choice models will be conducted later.
# sort data.frame
data.houston <- data.houston[order(data.houston$qtr, data.houston$hotel),]
# W: a matrix of sdfs
factor.fld <- c("lon", "lat")
cl.kmeans <- list()
w.list <- list()
ncl.plot.list <- list()
kcl.plot.list <- list()
for (t in 1:4){
w.raw <- data.houston[ data.houston$qtr==t, factor.fld]
row.names(w.raw) <- c()
w.scale <- scale(w.raw)
# summary(w.scale)
dist.w <- dist(w.scale)
# find out the optimal K (# of clusters)
ncluster <- fviz_nbclust(x = w.scale, kmeans, method = "wss") +
geom_vline(xintercept = 6, linetype = 2) +
labs(title = paste0("Optimal Clusters in 2014Q",t))
# print(ncluster)
kmeans.cl <- kmeans(w.scale, 6, nstart =25) # K = 6
kcluster <- fviz_cluster(kmeans.cl, data = w.scale,
ellipse.type = "convex",palette = "jco",
repel = TRUE,geom = c("point"),labelsize = 5,
pointsize = 1,ggtheme = theme_minimal()) +
labs(title=paste0("K Means: Cluster Plot in 2014Q", t))
# print(kcluster)
cl.kmeans[[t]] <- kmeans.cl
w.list[[t]] <- w.scale
ncl.plot.list[[t]] <- ncluster
kcl.plot.list[[t]] <- kcluster
}
do.call(grid.arrange, c(ncl.plot.list, ncol=2))
do.call(grid.arrange, c(kcl.plot.list, ncol=2))
# cl.kmeans[[1]]$cluster
kcl.list <- lapply(cl.kmeans, function(x) x$cluster)
# unlist(kcl.list)
# all combined
w.raw <- data.houston[,factor.fld]
row.names(w.raw) <- c()
w.scale <- scale(w.raw)
# summary(w.scale)
dist.w <- dist(w.scale)
# find out the optimal K (# of clusters)
ncluster <- fviz_nbclust(x = w.scale, kmeans, method = "wss") +
geom_vline(xintercept = 6, linetype = 2) +
labs(title = paste0("Optimal Clusters in 2014Q"))
# print(ncluster)
kmeans.cl <- kmeans(w.scale, 6, nstart =25) # K = 6
kcluster <- fviz_cluster(kmeans.cl, data = w.scale,
ellipse.type = "convex",palette = "jco",
repel = TRUE,geom = c("point"),labelsize = 5,
pointsize = 1,ggtheme = theme_minimal()) +
labs(title=paste0("K Means: Cluster Plot in 2014"))
ncluster
kcluster
data.houston$km.cluster <- kmeans.cl$cluster
get.wp.cl <- function(mkt.id.fld, price.fld, q.fld, dat,...){
# objective: get weighted price within a cluster
# input:1) mkt.id.fld (market id variable),
# 2) factor.fld (factors that used to calculate distances between
# points)
# 3) price.fld (price variable)
# 4) q.fld (quantity variable)
# 4) dat (data frames including all variables defined above)
# output: weighted prices(a weighted average of prices with the distance, for each observation)
# loading packages
require(dplyr)
# mkt.id fld
dat <- dat[order(dat[,mkt.id.fld]),]
if (length(mkt.id.fld)>1){
mkt.id <- cumsum(!duplicated(dat[, mkt.id.fld]))
} else {
mkt.id <- dat[, mkt.id.fld, drop=F ]
}
dat <- dat[order(mkt.id),]
price <- dat[, price.fld, drop=F]
q <- dat[, q.fld, drop=F]
sales <- price * q
factor <- dat[, factor.fld, drop=F]
# loop dist
wp.list <- list()
hi.list <- list()
for(m in 1:max(mkt.id)){
factor.m <- factor[mkt.id==m,]
price.m <- as.matrix(price[mkt.id==m, ,drop=F])
sales.m <- as.matrix(sales[mkt.id==m, ,drop=F])
hi.m <- rep(sum((sales.m/sum(sales.m) * 100)^2)/10000, nrow(factor.m))
dist.m <- as.matrix(dist(factor.m, diag=TRUE, upper =TRUE,
method="euclidean"))
inv.dist.m <- 1/dist.m
diag(inv.dist.m) <- 0
inv.dist.m[is.infinite(inv.dist.m)]<-0
wp.mat <- 1/nrow(factor.m) * inv.dist.m %*% price.m
wp.list[[m]] <- wp.mat
hi.list[[m]] <- hi.m
}
wp <- unlist(wp.list)
wp <- as.matrix(wp)
hi <- as.matrix(unlist(hi.list))
dat$wp <- wp
dat$hi <- hi
return(list(data=dat, wp=wp, hi=hi))
}
table(data.houston$km.cluster)
##
## 1 2 3 4 5 6
## 274 292 287 258 436 333
# create mkt.id
data.houston$mkt.id.km <- data.houston %>% group_indices(qtr, km.cluster)
# table(data.houston$mkt.id.km)
# order data based on "mkt.id.km"
wp.km <- get.wp.cl(mkt.id.fld = "mkt.id.km", price.fld="adr",
q.fld = "room_sold", dat = data.houston)
ols.wp.km <- lm(adr ~ wp + rating + room + hi +factor(qtr), data=wp.km$data)
stargazer(ols.wp.km, omit = "qtr", type="text",
add.lines = list(c("QTR Fixed Effect", "YES")),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
##
## ============================================
## Dependent variable:
## ---------------------------
## adr
## --------------------------------------------
## wp 0.0002***
## (0.0001)
## rating 18.986***
## (0.618)
## room 0.075***
## (0.009)
## hi 23.002
## (41.848)
## Constant 40.924***
## (2.595)
## --------------------------------------------
## QTR Fixed Effect YES
## Observations 1,880
## R2 0.527
## Adjusted R2 0.525
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Partitioning Around Medoids is similar to K-mean cluster analysis, but PAM replys on a medoids(a point) in cluster. K-mean cluster uses a centroid among points within a cluster, while PAM use an actual meiod, represents an average similarity among points within a cluster.
library(cluster)
fviz_nbclust(w.scale, pam, method="silhouette")
cluster.pam <- pam(w.scale, 10, metric="euclidean")
pam.cl.plot <- fviz_cluster(cluster.pam,ellipse.type = "convex",
palette = "jco",repel = TRUE,
geom = c("point"),
labelsize = 5, pointsize = 1,
ggtheme = theme_minimal()) +
labs(title = "PAM: Cluster Plot")
data.houston$cl.pam <- cluster.pam$clustering
# wp in pam
## create mkt.id
data.houston$mkt.id.pam <- data.houston %>% group_indices(qtr, cl.pam)
table(data.houston$mkt.id.pam)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 20 43 53 55 54 50 78 35 42 41 19 43 52 55 54 50 79 34 41 40 19 44 53 56 53 50
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 77 34 42 41 19 44 54 56 53 50 76 34 45 42
pam.wp <- get.wp.cl(mkt.id.fld = "mkt.id.pam", price.fld ="adr",
q.fld ="room_sold", dat = data.houston)
ols.wp.pam <- lm(adr ~ wp + rating + room + hi +factor(qtr),
data=pam.wp$data)
stargazer(ols.wp.pam, omit = "qtr", type="text",
add.lines = list(c("QTR Fixed Effect", "YES")),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
##
## ============================================
## Dependent variable:
## ---------------------------
## adr
## --------------------------------------------
## wp 0.0002***
## (0.00004)
## rating 19.518***
## (0.623)
## room 0.073***
## (0.009)
## hi 183.366***
## (41.946)
## Constant 30.447***
## (3.241)
## --------------------------------------------
## QTR Fixed Effect YES
## Observations 1,880
## R2 0.532
## Adjusted R2 0.530
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Caluate distance
dist.hous <- dist(w.scale, method="euclidean")
hc.cl <- hclust(d=dist.hous, method="ward.D2")
fviz_dend(hc.cl, cex=0.5)
fviz_dend(hc.cl, k=6, cex=0.5)
hc.cl6 <- cutree(hc.cl, k=6)
hc.cl.plot <- fviz_cluster(list(data=w.scale, cluster =hc.cl6),
ellipse.type = "convex",
palette ="jco", repel = TRUE, geom = c("point"),
labelsize = 5, pointsize = 1) +
labs(title="Hierarchical Cluster")
hc.cl.plot
# wp
data.houston$cluster.hc <- hc.cl6
data.houston$mkt.id.hc <- data.houston %>% group_indices(qtr, cluster.hc)
hc.wp <- get.wp.cl(mkt.id.fld = "mkt.id.hc", price.fld ="adr",
q.fld ="room_sold", dat = data.houston)
ols.wp.hc <- lm(adr ~ wp + rating + room + hi +factor(qtr),
data=hc.wp$data)
stargazer(ols.wp.hc, omit = "qtr", type="text",
add.lines = list(c("QTR Fixed Effect", "YES")),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
##
## ============================================
## Dependent variable:
## ---------------------------
## adr
## --------------------------------------------
## wp 0.0002***
## (0.0001)
## rating 18.923***
## (0.586)
## room 0.065***
## (0.008)
## hi -948.804***
## (72.174)
## Constant 73.756***
## (3.034)
## --------------------------------------------
## QTR Fixed Effect YES
## Observations 1,880
## R2 0.565
## Adjusted R2 0.563
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
hk.cl <- hkmeans(w.scale, k=6)
hk.cl.deno.plot <- fviz_dend(hk.cl)
hk.cl.plot <- fviz_cluster(hk.cl, palette = "jco", repel = TRUE,
geom =c("point"),ggtheme = theme_classic()) +
labs(title="Hierarchical K-mean cluster")
hk.cl.deno.plot
hk.cl.plot
# create cluster under the HK cluster
data.houston$cluster.hk <- hk.cl$cluster
data.houston$mkt.id.hk <- data.houston %>% group_indices(qtr, cluster.hk)
hk.wp <- get.wp.cl(mkt.id.fld = "mkt.id.hk", price.fld ="adr",
q.fld ="room_sold", dat = data.houston)
ols.wp.hk <- lm(adr ~ wp + rating + room + hi +factor(qtr),
data=hk.wp$data)
stargazer(ols.wp.hk, omit = "qtr", type="text",
add.lines = list(c("QTR Fixed Effect", "YES")),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
##
## ============================================
## Dependent variable:
## ---------------------------
## adr
## --------------------------------------------
## wp 0.0002***
## (0.0001)
## rating 18.889***
## (0.620)
## room 0.076***
## (0.009)
## hi -24.914
## (36.401)
## Constant 43.019***
## (2.507)
## --------------------------------------------
## QTR Fixed Effect YES
## Observations 1,880
## R2 0.526
## Adjusted R2 0.524
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
library(dbscan)
library(ggalt)
nn.plot <- kNNdistplot(dist.hous, k=4)
w.scale1 <- w.scale[data.houston$qtr==1,]
db.cl <- dbscan(w.scale1, eps=0.2, minPts = 4)
# db.cl.plot <- fviz_cluster(db.cl, data=w.scale, stand = FALSE,
# ellipse = FALSE, show.clust.cent = FALSE,
# geom = "point",palette = "jco",
# ggtheme =theme_classic())
db.cl2 <- as.data.frame(cbind(w.scale1, db.cl$cluster))
colnames(db.cl2) <- c(colnames(w.scale1), "cluster.db")
ggplot(db.cl2[db.cl2$cluster.db>0,], aes(x=lon, y=lat)) +
geom_point(aes(color=factor(cluster.db))) +
geom_encircle(aes(fill=factor(cluster.db)), s_shape=1, expand=0,
alpha =0.2, show.legend = FALSE) +
labs(title ="DBSCAN Cluster (2014Q1)", color="Cluster")
# dbscan
data.dbscan <- data.houston[data.houston$cluster>0,] # drop noise points
data.dbscan$mkt.id <- data.dbscan %>% group_indices(qtr, cluster)
data.dbscan <- data.dbscan[order(data.dbscan$mkt.id),]
dbscan.cl <-get.wp.cl(mkt.id.fld = c("mkt.id"),
price.fld =c("adr"), q.fld = c("room_sold"),
dat = data.dbscan)
ols.dbscan <- lm(adr ~ wp + rating + room + hi + factor(qtr),
data = dbscan.cl$data)
stargazer(ols.dbscan, type="text",
omit = "qtr", add.lines = c("QTR Fixed Effect", "YES"),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
##
## ============================================
## Dependent variable:
## ---------------------------
## adr
## --------------------------------------------
## wp 0.00005***
## (0.00001)
## rating 19.067***
## (0.653)
## room 0.063***
## (0.009)
## hi -127.945***
## (14.130)
## Constant 55.745***
## (2.487)
## --------------------------------------------
## QTR Fixed Effect
## YES
## Observations 1,647
## R2 0.535
## Adjusted R2 0.533
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The following table summerizes all estimated results of the distance metric approach. Coefficients of the weighted average of price
stargazer(ols.wp.km, ols.wp.pam, ols.wp.hc, ols.wp.hk, ols.dbscan,
type="html", omit = "qtr",
add.lines = list(c("QTR Fixed Effect", rep("YES", 5))),
column.labels = c("K-Means", "PAM", "HC", "H-K", "DBSCAN"),
no.space = TRUE, suppress.errors = TRUE,
omit.stat=c("LL","ser","f"))
Dependent variable: | |||||
adr | |||||
K-Means | PAM | HC | H-K | DBSCAN | |
(1) | (2) | (3) | (4) | (5) | |
wp | 0.0002*** | 0.0002*** | 0.0002*** | 0.0002*** | 0.00005*** |
(0.0001) | (0.00004) | (0.0001) | (0.0001) | (0.00001) | |
rating | 18.986*** | 19.518*** | 18.923*** | 18.889*** | 19.067*** |
(0.618) | (0.623) | (0.586) | (0.620) | (0.653) | |
room | 0.075*** | 0.073*** | 0.065*** | 0.076*** | 0.063*** |
(0.009) | (0.009) | (0.008) | (0.009) | (0.009) | |
hi | 23.002 | 183.366*** | -948.804*** | -24.914 | -127.945*** |
(41.848) | (41.946) | (72.174) | (36.401) | (14.130) | |
Constant | 40.924*** | 30.447*** | 73.756*** | 43.019*** | 55.745*** |
(2.595) | (3.241) | (3.034) | (2.507) | (2.487) | |
QTR Fixed Effect | YES | YES | YES | YES | YES |
Observations | 1,880 | 1,880 | 1,880 | 1,880 | 1,647 |
R2 | 0.527 | 0.532 | 0.565 | 0.526 | 0.535 |
Adjusted R2 | 0.525 | 0.530 | 0.563 | 0.524 | 0.533 |
Note: | p<0.1; p<0.05; p<0.01 |