Implementation:
# entropy measure H_p
calc_h_p <- function(t, N) {
x <- diff(c(0, t, N+1)) / (N+1)
1 + sum(log(x)*x) / log(length(t)+1)
}
# calculate z-value for given n/N
calc_z <- function(n, N) {
unname(quantile(replicate(100000, calc_h_p(sort(sample(N, n)), N)), 0.95))
}
Validating implementation with published example users:
N <- 30
custs <- list(A = c(1, 6, 11, 16, 21, 26),
B = c(2, 3, 4, 27, 28, 29),
C = c(13, 14, 15, 16, 17, 18))
round(sapply(custs, function(t) calc_h_p(t, N)), 3) # -> matches results from paper, i.e. 0.036, 0.48, 0.34
## A B C
## 0.036 0.477 0.341
round(calc_z(n=6, N=30), 3) # should be: 0.257 -> slightly off
## [1] 0.257
Load CDNOW elog data:
library(BTYD, quietly = TRUE)
elog <- dc.ReadLines(system.file("data/cdnowElog.csv", package="BTYD"),1,3)
## Started reading file. Progress:
## 1000/6919
## 2000/6919
## 3000/6919
## 4000/6919
## 5000/6919
## 6000/6919
## File successfully read.
elog$date <- as.Date(elog$date, "%Y%m%d")
elog$t <- 1 + as.numeric(elog$date - min(elog$date))
head(elog)
## cust date t
## 1 4 1997-01-01 1
## 2 4 1997-01-18 18
## 3 4 1997-08-02 214
## 4 4 1997-12-12 346
## 5 21 1997-01-01 1
## 6 21 1997-01-13 13
# merge same day entries (~3%)
elog <- unique(elog[, c("cust", "date", "t")])
# determine range of observation period
(N <- max(elog$t))
## [1] 546
# remove first purchase entries for each customer
# Note: this data pre-processing step was done for the published paper,
# and significantly impacts estimated share of clumpy users
library(plyr, quietly = TRUE)
elog <- ddply(elog, .(cust), transform, is_first = t==min(t))
elog <- elog[!elog$is_first,]
Calculate clumpiness measure for each customer:
data <- ddply(elog, .(cust), summarize,
n = length(t),
h_p = calc_h_p(t, N),
first = min(t))
head(data)
## cust n h_p first
## 1 10048 2 0.18012 44
## 2 10061 11 0.12978 84
## 3 10064 2 0.05689 114
## 4 10081 1 0.14247 393
## 5 10085 10 0.08686 57
## 6 10096 3 0.48347 374
# calculate Z values
n_values <- unique(data$n)
Z <- data.frame(n=n_values, z=sapply(n_values, function(n) calc_z(n, N)))
data <- merge(data, Z, by="n", all.x=T)
# check C measure against Z value
data$is_clumpy <- data$h_p > data$z
# share of clumpy buyers
mean(data[, "is_clumpy"]) # -> should be ~8%
## [1] 0.09131
# share of clumpiness by frequency
mean(data[data$n==1, "is_clumpy"])
## [1] 0.01737
mean(data[data$n>1, "is_clumpy"])
## [1] 0.1318