Normalized Clumpiness Measure C
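
For n purchase events at times t_1 < ... < t_n within an observation window of days 1..N, the entropy-based clumpiness measure implemented below is

H_p = 1 + sum(x_i * log(x_i)) / log(n+1), i = 1, ..., n+1,

where x_i = (t_i - t_{i-1}) / (N+1) are the scaled inter-event times, with boundary conventions t_0 = 0 and t_{n+1} = N+1. H_p is 0 for perfectly evenly spaced events and approaches 1 the more the events cluster together.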

Implementation:

# entropy measure H_p for event times t within days 1..N
calc_h_p <- function(t, N) {
  # scaled inter-event times, incl. the gaps to the start (t_0 = 0) and end (t_{n+1} = N+1)
  x <- diff(c(0, t, N+1)) / (N+1)
  1 + sum(log(x)*x) / log(length(t)+1)
}

# 95% quantile of H_p for n uniformly random event times within 1..N,
# estimated via Monte Carlo simulation with 100,000 replications
calc_z <- function(n, N) {
  unname(quantile(replicate(100000, calc_h_p(sort(sample(N, n)), N)), 0.95))
}
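
Note that calc_z is a Monte Carlo estimate, so repeated calls return slightly different thresholds. A minimal sketch to make a run reproducible (the seed value is arbitrary):

set.seed(1) # fix the RNG state so the simulated threshold is reproducible
calc_z(n=6, N=30)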

Validating the implementation against the example customers published in the paper:

N <- 30
custs <- list(A = c(1, 6, 11, 16, 21, 26),
              B = c(2, 3, 4, 27, 28, 29),
              C = c(13, 14, 15, 16, 17, 18))
round(sapply(custs, function(t) calc_h_p(t, N)), 3) # -> matches the values reported in the paper: 0.036, 0.48, 0.34
##     A     B     C 
## 0.036 0.477 0.341
round(calc_z(n=6, N=30), 3) # published value: 0.257; a Monte Carlo estimate, so it may deviate slightly between runs
## [1] 0.257

Analyzing the CDNOW Data

Load CDNOW elog data:

library(BTYD, quietly = TRUE)
# read customer id (column 1) and purchase date (column 3) from the CDNOW event log
elog <- dc.ReadLines(system.file("data/cdnowElog.csv", package="BTYD"), 1, 3)
## Started reading file. Progress:
## 1000/6919
## 2000/6919
## 3000/6919
## 4000/6919
## 5000/6919
## 6000/6919
## File successfully read.
# convert raw dates and derive a day index t, with t=1 being the first observed day
elog$date <- as.Date(elog$date, "%Y%m%d")
elog$t <- 1 + as.numeric(elog$date - min(elog$date))
head(elog)
##   cust       date   t
## 1    4 1997-01-01   1
## 2    4 1997-01-18  18
## 3    4 1997-08-02 214
## 4    4 1997-12-12 346
## 5   21 1997-01-01   1
## 6   21 1997-01-13  13
# merge same-day entries (~3% of rows) into a single event per customer and day;
# duplicate timestamps would produce zero-length gaps, and hence NaN, in calc_h_p
elog <- unique(elog[, c("cust", "date", "t")])
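
To see why this matters: a duplicate timestamp produces a zero-length gap, and 0 * log(0) evaluates to NaN in R, so calc_h_p would return NaN. A small illustration with made-up event times:

calc_h_p(c(5, 5, 20), N=30) # -> NaN, due to the duplicate timestamp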

# length of the observation period in days
(N <- max(elog$t))
## [1] 546
# remove each customer's first purchase; note that one-time buyers thereby
# drop out of the analysis entirely.
# Note: this data pre-processing step was done for the published paper,
#       and significantly impacts the estimated share of clumpy users
library(plyr, quietly = TRUE)
elog <- ddply(elog, .(cust), transform, is_first = t == min(t))
elog <- elog[!elog$is_first, ]
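
Since one-time buyers drop out entirely, it is worth checking how many customers and transactions remain (a quick sanity check; counts not shown here):

length(unique(elog$cust)) # number of repeat buyers kept
nrow(elog)                # number of repeat transactions kept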

Calculate clumpiness measure for each customer:

# per-customer frequency n, clumpiness h_p and day of first repeat purchase
data <- ddply(elog, .(cust), summarize,
              n = length(t),
              h_p = calc_h_p(t, N),
              first = min(t))
head(data)
##    cust  n     h_p first
## 1 10048  2 0.18012    44
## 2 10061 11 0.12978    84
## 3 10064  2 0.05689   114
## 4 10081  1 0.14247   393
## 5 10085 10 0.08686    57
## 6 10096  3 0.48347   374
# calculate the simulated Z threshold once per distinct frequency n
# (rather than once per customer) and merge it back onto the data
n_values <- unique(data$n)
Z <- data.frame(n=n_values, z=sapply(n_values, function(n) calc_z(n, N)))
data <- merge(data, Z, by="n", all.x=TRUE)

# flag a customer as clumpy if H_p exceeds the 95% threshold for their frequency
data$is_clumpy <- data$h_p > data$z
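
To eyeball the resulting flags, the customers with the highest clumpiness scores can be listed next to their frequency-specific thresholds (a sketch; output omitted):

# top customers by clumpiness, with their simulated threshold z
head(data[order(-data$h_p), c("cust", "n", "h_p", "z", "is_clumpy")])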

# share of clumpy buyers (among repeat buyers)
mean(data[, "is_clumpy"]) # -> paper reports ~8%
## [1] 0.09131
# share of clumpy buyers by repeat-purchase frequency
mean(data[data$n==1, "is_clumpy"])
## [1] 0.01737
mean(data[data$n>1, "is_clumpy"])
## [1] 0.1318
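
For a finer view than the two buckets above, the clumpy share can also be computed per frequency level (a sketch; the exact numbers depend on the simulated thresholds):

# share of clumpy buyers for each repeat-purchase count n
aggregate(is_clumpy ~ n, data=data, FUN=mean)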