前言:從交易記錄到顧客價值

善用商業數據分析的工具和技巧,光靠一份最簡單的交易紀錄(只有顧客ID、交易日期和交易金額三個欄位),我們就可以做一系列很深入、很有價值的顧客價值分析和行銷策略規劃,包括:

圖一、顧客價值管理的層次

圖一、顧客價值管理的層次


從這一些分析我們可以看到公司主要的營收和獲利的重要來源,我們也可以看到這一些產生獲利的群組是不是有成長或者衰退的趨勢;據此我們可以設定行銷的重點,決定行銷的策略,和規劃行銷的工具。除了上述的敘述統計、集群分析、和資料視覺化之外,我們還可以利用這些簡單的交易紀錄:


利用這一些預測我們就可以進行全面客製化的:

圖二、顧客價值管理流程

圖二、顧客價值管理流程



Setup
# Sys.setlocale("LC_ALL","C")
packages = c(
  "dplyr","ggplot2","googleVis","devtools","magrittr","caTools","ROCR","caTools")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

if(!is.element("chorddiag", existing))
  devtools::install_github("mattflor/chorddiag")
Library
rm(list=ls(all=T))
options(digits=4, scipen=12)
library(dplyr)
library(ggplot2)
library(caTools)
library(ROCR)
library(googleVis)
library(chorddiag)
library(plotly)


1 1. 資料整理

1.1 交易資料 (X)
X = read.table(
  'purchases.txt', header=FALSE, sep='\t', stringsAsFactors=F)
names(X) = c('cid','amount','date')
X$date = as.Date(X$date)
summary(X)                  # 交易次數 51243
##       cid             amount          date           
##  Min.   :    10   Min.   :   5   Min.   :2005-01-02  
##  1st Qu.: 57720   1st Qu.:  25   1st Qu.:2009-01-17  
##  Median :102440   Median :  30   Median :2011-11-23  
##  Mean   :108935   Mean   :  62   Mean   :2011-07-14  
##  3rd Qu.:160525   3rd Qu.:  60   3rd Qu.:2013-12-29  
##  Max.   :264200   Max.   :4500   Max.   :2015-12-31
par(cex=0.8)
hist(X$date, "years", las=2, freq=T, xlab="", main="No. Transaction by Year")

n_distinct(X$cid)           # 顧客數 18417
## [1] 18417
1.2 顧客資料 (A)
A = X %>% 
  mutate(days = as.integer(as.Date("2016-01-01") - date)) %>% 
  group_by(cid) %>% summarise(
    recent = min(days),     # 最近購買距今天數
    freq = n(),             # 購買次數
    money = mean(amount),   # 平均購買金額
    senior = max(days),     # 第一次購買距今天數
    since = min(date)       # 第一次購買日期
  ) %>% data.frame
1.4 顧客資料摘要
summary(A)
##       cid             recent          freq           money     
##  Min.   :    10   Min.   :   1   Min.   : 1.00   Min.   :   5  
##  1st Qu.: 81990   1st Qu.: 244   1st Qu.: 1.00   1st Qu.:  22  
##  Median :136430   Median :1070   Median : 2.00   Median :  30  
##  Mean   :137574   Mean   :1253   Mean   : 2.78   Mean   :  58  
##  3rd Qu.:195100   3rd Qu.:2130   3rd Qu.: 3.00   3rd Qu.:  50  
##  Max.   :264200   Max.   :4014   Max.   :45.00   Max.   :4500  
##      senior         since           
##  Min.   :   1   Min.   :2005-01-02  
##  1st Qu.: 988   1st Qu.:2007-10-23  
##  Median :2087   Median :2010-04-15  
##  Mean   :1984   Mean   :2010-07-26  
##  3rd Qu.:2992   3rd Qu.:2013-04-18  
##  Max.   :4016   Max.   :2015-12-31
1.5 變數的分布狀況
p0 = par(cex=0.8, mfrow=c(2,2), mar=c(3,3,4,2))
hist(A$recent,20,main="recency",ylab="",xlab="")
hist(pmin(A$freq, 10),0:10,main="frequency",ylab="",xlab="")
hist(A$senior,20,main="seniority",ylab="",xlab="")
hist(log(A$money,10),,main="log(money)",ylab="",xlab="")


2. 層級式集群分析

2.1 RFM顧客分群
set.seed(111)
A$grp = kmeans(scale(A[,2:4]),10)$cluster
table(A$grp)  # 族群大小
## 
##    1    2    3    4    5    6    7    8    9   10 
## 1073 2266 1296 2237 3207 1942 1781 2392 2096  127
2.2 顧客群組屬性
p = group_by(A, grp) %>% summarise(
  recent=mean(recent), 
  freq=mean(freq), 
  money=mean(money), 
  size=n() ) %>% 
  mutate( revenue = size*money/1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x=freq, y=money)) +
  geom_point(aes(size=revenue, col=recent),alpha=0.5) +
  scale_size(range=c(4,30)) +
  scale_color_gradient(low="green",high="red") +
  scale_x_log10() + scale_y_log10(limits=c(30,3000)) + 
  geom_text(aes(label = size ),size=3) +
  theme_bw() + guides(size=F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

plotly_build(p)


3. 規則分群

3.1 顧客分群規則
STS = c("N1","N2","R1","R2","S1","S2","S3")
Status = function(rx,fx,mx,sx,K) {factor(
  ifelse(sx < 2*K,
         ifelse(fx*mx > 50, "N2", "N1"),
         ifelse(rx < 2*K,
                ifelse(sx/fx < 0.75*K,"R2","R1"),
                ifelse(rx < 3*K,"S1",
                       ifelse(rx < 4*K,"S2","S3")))), STS)}

圖三、顧客分群規則

3.2 平均購買週期
K = as.integer(sum(A$senior[A$freq>1]) / sum(A$freq[A$freq>1])); K
## [1] 521

回購顧客的平均購買週期 K = 521 days

3.3 滑動資料窗格
Y = list()              # 建立一個空的LIST
for(y in 2010:2015) {   # 每年年底將顧客資料彙整成一個資料框
  D = as.Date(paste0(c(y, y-1),"-12-31")) # 當期、前期的期末日期 
  Y[[paste0("Y",y)]] = X %>%        # 從交易資料做起
    filter(date <= D[1]) %>%        # 將資料切齊到期末日期
    mutate(days = 1 + as.integer(D[1] - date)) %>%   # 交易距期末天數
    group_by(cid) %>% summarise(    # 依顧客彙總 ...
      recent = min(days),           #   最後一次購買距期末天數   
      freq = n(),                   #   購買次數 (至期末為止)   
      money = mean(amount),         #   平均購買金額 (至期末為止)
      senior = max(days),           #   第一次購買距期末天數
      status = Status(recent,freq,money,senior,K),  # 期末狀態
      since = min(date),                      # 第一次購買日期
      y_freq = sum(date > D[2]),              # 當期購買次數
      y_revenue = sum(amount[date > D[2]])    # 當期購買金額
    ) %>% data.frame }
head(Y$Y2015)
##   cid recent freq  money senior status      since y_freq y_revenue
## 1  10   3829    1  30.00   3829     S3 2005-07-08      0         0
## 2  80    343    7  71.43   3751     R1 2005-09-24      1        80
## 3  90    758   10 115.80   3783     R2 2005-08-23      0         0
## 4 120   1401    1  20.00   1401     S1 2012-03-01      0         0
## 5 130   2970    2  50.00   3710     S3 2005-11-04      0         0
## 6 160   2963    2  30.00   3577     S3 2006-03-17      0         0
3.4 每年年底的累計顧客人數
sapply(Y, nrow)
## Y2010 Y2011 Y2012 Y2013 Y2014 Y2015 
## 10407 11674 13562 15468 16905 18417
3.5 族群大小變化趨勢
cols = c("gold","orange","blue","green","pink","magenta","darkred")
sapply(Y, function(df) table(df$status)) %>% barplot(col=cols)
legend("topleft",rev(STS),fill=rev(cols))

3.6 族群屬性動態分析
CustSegments = do.call(rbind, lapply(Y, function(d) {
  group_by(d, status) %>% summarise(
    average_frequency = mean(freq),
    average_amount = mean(money),
    total_revenue = sum(y_revenue),
    total_no_orders = sum(y_freq),
    average_recency = mean(recent),
    average_seniority = mean(senior),
    group_size = n()
  )})) %>% ungroup %>% 
  mutate(year=rep(2010:2015, each=7)) %>% data.frame
head(CustSegments)
##   status average_frequency average_amount total_revenue total_no_orders
## 1     N1             1.154          26.96         41046            1500
## 2     N2             2.263          96.00        121034            1416
## 3     R1             2.674          43.25         19406             435
## 4     R2             5.406          60.31        108631            1588
## 5     S1             1.267          44.61             0               0
## 6     S2             1.167          60.65             0               0
##   average_recency average_seniority group_size year
## 1           449.2             507.1       3330 2010
## 2           241.2             629.9       1655 2010
## 3           492.4            1470.0       1298 2010
## 4           157.1            1547.3       1547 2010
## 5          1289.3            1373.1       2203 2010
## 6          1889.7            1922.4        360 2010
library(googleVis)
op <- options(gvis.plot.tag='chart')
plot( gvisMotionChart(
  CustSegments, "status", "year",
  options=list(width=900, height=600) ) )
3.7 族群屬性動態分析
df = merge(Y$Y2014[,c(1,6)], Y$Y2015[,c(1,6)],
           by="cid", all.x=T)
tx = table(df$status.x, df$status.y) %>% 
  as.data.frame.matrix() %>% as.matrix()
tx    # 流量矩陣
##      N1   N2   R1   R2  S1   S2   S3
## N1 1705  381  144   45 831    0    0
## N2    0 1131  267  430 263    0    0
## R1    0    0 1240   43 819    0    0
## R2    0    0  199 1742  75    0    0
## S1    0    0  115    3 819 1026    0
## S2    0    0   78    1   0  692 1339
## S3    0    0   97    0   0    0 3420
tx %>% prop.table(1) %>% round(3)   # 流量矩陣(%)
##       N1    N2    R1    R2    S1    S2    S3
## N1 0.549 0.123 0.046 0.014 0.268 0.000 0.000
## N2 0.000 0.541 0.128 0.206 0.126 0.000 0.000
## R1 0.000 0.000 0.590 0.020 0.390 0.000 0.000
## R2 0.000 0.000 0.099 0.864 0.037 0.000 0.000
## S1 0.000 0.000 0.059 0.002 0.417 0.523 0.000
## S2 0.000 0.000 0.037 0.000 0.000 0.328 0.635
## S3 0.000 0.000 0.028 0.000 0.000 0.000 0.972
3.8 互動式流量分析
chorddiag(tx, groupColors=cols)


4. 建立模型

在這個案例裡面,我們的資料是收到Y2015年底,所以我們可以假設現在的時間是Y2015年底,我們想要用現有的資料建立模型,來預測每一位顧客:

但是,我們並沒有Y2016的資料,為了要建立模型,我們需要先把時間回推一期,也就是說:

假如Y2016的情況(跟Y2015比)沒有太大的變化的話,接下來我們就可以

4.1 準備資料

我們用Y2014年底的資料做自變數,Y2015年的資料做應變數

CX = left_join(Y$Y2014, Y$Y2015[,c(1,8,9)], by="cid")
head(CX)
##   cid recent freq money senior status      since y_freq.x y_revenue.x
## 1  10   3464    1  30.0   3464     S3 2005-07-08        0           0
## 2  80    302    6  70.0   3386     R1 2005-09-24        1          80
## 3  90    393   10 115.8   3418     R2 2005-08-23        0           0
## 4 120   1036    1  20.0   1036     N1 2012-03-01        0           0
## 5 130   2605    2  50.0   3345     S3 2005-11-04        0           0
## 6 160   2598    2  30.0   3212     S3 2006-03-17        0           0
##   y_freq.y y_revenue.y
## 1        0           0
## 2        1          80
## 3        0           0
## 4        0           0
## 5        0           0
## 6        0           0
names(CX)[8:11] = c("freq0","revenue0","Retain", "Revenue")
CX$Retain = CX$Retain > 0
head(CX)
##   cid recent freq money senior status      since freq0 revenue0 Retain
## 1  10   3464    1  30.0   3464     S3 2005-07-08     0        0  FALSE
## 2  80    302    6  70.0   3386     R1 2005-09-24     1       80   TRUE
## 3  90    393   10 115.8   3418     R2 2005-08-23     0        0  FALSE
## 4 120   1036    1  20.0   1036     N1 2012-03-01     0        0  FALSE
## 5 130   2605    2  50.0   3345     S3 2005-11-04     0        0  FALSE
## 6 160   2598    2  30.0   3212     S3 2006-03-17     0        0  FALSE
##   Revenue
## 1       0
## 2      80
## 3       0
## 4       0
## 5       0
## 6       0
table(CX$Retain) %>% prop.table()  # 平均保留機率 = 22.54%
## 
##  FALSE   TRUE 
## 0.7701 0.2299
4.2 建立類別模型
mRet = glm(Retain ~ ., CX[,c(2:3,6,8:10)], family=binomial())
summary(mRet)
## 
## Call:
## glm(formula = Retain ~ ., family = binomial(), data = CX[, c(2:3, 
##     6, 8:10)])
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.689  -0.473  -0.298  -0.142   3.386  
## 
## Coefficients:
##              Estimate Std. Error z value        Pr(>|z|)    
## (Intercept) -1.074007   0.089431  -12.01         < 2e-16 ***
## recent      -0.002067   0.000131  -15.73         < 2e-16 ***
## freq         0.095217   0.013882    6.86 0.0000000000069 ***
## statusN2     0.669429   0.070234    9.53         < 2e-16 ***
## statusR1     0.488321   0.084389    5.79 0.0000000071864 ***
## statusR2     1.290002   0.110841   11.64         < 2e-16 ***
## statusS1     0.670604   0.146532    4.58 0.0000047279944 ***
## statusS2     1.353554   0.208210    6.50 0.0000000000798 ***
## statusS3     2.573689   0.275786    9.33         < 2e-16 ***
## freq0        0.566557   0.065532    8.65         < 2e-16 ***
## revenue0    -0.000132   0.000135   -0.98            0.33    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18228  on 16904  degrees of freedom
## Residual deviance: 11766  on 16894  degrees of freedom
## AIC: 11788
## 
## Number of Fisher Scoring iterations: 6
4.3 估計類別模型的準確性
pred = predict(mRet,type="response")
table(pred>0.5,CX$Retain) 
##        
##         FALSE  TRUE
##   FALSE 12045  1530
##   TRUE    974  2356
# 混淆矩陣 (Confusion Matrix)  
table(pred>0.5,CX$Retain) %>% 
  {sum(diag(.))/sum(.)}            # 正確率(ACC): 85.19% 
## [1] 0.8519
colAUC(pred,CX$Retain)             # 辯識率(AUC): 87.92%
##                  [,1]
## FALSE vs. TRUE 0.8792
prediction(pred, CX$Retain) %>%    # ROC CURVE 
  performance("tpr", "fpr") %>% 
  plot(print.cutoffs.at=seq(0,1,0.1))

4.4 建立數量模型
dx = subset(CX, Revenue > 0)  # 只對有來購買的人做模型
mRev = lm(log(Revenue) ~ recent + freq + log(1+money) + senior +
          status + freq0 + log(1+revenue0), dx)  
summary(mRev)                 # 判定係數:R2 = 0.713
## 
## Call:
## lm(formula = log(Revenue) ~ recent + freq + log(1 + money) + 
##     senior + status + freq0 + log(1 + revenue0), data = dx)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.245 -0.209 -0.067  0.205  3.435 
## 
## Coefficients:
##                     Estimate Std. Error t value         Pr(>|t|)    
## (Intercept)        0.0587930  0.0458344    1.28           0.1997    
## recent             0.0003541  0.0000507    6.98 0.00000000000337 ***
## freq               0.0526850  0.0046504   11.33          < 2e-16 ***
## log(1 + money)     0.9320818  0.0135203   68.94          < 2e-16 ***
## senior            -0.0001369  0.0000182   -7.52 0.00000000000007 ***
## statusN2           0.0127716  0.0262656    0.49           0.6268    
## statusR1           0.1927532  0.0407579    4.73 0.00000233405019 ***
## statusR2           0.0297685  0.0352479    0.84           0.3984    
## statusS1           0.0082406  0.0630355    0.13           0.8960    
## statusS2          -0.2406398  0.0865731   -2.78           0.0055 ** 
## statusS3          -0.3667341  0.1181061   -3.11           0.0019 ** 
## freq0              0.0103133  0.0172551    0.60           0.5501    
## log(1 + revenue0)  0.0632756  0.0094003    6.73 0.00000000001930 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.463 on 3873 degrees of freedom
## Multiple R-squared:  0.713,  Adjusted R-squared:  0.712 
## F-statistic:  802 on 12 and 3873 DF,  p-value: <2e-16
plot(log(dx$Revenue), predict(mRev), col='pink', cex=0.65)
abline(0,1,col='red') 


5. 估計顧客終生價值

5.1 Y2016的預測值

使用模型對Y2015年底的資料做預測,對資料中的每一位顧客,預測她們在Y2016的保留率和購買金額。

CX = Y$Y2015
names(CX)[8:9] = c("freq0","revenue0")

# 預測Y2016保留率
CX$ProbRetain = predict(mRet,CX,type='response')

# 預測Y2016購買金額
CX$PredRevenue = exp(predict(mRev,CX))
par(mfrow=c(1,2), mar=c(4,3,3,2), cex=0.8)
hist(CX$ProbRetain,main="ProbRetain", ylab="")
hist(log(CX$PredRevenue,10),main="log(PredRevenue)", ylab="")


5.2 估計顧客終生價值(CLV)
顧客\(i\)的終生價值

\[ V_i = \sum_{t=0}^N g \times m_i \frac{r_i^t}{(1+d)^t} = g \times m_i \sum_{t=0}^N (\frac{r_i}{1+d})^t \]

\(m_i\)\(r_i\):顧客\(i\)的預期(每期)營收貢獻、保留機率
\(g\)\(d\):公司的(稅前)營業利潤利率、資金成本
g = 0.5   # (稅前)獲利率
N = 5     # 期數 = 5
d = 0.1   # 利率 = 10%
CX$CLV = g * CX$PredRevenue * rowSums(sapply(
  0:N, function(i) (CX$ProbRetain/(1+d))^i ) )

summary(CX$CLV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3      16      24      51      45    5094
par(mar=c(2,2,3,1), cex=0.8)
hist(log(CX$CLV,10), xlab="", ylab="")

5.3 比較各族群的價值
# 各族群的平均營收貢獻、保留機率、終生價值
sapply(CX[,10:12], tapply, CX$status, mean)
##    ProbRetain PredRevenue    CLV
## N1    0.20269       31.98  20.17
## N2    0.44075      131.23 110.89
## R1    0.34150       69.85  54.60
## R2    0.74925       91.27 136.31
## S1    0.05724       56.10  29.66
## S2    0.03475       49.48  25.58
## S3    0.02326       49.36  25.17
par(mar=c(3,3,4,2), cex=0.8)
boxplot(log(CLV)~status, CX, main="CLV by Groups")


6. 設定行銷策略、規劃行銷工具



7. 選擇行銷對象

給定某一行銷工具的成本和預期效益,選擇可以施行這項工具的對象。

7.1 對R2族群進行保留

R2族群的預測保留率和購買金額

par(mfrow=c(1,2), mar=c(4,3,3,2), cex=0.8)
hist(CX$ProbRetain[CX$status=="R2"],main="ProbRetain",xlab="")
hist(log(CX$PredRevenue[CX$status=="R2"],10),main="PredRevenue",xlab="")

7.2 估計預期報酬

假設行銷工具的成本和預期效益為

cost = 10        # 成本
effect = 0.75    # 效益:下一期的購買機率

估計這項行銷工具對每一位R2顧客的預期報酬

Target = subset(CX, status=="R2")
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
summary(Target$ExpReturn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -515.8   -15.4   -11.5   -10.3    -8.1   646.9

這一項工具對R2顧客的預期報酬是負的

7.3 選擇行銷對象

但是,我們還是可以挑出許多預期報酬很大的行銷對象

Target %>% arrange(desc(ExpReturn)) %>% select(cid, ExpReturn) %>% head(15)
##       cid ExpReturn
## 1  141340    646.85
## 2  164930    609.02
## 3    2190    323.38
## 4  172750    305.75
## 5  156280    289.41
## 6  190570    236.91
## 7  134030    199.29
## 8  141430    150.73
## 9  123480    134.52
## 10  65830    110.61
## 11 150890    107.18
## 12 134590     99.48
## 13  72000     92.55
## 14 133080     84.54
## 15 147510     77.18
sum(Target$ExpReturn > 0)                 # 可實施對象:258
## [1] 258

在R2之中,有258人的預期報酬大於零,如果對這258人使用這項工具,我們的期望報酬是:

sum(Target$ExpReturn[Target$ExpReturn > 0])   # 預期報酬:6464
## [1] 6464
QUIZ:

我們可以算出對所有的族群實施這項工具的期望報酬 …

Target = CX
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
filter(Target, Target$ExpReturn > 0) %>%
  group_by(status) %>% summarise(
    No.Target = n(),
    AvgROI = mean(ExpReturn),
    TotalROI = sum(ExpReturn) ) %>% data.frame
##   status No.Target AvgROI TotalROI
## 1     N1      2211  9.973    22049
## 2     N2      1459 47.238    68920
## 3     R1      1596 25.459    40633
## 4     R2       258 25.054     6464
## 5     S1      2645 30.592    80915
## 6     S2      1609 27.134    43658
## 7     S3      4495 27.598   124052

這個結果是合理的嗎? 你想要怎麼修正這項分析的程序呢?




8. 結論

如果你只有顧客ID、交易日期、交易金額三個欄位的話,你可以做的分析包括:

一般而言,這一些分析的結果,足夠讓我們制定顧客發展和顧客保留策略;至於顧客吸收策略,我們通常還需要從CRM撈出顧客個人屬性資料才能做到。