善用商業數據分析的工具和技巧,光靠一份最簡單的交易紀錄(只有顧客ID、交易日期和交易金額三個欄位),我們就可以做一系列很深入、很有價值的顧客價值分析和行銷策略規劃,包括:
圖一、顧客價值管理的層次
從這一些分析我們可以看到公司主要的營收和獲利的重要來源,我們也可以看到這一些產生獲利的群組是不是有成長或者衰退的趨勢;據此我們可以設定行銷的重點,決定行銷的策略,和規劃行銷的工具。除了上述的敘述統計、集群分析、和資料視覺化之外,我們還可以利用這些簡單的交易紀錄:
利用這一些預測我們就可以進行全面客製化的:
圖二、顧客價值管理流程
Sys.setlocale("LC_ALL","C")
[1] "C"
packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","caTools","ROCR","caTools")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
if(!is.element("chorddiag", existing))
devtools::install_github("mattflor/chorddiag")
rm(list=ls(all=T))
options(digits=4, scipen=12)
library(dplyr)
library(ggplot2)
library(caTools)
library(ROCR)
library(googleVis)
library(chorddiag)
X = read.table(
'purchases.txt', header=FALSE, sep='\t', stringsAsFactors=F)
names(X) = c('cid','amount','date')
X$date = as.Date(X$date)
summary(X) # 交易次數 51243
cid amount date
Min. : 10 Min. : 5 Min. :2005-01-02
1st Qu.: 57720 1st Qu.: 25 1st Qu.:2009-01-17
Median :102440 Median : 30 Median :2011-11-23
Mean :108935 Mean : 62 Mean :2011-07-14
3rd Qu.:160525 3rd Qu.: 60 3rd Qu.:2013-12-29
Max. :264200 Max. :4500 Max. :2015-12-31
par(cex=0.8)
hist(X$date, "years", las=2, freq=T, xlab="", main="No. Transaction by Year")
n_distinct(X$cid) # 顧客數 18417
[1] 18417
A = X %>%
mutate(days = as.integer(as.Date("2016-01-01") - date)) %>%
group_by(cid) %>% summarise(
recent = min(days), # 最近購買距今天數
freq = n(), # 購買次數
money = mean(amount), # 平均購買金額
senior = max(days), # 第一次購買距今天數
since = min(date) # 第一次購買日期
) %>% data.frame
summary(A)
cid recent freq money senior since
Min. : 10 Min. : 1 Min. : 1.00 Min. : 5 Min. : 1 Min. :2005-01-02
1st Qu.: 81990 1st Qu.: 244 1st Qu.: 1.00 1st Qu.: 22 1st Qu.: 988 1st Qu.:2007-10-23
Median :136430 Median :1070 Median : 2.00 Median : 30 Median :2087 Median :2010-04-15
Mean :137574 Mean :1253 Mean : 2.78 Mean : 58 Mean :1984 Mean :2010-07-26
3rd Qu.:195100 3rd Qu.:2130 3rd Qu.: 3.00 3rd Qu.: 50 3rd Qu.:2992 3rd Qu.:2013-04-18
Max. :264200 Max. :4014 Max. :45.00 Max. :4500 Max. :4016 Max. :2015-12-31
p0 = par(cex=0.8, mfrow=c(2,2), mar=c(3,3,4,2))
hist(A$recent,20,main="recency",ylab="",xlab="")
hist(pmin(A$freq, 10),0:10,main="frequency",ylab="",xlab="")
hist(A$senior,20,main="seniority",ylab="",xlab="")
hist(log(A$money,10),,main="log(money)",ylab="",xlab="")
set.seed(111)
A$grp = kmeans(scale(A[,2:4]),10)$cluster
table(A$grp) # 族群大小
1 2 3 4 5 6 7 8 9 10
1073 2266 1296 2237 3207 1942 1781 2392 2096 127
group_by(A, grp) %>% summarise(
recent=mean(recent),
freq=mean(freq),
money=mean(money),
size=n() ) %>%
mutate( revenue = size*money/1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x=freq, y=money)) +
geom_point(aes(size=revenue, col=recent),alpha=0.5) +
scale_size(range=c(4,30)) +
scale_color_gradient(low="green",high="red") +
scale_x_log10() + scale_y_log10(limits=c(30,3000)) +
geom_text(aes(label = size ),size=3) +
theme_bw() + guides(size=F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
STS = c("N1","N2","R1","R2","S1","S2","S3")
Status = function(rx,fx,mx,sx,K) {factor(
ifelse(sx < 2*K,
ifelse(fx*mx > 50, "N2", "N1"),
ifelse(rx < 2*K,
ifelse(sx/fx < 0.75*K,"R2","R1"),
ifelse(rx < 3*K,"S1",
ifelse(rx < 4*K,"S2","S3")))), STS)}
圖三、顧客分群規則
K = as.integer(sum(A$senior[A$freq>1]) / sum(A$freq[A$freq>1])); K
[1] 521
回購顧客的平均購買週期 K = 521 days
Y = list() # 建立一個空的LIST
for(y in 2010:2015) { # 每年年底將顧客資料彙整成一個資料框
D = as.Date(paste0(c(y, y-1),"-12-31")) # 當期、前期的期末日期
Y[[paste0("Y",y)]] = X %>% # 從交易資料做起
filter(date <= D[1]) %>% # 將資料切齊到期末日期
mutate(days = 1 + as.integer(D[1] - date)) %>% # 交易距期末天數
group_by(cid) %>% summarise( # 依顧客彙總 ...
recent = min(days), # 最後一次購買距期末天數
freq = n(), # 購買次數 (至期末為止)
money = mean(amount), # 平均購買金額 (至期末為止)
senior = max(days), # 第一次購買距期末天數
status = Status(recent,freq,money,senior,K), # 期末狀態
since = min(date), # 第一次購買日期
y_freq = sum(date > D[2]), # 當期購買次數
y_revenue = sum(amount[date > D[2]]) # 當期購買金額
) %>% data.frame }
head(Y$Y2015)
cid recent freq money senior status since y_freq y_revenue
1 10 3829 1 30.00 3829 S3 2005-07-08 0 0
2 80 343 7 71.43 3751 R1 2005-09-24 1 80
3 90 758 10 115.80 3783 R2 2005-08-23 0 0
4 120 1401 1 20.00 1401 S1 2012-03-01 0 0
5 130 2970 2 50.00 3710 S3 2005-11-04 0 0
6 160 2963 2 30.00 3577 S3 2006-03-17 0 0
sapply(Y, nrow)
Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
10407 11674 13562 15468 16905 18417
cols = c("gold","orange","blue","green","pink","magenta","darkred")
sapply(Y, function(df) table(df$status)) %>% barplot(col=cols)
legend("topleft",rev(STS),fill=rev(cols))
CustSegments = do.call(rbind, lapply(Y, function(d) {
group_by(d, status) %>% summarise(
average_frequency = mean(freq),
average_amount = mean(money),
total_revenue = sum(y_revenue),
total_no_orders = sum(y_freq),
average_recency = mean(recent),
average_seniority = mean(senior),
group_size = n()
)})) %>% ungroup %>%
mutate(year=rep(2010:2015, each=7)) %>% data.frame
head(CustSegments)
status average_frequency average_amount total_revenue total_no_orders average_recency average_seniority
1 N1 1.154 26.96 41046 1500 449.2 507.1
2 N2 2.263 96.00 121034 1416 241.2 629.9
3 R1 2.674 43.25 19407 435 492.4 1470.0
4 R2 5.406 60.31 108631 1588 157.1 1547.3
5 S1 1.267 44.61 0 0 1289.3 1373.1
6 S2 1.167 60.65 0 0 1889.7 1922.4
group_size year
1 3330 2010
2 1655 2010
3 1298 2010
4 1547 2010
5 2203 2010
6 360 2010
plot( gvisMotionChart(
CustSegments, "status", "year",
options=list(width=900, height=600) ) )
圖四、顧客分群規則
df = merge(Y$Y2014[,c(1,6)], Y$Y2015[,c(1,6)],
by="cid", all.x=T)
tx = table(df$status.x, df$status.y) %>%
as.data.frame.matrix() %>% as.matrix()
tx # 流量矩陣
N1 N2 R1 R2 S1 S2 S3
N1 1705 381 144 45 831 0 0
N2 0 1131 267 430 263 0 0
R1 0 0 1240 43 819 0 0
R2 0 0 199 1742 75 0 0
S1 0 0 115 3 819 1026 0
S2 0 0 78 1 0 692 1339
S3 0 0 97 0 0 0 3420
tx %>% prop.table(1) %>% round(3) # 流量矩陣(%)
N1 N2 R1 R2 S1 S2 S3
N1 0.549 0.123 0.046 0.014 0.268 0.000 0.000
N2 0.000 0.541 0.128 0.206 0.126 0.000 0.000
R1 0.000 0.000 0.590 0.020 0.390 0.000 0.000
R2 0.000 0.000 0.099 0.864 0.037 0.000 0.000
S1 0.000 0.000 0.059 0.002 0.417 0.523 0.000
S2 0.000 0.000 0.037 0.000 0.000 0.328 0.635
S3 0.000 0.000 0.028 0.000 0.000 0.000 0.972
chorddiag(tx, groupColors=cols)
在這個案例裡面,我們的資料是收到Y2015年底,所以我們可以假設現在的時間是Y2015年底,我們想要用現有的資料建立模型,來預測每一位顧客:
但是,我們並沒有Y2016的資料,為了要建立模型,我們需要先把時間回推一期,也就是說:
假如Y2016的情況(跟Y2015比)沒有太大的變化的話,接下來我們就可以
我們用Y2014年底的資料做自變數,Y2015年的資料做應變數
CX = left_join(Y$Y2014, Y$Y2015[,c(1,8,9)], by="cid")
head(CX)
cid recent freq money senior status since y_freq.x y_revenue.x y_freq.y y_revenue.y
1 10 3464 1 30.0 3464 S3 2005-07-08 0 0 0 0
2 80 302 6 70.0 3386 R1 2005-09-24 1 80 1 80
3 90 393 10 115.8 3418 R2 2005-08-23 0 0 0 0
4 120 1036 1 20.0 1036 N1 2012-03-01 0 0 0 0
5 130 2605 2 50.0 3345 S3 2005-11-04 0 0 0 0
6 160 2598 2 30.0 3212 S3 2006-03-17 0 0 0 0
names(CX)[8:11] = c("freq0","revenue0","Retain", "Revenue")
CX$Retain = CX$Retain > 0
head(CX)
cid recent freq money senior status since freq0 revenue0 Retain Revenue
1 10 3464 1 30.0 3464 S3 2005-07-08 0 0 FALSE 0
2 80 302 6 70.0 3386 R1 2005-09-24 1 80 TRUE 80
3 90 393 10 115.8 3418 R2 2005-08-23 0 0 FALSE 0
4 120 1036 1 20.0 1036 N1 2012-03-01 0 0 FALSE 0
5 130 2605 2 50.0 3345 S3 2005-11-04 0 0 FALSE 0
6 160 2598 2 30.0 3212 S3 2006-03-17 0 0 FALSE 0
table(CX$Retain) %>% prop.table() # 平均保留機率 = 22.54%
FALSE TRUE
0.7701 0.2299
mRet = glm(Retain ~ ., CX[,c(2:3,6,8:10)], family=binomial())
summary(mRet)
Call:
glm(formula = Retain ~ ., family = binomial(), data = CX[, c(2:3,
6, 8:10)])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.689 -0.473 -0.298 -0.142 3.386
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.074007 0.089431 -12.01 < 2e-16 ***
recent -0.002067 0.000131 -15.73 < 2e-16 ***
freq 0.095217 0.013882 6.86 0.0000000000069 ***
statusN2 0.669429 0.070234 9.53 < 2e-16 ***
statusR1 0.488321 0.084389 5.79 0.0000000071864 ***
statusR2 1.290002 0.110841 11.64 < 2e-16 ***
statusS1 0.670604 0.146532 4.58 0.0000047279944 ***
statusS2 1.353554 0.208210 6.50 0.0000000000798 ***
statusS3 2.573689 0.275786 9.33 < 2e-16 ***
freq0 0.566557 0.065532 8.65 < 2e-16 ***
revenue0 -0.000132 0.000135 -0.98 0.33
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 18228 on 16904 degrees of freedom
Residual deviance: 11766 on 16894 degrees of freedom
AIC: 11788
Number of Fisher Scoring iterations: 6
pred = predict(mRet,type="response")
table(pred>0.5,CX$Retain)
FALSE TRUE
FALSE 12045 1530
TRUE 974 2356
# 混淆矩陣 (Confusion Matrix)
table(pred>0.5,CX$Retain) %>%
{sum(diag(.))/sum(.)} # 正確率(ACC): 85.19%
[1] 0.8519
colAUC(pred,CX$Retain) # 辯識率(AUC): 87.92%
[,1]
FALSE vs. TRUE 0.8792
prediction(pred, CX$Retain) %>% # ROC CURVE
performance("tpr", "fpr") %>%
plot(print.cutoffs.at=seq(0,1,0.1))
dx = subset(CX, Revenue > 0) # 只對有來購買的人做模型
mRev = lm(log(Revenue) ~ recent + freq + log(1+money) + senior +
status + freq0 + log(1+revenue0), dx)
summary(mRev) # 判定係數:R2 = 0.713
Call:
lm(formula = log(Revenue) ~ recent + freq + log(1 + money) +
senior + status + freq0 + log(1 + revenue0), data = dx)
Residuals:
Min 1Q Median 3Q Max
-3.245 -0.209 -0.067 0.205 3.435
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0587930 0.0458344 1.28 0.1997
recent 0.0003541 0.0000507 6.98 0.00000000000337 ***
freq 0.0526850 0.0046504 11.33 < 2e-16 ***
log(1 + money) 0.9320818 0.0135203 68.94 < 2e-16 ***
senior -0.0001369 0.0000182 -7.52 0.00000000000007 ***
statusN2 0.0127716 0.0262656 0.49 0.6268
statusR1 0.1927532 0.0407579 4.73 0.00000233405019 ***
statusR2 0.0297685 0.0352479 0.84 0.3984
statusS1 0.0082406 0.0630355 0.13 0.8960
statusS2 -0.2406398 0.0865731 -2.78 0.0055 **
statusS3 -0.3667341 0.1181061 -3.11 0.0019 **
freq0 0.0103133 0.0172551 0.60 0.5501
log(1 + revenue0) 0.0632756 0.0094003 6.73 0.00000000001930 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.463 on 3873 degrees of freedom
Multiple R-squared: 0.713, Adjusted R-squared: 0.712
F-statistic: 802 on 12 and 3873 DF, p-value: <2e-16
plot(log(dx$Revenue), predict(mRev), col='pink', cex=0.65)
abline(0,1,col='red')
使用模型對Y2015年底的資料做預測,對資料中的每一位顧客,預測她們在Y2016的保留率和購買金額。
CX = Y$Y2015
names(CX)[8:9] = c("freq0","revenue0")
# 預測Y2016保留率
CX$ProbRetain = predict(mRet,CX,type='response')
# 預測Y2016購買金額
CX$PredRevenue = exp(predict(mRev,CX))
par(mfrow=c(1,2), mar=c(4,3,3,2), cex=0.8)
hist(CX$ProbRetain,main="ProbRetain", ylab="")
hist(log(CX$PredRevenue,10),main="log(PredRevenue)", ylab="")
\[ V_i = \sum_{t=0}^N g \times m_i \frac{r_i^t}{(1+d)^t} = g \times m_i \sum_{t=0}^N (\frac{r_i}{1+d})^t \]
g = 0.5 # (稅前)獲利率
N = 5 # 期數 = 5
d = 0.1 # 利率 = 10%
CX$CLV = g * CX$PredRevenue * rowSums(sapply(
0:N, function(i) (CX$ProbRetain/(1+d))^i ) )
summary(CX$CLV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3 16 24 51 45 5094
par(mar=c(2,2,3,1), cex=0.8)
hist(log(CX$CLV,10), xlab="", ylab="")
# 各族群的平均營收貢獻、保留機率、終生價值
sapply(CX[,10:12], tapply, CX$status, mean)
ProbRetain PredRevenue CLV
N1 0.20269 31.98 20.17
N2 0.44075 131.23 110.89
R1 0.34150 69.85 54.60
R2 0.74925 91.27 136.31
S1 0.05724 56.10 29.66
S2 0.03475 49.48 25.58
S3 0.02326 49.36 25.17
par(mar=c(3,3,4,2), cex=0.8)
boxplot(log(CLV)~status, CX, main="CLV by Groups")
給定某一行銷工具的成本和預期效益,選擇可以施行這項工具的對象。
R2族群的預測保留率和購買金額
par(mfrow=c(1,2), mar=c(4,3,3,2), cex=0.8)
hist(CX$ProbRetain[CX$status=="R2"],main="ProbRetain",xlab="")
hist(log(CX$PredRevenue[CX$status=="R2"],10),main="PredRevenue",xlab="")
假設行銷工具的成本和預期效益為
cost = 10 # 成本
effect = 0.75 # 效益:下一期的購買機率
估計這項行銷工具對每一位R2顧客的預期報酬
Target = subset(CX, status=="R2")
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
summary(Target$ExpReturn)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-515.8 -15.4 -11.5 -10.3 -8.1 646.9
這一項工具對R2顧客的預期報酬是負的
但是,我們還是可以挑出許多預期報酬很大的行銷對象
Target %>% arrange(desc(ExpReturn)) %>% select(cid, ExpReturn) %>% head(15)
cid ExpReturn
1 141340 646.85
2 164930 609.02
3 2190 323.38
4 172750 305.75
5 156280 289.41
6 190570 236.91
7 134030 199.29
8 141430 150.73
9 123480 134.52
10 65830 110.61
11 150890 107.18
12 134590 99.48
13 72000 92.55
14 133080 84.54
15 147510 77.18
sum(Target$ExpReturn > 0) # 可實施對象:258
[1] 258
在R2之中,有258人的預期報酬大於零,如果對這258人使用這項工具,我們的期望報酬是:
sum(Target$ExpReturn[Target$ExpReturn > 0]) # 預期報酬:6464
[1] 6464
我們可以算出對所有的族群實施這項工具的期望報酬 …
Target = CX
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
filter(Target, Target$ExpReturn > 0) %>%
group_by(status) %>% summarise(
No.Target = n(),
AvgROI = mean(ExpReturn),
TotalROI = sum(ExpReturn) ) %>% data.frame
status No.Target AvgROI TotalROI
1 N1 2211 9.973 22049
2 N2 1459 47.238 68920
3 R1 1596 25.459 40633
4 R2 258 25.054 6464
5 S1 2645 30.592 80915
6 S2 1609 27.134 43658
7 S3 4495 27.598 124052
這個結果是合理的嗎? 你想要怎麼修正這項分析的程序呢?
如果你只有顧客ID、交易日期、交易金額三個欄位的話,你可以做的分析包括:
一般而言,這一些分析的結果,足夠讓我們制定顧客發展和顧客保留策略;至於顧客吸收策略,我們通常還需要從CRM撈出顧客個人屬性資料才能做到。