1.顧客分群
2.依據顧客族群價值屬性:設定行銷目標
3.製作模型:估計顧客的回購率、預期營收獲利、終身價值
4.根據特徵設計行銷方案
5.對方案的成本、效益進行假設
Zrm(list=ls(all=T))
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr, vcd,d3heatmap,Matrix, vcd )Z = read_csv("C:/BAR_group6/final/data/ta_feng_all_months_merged.csv") %>%
data.frame %>% setNames(c(
"date","cust","age","area","cat","prod","qty","cost","price"))## Parsed with column specification:
## cols(
## TRANSACTION_DT = col_character(),
## CUSTOMER_ID = col_character(),
## AGE_GROUP = col_character(),
## PIN_CODE = col_character(),
## PRODUCT_SUBCLASS = col_double(),
## PRODUCT_ID = col_character(),
## AMOUNT = col_double(),
## ASSET = col_double(),
## SALES_PRICE = col_double()
## )
## [1] 817741
age.group = c("<25","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)#年齡級層和郵遞區號
par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2)## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
## [1] 817182
# No. cust, cat, prod, tid
#把項目彙總成訂單
#顧客人數 品類 產品數 訂單數(有重複購買)
#平均購買週期(一個月一次)
sapply(Z[c("cust","cat","prod","tid")], n_distinct)## cust cat prod tid
## 32256 2007 23789 119422
XX = Z %>% group_by(tid) %>% summarise(
date = min(date), # 交易日期
cust = min(cust), # 顧客 ID
age = min(age), # 顧客 年齡級別
area = min(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X) # 119422 ## [1] 119422
## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
## tid date cust
## Min. : 1 Min. :2000-11-01 Length:119328
## 1st Qu.: 29855 1st Qu.:2000-11-29 Class :character
## Median : 59705 Median :2001-01-01 Mode :character
## Mean : 59712 Mean :2000-12-31
## 3rd Qu.: 89581 3rd Qu.:2001-02-02
## Max. :119422 Max. :2001-02-28
## age area items pieces
## Length:119328 Length:119328 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.802 Mean : 9.222
## 3rd Qu.: 9.000 3rd Qu.:12.000
## Max. :62.000 Max. :94.000
## total gross
## Min. : 5.0 Min. :-1645.0
## 1st Qu.: 227.0 1st Qu.: 21.0
## Median : 510.0 Median : 68.0
## Mean : 851.6 Mean : 130.9
## 3rd Qu.: 1080.0 3rd Qu.: 168.0
## Max. :15345.0 Max. : 3389.0
Ad0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>% group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = min(age), # age group
area = min(area), # area code
) %>% data.frame
nrow(A) # 32241## [1] 32241
## Observations: 119,328
## Variables: 9
## $ tid <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ date <date> 2000-11-01, 2000-11-01, 2000-11-01, 2000-11-01, 2000-1...
## $ cust <chr> "00038317", "00045902", "00045957", "00046855", "000586...
## $ age <chr> "a69", "a59", "a54", "a39", "a39", "a44", "a49", "a64",...
## $ area <chr> "z115", "z115", "z115", "z115", "z115", "z115", "z115",...
## $ items <int> 2, 4, 1, 3, 6, 14, 1, 5, 5, 18, 11, 8, 17, 1, 6, 6, 1, ...
## $ pieces <dbl> 3, 9, 1, 5, 6, 14, 8, 5, 12, 31, 11, 10, 22, 1, 11, 8, ...
## $ total <dbl> 76, 555, 133, 185, 306, 623, 365, 257, 242, 1514, 432, ...
## $ gross <dbl> -8, 95, -47, 14, 52, 88, -3, 55, 2, 193, -20, 101, 407,...
##
## 1 2 3 4 5 6 7
## a24 811 750 793 739 719 843 1005
## a29 1544 1383 1276 1389 1186 1671 2058
## a34 2757 2410 2341 2335 2222 3297 4294
## a39 3256 2871 2721 2930 2842 3999 5129
## a44 2623 2463 2392 2445 2323 3352 3976
## a49 1904 1810 1660 1682 1745 2343 2835
## a54 1188 1114 1031 1095 1059 1424 1579
## a59 557 541 504 520 496 649 700
## a64 463 475 429 542 467 570 609
## a69 788 783 788 901 815 875 868
## a99 638 712 509 649 574 611 681
X %>% mutate(wdate=factor(format(date, "%u"))) %>%
count(age, wdate) %>%
ggplot(aes(x=wdate, y=n, fill=age)) +
geom_bar(stat="Identity") +
facet_wrap(~age)X %>% mutate(wdate=format(date, "%u")) %>%
group_by(cust) %>% summarise(
weekday = mean(wdate <= 5)
) %>% right_join(A) -> A## Joining, by = "cust"
##
## (0,4] (4,8] (8,12] (12,20] (20,30] (30,Inf]
## MIXED 0.273 0.847 0.943 0.977 0.990 1.000
## WDAY 0.437 0.129 0.053 0.022 0.010 0.000
## WEND 0.290 0.024 0.005 0.001 0.000 0.000
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
## The following object is masked from 'package:magrittr':
##
## extract
A %>% group_by(km) %>% summarise_at(vars(r,f,m,weekday), mean) %>%
mutate_at(vars(r:weekday), scale) -> df
mx = df[,-1] %>% as.matrix.data.frame() %>% t
colnames(mx) = paste0("G", 1:5)
par(cex=0.75)
barplot(mx, beside=T, col=rainbow(4), ylim=c(-2,3))
legend('topright', legend=rownames(mx), fill=rainbow(4))
🗿
【G1】不常來/購買金額低/平日
【G2】不常/平日/錢少
【G3】近期有來/常來/購買金額低/平日
【G4】不常來/購買金額高/假日
【G5】不常來/假日/錢少
#各族群的年齡分布(人數/比率)
par(mfrow=c(2,1),cex=0.75)
table(A$age,A$km) %>% barplot(beside=T,main="分群後各年齡人數")
table(A$age,A$km) %>% prop.table(2) %>% barplot(beside=T,main="分群後各年齡比率")#各族群的主要分布地區
library(d3heatmap)
H=table(A$km,A$area) %>% as.data.frame.matrix
E = (rowSums(H) %o% colSums(H))/sum(H) # cells' expected value
r = (H - E)/sqrt(E) # standerdized residuals
r[r > 100] = 100 # adjust the outlier
r %>% d3heatmap(col=colorRamp(c('seagreen','lightyellow','red')))<div id="htmlwidget-e7063f7b99b99691c2e9" style="width:672px;height:480px;" class="d3heatmap html-widget">
######開始做預測 ##### The Demarcation Date Remove data after the demarcation date
#項目會總程訂單
X_new = group_by(Z_new, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額(已經*過件數了)
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
#最大化營收還是最大化獲利
#營收最大的保整獲利最大的## tid date cust
## Min. : 1 Min. :2000-11-01 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character
## Median :44194 Median :2000-12-12 Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## age area items pieces
## Length:88387 Length:88387 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.994 Mean : 9.453
## 3rd Qu.: 9.000 3rd Qu.: 12.000
## Max. :112.000 Max. :339.000
## total gross
## Min. : 5.0 Min. :-1645.0
## 1st Qu.: 230.0 1st Qu.: 23.0
## Median : 522.0 Median : 72.0
## Mean : 888.7 Mean : 138.3
## 3rd Qu.: 1120.0 3rd Qu.: 174.0
## Max. :30171.0 Max. : 8069.0
##
## 1 2 3 4 5 6 7
## a24 603 563 531 529 492 635 754
## a29 1180 1023 879 1055 886 1249 1545
## a34 2094 1820 1628 1757 1660 2424 3241
## a39 2499 2118 1855 2159 2104 2953 3853
## a44 2015 1812 1636 1816 1726 2501 3072
## a49 1489 1343 1146 1218 1264 1731 2163
## a54 910 813 715 793 785 1081 1211
## a59 429 396 335 396 361 497 535
## a64 347 340 284 400 351 422 470
## a69 588 549 530 670 607 665 679
## a99 476 522 367 462 420 450 510
X_new %>% mutate(wdate=factor(format(date, "%u"))) %>%
count(age, wdate) %>%
ggplot(aes(x=wdate, y=n, fill=age)) +
geom_bar(stat="Identity") +
facet_wrap(~age)## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
#訂單彙總成顧客
d0 = max(X_new$date) + 1
A_new = X_new %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(A_new)## [1] 28584
#切了三個月準備X(項目>>>訂單>>>顧客)
X_new %>% mutate(wdate=format(date, "%u")) %>%
group_by(cust) %>% summarise(
weekday = mean(wdate <= 5)
) %>% right_join(A_new) -> A_new## Joining, by = "cust"
A_new = A_new %>% mutate(
wday = ifelse(weekday == 0, "WEND",
ifelse(weekday == 1, "WDAY", "MIXED")) %>% factor )##
## (0,4] (4,8] (8,12] (12,20] (20,30] (30,Inf]
## MIXED 0.264 0.853 0.957 0.980 1.000 1.000
## WDAY 0.440 0.128 0.040 0.020 0.000 0.000
## WEND 0.296 0.019 0.002 0.000 0.000 0.000
A$amountSimply a Left Joint
A$buyA_new$buy = !is.na(A_new$amount)
#是=TRUE;不是=FALSE
#第二個Y 有買沒買
#一個月的保留率不到一半
table(A_new$buy, !is.na(A_new$amount))##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
## cust weekday r s
## Length:28584 Min. :0.0000 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:0.2000 1st Qu.:11.00 1st Qu.:47.00
## Mode :character Median :0.6667 Median :21.00 Median :68.00
## Mean :0.5929 Mean :32.12 Mean :61.27
## 3rd Qu.:1.0000 3rd Qu.:53.00 3rd Qu.:83.00
## Max. :1.0000 Max. :92.00 Max. :92.00
##
## f m rev raw
## Min. : 1.000 Min. : 8.0 Min. : 8 Min. : -742.0
## 1st Qu.: 1.000 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0
## Median : 2.000 Median : 709.5 Median : 1566 Median : 218.0
## Mean : 3.089 Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 4.000 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :60.000 Max. :10634.0 Max. :99597 Max. :15565.0
##
## age area wday amount
## Length:28584 Length:28584 MIXED:10750 Min. : 8
## Class :character Class :character WDAY :10821 1st Qu.: 454
## Mode :character Mode :character WEND : 7013 Median : 993
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
## buy
## Mode :logical
## FALSE:15342
## TRUE :13242
##
##
##
##
#先做BLOCK才做RANDOM
X_new = subset(X_new, cust %in% A_new$cust & date < as.Date("2001-02-01"))
Z_new = subset(Z_new, cust %in% A_new$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A_new$buy, SplitRatio=0.7)
#set.seed(2018)讓每次RANDOM的結果都相同(隨機的種子)
#sample.split()樣本切割工具
#7成TRANING
#依據A中的比率切成7 3
c(nrow(A_new), sum(spl), sum(!spl))## [1] 28584 20008 8576
#把有買的人切出來
cbind(A_new, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)A2_new = subset(A_new, buy) %>% mutate_at(c("m","rev","amount"), log10)
#mutate_at(c("m","rev","amount"), log10)把跟錢相關的欄位取LOG
#C(X,X,Y)
n = nrow(A2_new)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
#把向量隨機的切成3分7分1:n %in% sample(1:n, round(0.7*n))
#在I~N中抽出7成的資料(ROUND把小數點去掉)
#在裡面就說是TRUE
#沒有切好TEAT和TRAIN的分布結果就會不同
c(nrow(A2_new), sum(spl2), sum(!spl2))## [1] 13242 9269 3973
######模型製作
## [1] "C"
#做類別模(邏輯式回歸)
#10:AMOUNT是Y(做GLM模型的準確度會變100%)
glm1 = glm(buy ~ ., TR[,c(2:11, 13)],family=binomial())
summary(glm1)#看係數(每個X對Y的效果)>>>商務數據不關心,是要拿X來預測Y(不用理會共線性的問題>>不影響對Y估計值的準確度)##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:11,
## 13)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7551 -0.8705 -0.6950 1.0314 1.8769
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.042e+00 1.576e-01 -6.611 3.82e-11 ***
## weekday -2.230e-01 1.723e-01 -1.294 0.19577
## r -1.165e-02 9.199e-04 -12.664 < 2e-16 ***
## s 8.964e-03 9.331e-04 9.606 < 2e-16 ***
## f 2.864e-01 1.670e-02 17.152 < 2e-16 ***
## m -2.494e-05 2.784e-05 -0.896 0.37035
## rev 3.774e-05 1.937e-05 1.948 0.05143 .
## raw -2.284e-04 8.551e-05 -2.672 0.00755 **
## agea29 -4.177e-02 8.669e-02 -0.482 0.62991
## agea34 1.775e-02 7.997e-02 0.222 0.82437
## agea39 7.861e-02 7.926e-02 0.992 0.32131
## agea44 8.773e-02 8.135e-02 1.078 0.28082
## agea49 1.935e-02 8.460e-02 0.229 0.81912
## agea54 1.964e-02 9.326e-02 0.211 0.83318
## agea59 1.770e-01 1.094e-01 1.618 0.10574
## agea64 6.038e-02 1.175e-01 0.514 0.60734
## agea69 2.663e-01 1.048e-01 2.543 0.01101 *
## agea99 -1.415e-01 1.499e-01 -0.944 0.34536
## areaz106 -4.420e-02 1.322e-01 -0.334 0.73813
## areaz110 -2.103e-01 1.046e-01 -2.011 0.04428 *
## areaz114 3.331e-02 1.112e-01 0.300 0.76452
## areaz115 2.543e-01 9.698e-02 2.622 0.00874 **
## areaz221 1.761e-01 9.769e-02 1.803 0.07141 .
## areazOthers -4.861e-02 1.046e-01 -0.465 0.64203
## areazUnknown -1.700e-01 1.233e-01 -1.378 0.16818
## wdayWDAY 1.040e-02 9.307e-02 0.112 0.91103
## wdayWEND -2.405e-01 1.025e-01 -2.347 0.01894 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23287 on 19981 degrees of freedom
## AIC: 23341
##
## Number of Fisher Scoring iterations: 5
#重要的是拿來做預測
#PRED模型預測會買不會買
#製作混淆矩陣
#pred > 0.5利用臨界值切開來
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5)
cm## predict
## actual FALSE TRUE
## FALSE 3724 879
## TRUE 1684 2289
acc.ts = cm %>% {sum(diag(.))/sum(.)}
#sum(diag(.))/sum(.)>>>>>分子:對角線(DIAG)相加;分母:整個矩陣相加
c(1-mean(TS$buy) , acc.ts) # 0.69998## [1] 0.5367304 0.7011427
## [,1]
## FALSE vs. TRUE 0.7555224
A2_new = subset(A_new, A_new$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2_new, spl2)
TS2 = subset(A2_new, !spl2)##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01379 -0.22930 0.04878 0.28566 1.37342
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.342e+00 5.879e-02 22.820 < 2e-16 ***
## weekday -1.857e-02 3.540e-02 -0.524 0.599977
## r 1.102e-04 2.414e-04 0.457 0.647914
## s 3.850e-04 2.361e-04 1.631 0.102929
## f 2.160e-02 1.436e-03 15.036 < 2e-16 ***
## m 4.743e-01 1.371e-02 34.590 < 2e-16 ***
## raw 6.950e-05 8.597e-06 8.084 7.05e-16 ***
## agea29 4.853e-02 2.519e-02 1.927 0.054049 .
## agea34 8.958e-02 2.323e-02 3.856 0.000116 ***
## agea39 1.205e-01 2.285e-02 5.274 1.37e-07 ***
## agea44 1.101e-01 2.346e-02 4.693 2.73e-06 ***
## agea49 6.663e-02 2.435e-02 2.736 0.006229 **
## agea54 8.623e-02 2.654e-02 3.249 0.001163 **
## agea59 3.922e-02 3.098e-02 1.266 0.205540
## agea64 7.702e-03 3.248e-02 0.237 0.812548
## agea69 -3.527e-02 2.883e-02 -1.223 0.221319
## agea99 9.553e-02 4.064e-02 2.351 0.018762 *
## areaz106 1.002e-01 4.259e-02 2.353 0.018628 *
## areaz110 5.595e-02 3.461e-02 1.617 0.105948
## areaz114 1.883e-02 3.638e-02 0.518 0.604810
## areaz115 2.241e-02 3.172e-02 0.706 0.479942
## areaz221 4.316e-02 3.198e-02 1.350 0.177204
## areazOthers 3.860e-02 3.433e-02 1.124 0.260965
## areazUnknown 1.538e-02 3.859e-02 0.399 0.690249
## wdayWDAY 1.392e-02 1.996e-02 0.697 0.485643
## wdayWEND 1.897e-02 2.408e-02 0.788 0.430941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4234 on 9243 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2831
## F-statistic: 147.4 on 25 and 9243 DF, p-value: < 2.2e-16
#看準確率看AUC R(小) 平均誤差(大) 均方差(大)>>>不准
r2.tr = summary(lm1)$r.sq#TRAIN的R-SQL(TEST要自己算)
#變異:偏離平均值的程度
#做完模型和預測
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
#均方差(變藝術的概念)的般位會是Y單位的平方
#R 的單位(0~1的值)>>沒有單位 是比率(解釋程度,所能界是的Y的變異輛)
r2.ts = 1 - (SSE/SST)
c(R2train=r2.tr, R2test=r2.ts)## R2train R2test
## 0.2849970 0.2894786
Aggregate data 2000-12-01 ~ 2001~02-28.
#資料再重新切割一次
#把三個月的資料再加起來
#重新開始分析
d0 = max(X$date) + 1
B = X %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(B)## [1] 28531
X %>% mutate(wdate=format(date, "%u")) %>%
group_by(cust) %>% summarise(
weekday = mean(wdate <= 5)
) %>% right_join(B) -> B## Joining, by = "cust"
B = B %>% mutate(
wday = ifelse(weekday == 0, "WEND",
ifelse(weekday == 1, "WDAY", "MIXED")) %>% factor )##
## (0,4] (4,8] (8,12] (12,20] (20,30] (30,Inf]
## MIXED 0.363 0.893 0.969 0.986 0.993 1.000
## WDAY 0.392 0.094 0.026 0.014 0.007 0.000
## WEND 0.245 0.012 0.004 0.000 0.000 0.000
In B, there is a record for each customer. B$Buy is the probability of buying in March.
💡: 預測購買金額時要記得做指數、對數轉換!
B2 = B %>% mutate_at(c("m","rev"), log10)
B$Rev = 10^predict(lm1, B2) #指數再轉回來
#如果這個人來買會買多少錢
#三月份會來買的機率已集會買多少錢## Joining, by = "cust"
## cust weekday r s
## Length:28531 Min. :0.0000 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:0.3333 1st Qu.: 8.00 1st Qu.:39.00
## Mode :character Median :0.6667 Median :21.00 Median :62.00
## Mean :0.6015 Mean :28.84 Mean :57.84
## 3rd Qu.:1.0000 3rd Qu.:43.00 3rd Qu.:82.00
## Max. :1.0000 Max. :90.00 Max. :90.00
## f m rev raw
## Min. : 1.000 Min. : 8.0 Min. : 8 Min. : -686.0
## 1st Qu.: 1.000 1st Qu.: 354.0 1st Qu.: 638 1st Qu.: 66.0
## Median : 2.000 Median : 691.4 Median : 1516 Median : 213.0
## Mean : 3.068 Mean : 986.9 Mean : 2600 Mean : 407.7
## 3rd Qu.: 4.000 3rd Qu.: 1271.6 3rd Qu.: 3304 3rd Qu.: 524.0
## Max. :70.000 Max. :13017.0 Max. :95153 Max. :16146.0
## age area wday Buy
## Length:28531 Length:28531 MIXED:13221 Min. :0.1177
## Class :character Class :character WDAY : 9531 1st Qu.:0.2835
## Mode :character Mode :character WEND : 5779 Median :0.4027
## Mean :0.4656
## 3rd Qu.:0.6165
## Max. :1.0000
## Rev km
## Min. : 80.18 Min. :1.000
## 1st Qu.: 557.31 1st Qu.:1.000
## Median : 822.78 Median :2.000
## Mean : 972.60 Mean :2.601
## 3rd Qu.: 1174.36 3rd Qu.:4.000
## Max. :127567.23 Max. :5.000
#計算顧客終身價值
g = sum(Z$price - Z$cost)/sum(Z$price) # gross margin 0.15
N = 36 # 期數 = 12
d = 0.01 # 月利率 = 1%
B$CLV = g * B$Rev * rowSums(sapply(0:N, function(i) (B$Buy/(1+d))^i ) )
summary(B$CLV)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.3 144.0 231.8 728.9 404.0 611217.7
## # A tibble: 5 x 2
## km GD
## <int> <dbl>
## 1 1 111.
## 2 2 128.
## 3 3 607.
## 4 4 282.
## 5 5 140.
## Buy Rev km CLV
## 1 0.3493214 693.1770 1 174.4133
## 2 0.6064438 866.3059 2 519.5251
## 3 0.9607145 2511.0609 3 9850.6303
## 4 0.3621250 1879.7162 4 525.8552
## 5 0.3575009 864.5956 5 231.5886
## Joining, by = "cust"
######開始規劃行銷企劃
根據每個年齡、每一天的前十個產品的價格與數量變化,做單價與總銷量的表與相關性 顧客的價格敏感度:價格與銷量的斜率(每個商品的價格敏感度都不一樣,所以從個別商品來看)
df = do.call(rbind, lapply(20:1, function(i) { #do.call(rbind把後面的list合併成資料匡
money %>% filter(cat == top20[i]) %>% group_by(km,date) %>% summarise(
unit.price = mean(price/qty),
total.qty = sum(qty)
) %>% summarise(
cor = cor(unit.price, total.qty),
) %>%
mutate(cat = top20[i])
}))越藍:相關性越低 越紅:相關性越高 將年齡與品項的價格敏感程度(相關係數)透過tapply變成矩陣,再做成heatmap
<!iv id="htmlwidget-395ea3a81e8b34f945c8" style="width:672px;height:480px;" class="d3heatmap html-widget">
#install.packages("mosaic")
library(vcd)
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)## Joining, by = "cust"
從馬賽克圖中我們觀察到G2(潛在客群)的消費習慣在某些產品上和其他族群不同 因此我們繼續深入分析這些產品的屬性 並推測這些產品編號可能代表哪些產品
# glimpse(Z)
filter(Z, cat %in% c('110117', '110217','110106','130106','100205','110411','500201')) %>% group_by(cat) %>%
summarise(
no_prod = n_distinct(prod), #產品數
total_freq = n(), #賣出次數
total_qty = sum(qty), #賣出總量
qty_freq = total_qty/total_freq, #每次買幾個
avg_price = sum(price)/sum(qty), #平均價格
avg_cost = sum(cost)/sum(qty), #平均成本
avg_margin = sum(price - cost)/sum(qty)#平均利潤
)## # A tibble: 7 x 8
## cat no_prod total_freq total_qty qty_freq avg_price avg_cost
## <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 100205 275 20394 24553 1.20 49.8 41.6
## 2 110106 7 7990 13327 1.67 17.1 19.6
## 3 110117 60 7591 10241 1.35 58.0 55.4
## 4 110217 36 11047 14325 1.30 154. 160.
## 5 110411 105 12794 21203 1.66 24.6 20.5
## 6 130106 17 7636 10861 1.42 17.6 14.8
## 7 500201 17 11414 16970 1.49 130. 127.
## # ... with 1 more variable: avg_margin <dbl>
#'110117':文具
#'110217':食品類
#'110106':快速消耗衛生品
#'130106':零食
#主要可維持的產品(G2會在我們店買)
#'100205':飾品類
#'110411':飲料
#'500201':酒類
#strengthen可加強的產品(G2不會在我們店買)
#週五part time?(和食物一起銷售)酒類飲品點數加倍送
#假設:客戶喜歡其他家的相同品類商品
#獨特性:推出聯名品牌根據G3喜好產品的屬性 提供抽獎獎品品項
## Joining, by = "tid"
## cat tid cust
## Min. :100101 Min. : 1 Length:617410
## 1st Qu.:110103 1st Qu.:21989 Class :character
## Median :130102 Median :44016 Mode :character
## Mean :282765 Mean :44415
## 3rd Qu.:520202 3rd Qu.:67986
## Max. :780510 Max. :88387
## Joining, by = "cust"
## cat tid cust r
## Min. :100101 Min. : 1 Length:617410 Min. : 1.00
## 1st Qu.:110103 1st Qu.:21989 Class :character 1st Qu.: 4.00
## Median :130102 Median :44016 Mode :character Median :13.00
## Mean :282765 Mean :44415 Mean :22.24
## 3rd Qu.:520202 3rd Qu.:67986 3rd Qu.:37.00
## Max. :780510 Max. :88387 Max. :90.00
## NA's :38474
## s f m
## Min. : 1.00 Min. : 1.00 Min. : 8.0
## 1st Qu.:60.00 1st Qu.: 2.00 1st Qu.: 545.7
## Median :81.00 Median : 4.00 Median : 934.4
## Mean :72.04 Mean : 6.64 Mean : 1280.1
## 3rd Qu.:87.00 3rd Qu.: 8.00 3rd Qu.: 1650.9
## Max. :90.00 Max. :70.00 Max. :13017.0
## NA's :38474 NA's :38474 NA's :38474
XZrsfm_new = XZrsfm %>% filter(cat ==
c("130315","130106","110217","530101","110411",
"130204","130206","120103","560201","500201",
"530101","560201","500201","130315","130204"),
f, m) %>%
group_by(cat) %>% summarize(
frequency = mean(f),
monetary = mean(m)) %>% ungroup %>%
ggplot(aes(y=monetary, x=frequency)) +
geom_point(aes(col=cat), alpha=0.5) +
geom_text(aes(label=cat)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均客單價") + xlab("平均來店率")## Warning in cat == c("130315", "130106", "110217", "530101", "110411",
## "130204", : 較長的物件長度並非較短物件長度的倍數
###估算不同群集顧客的消費週期
#before 8 no money
#activity="soil day"
money$date = as.Date(money$date, format="%m/%d/%Y")
dates = as.Date(c("2000-11-8","2000-12-8","2001-01-8","2001-02-8"))
par(mfcol=c(2,2), cex=0.8)
hist(money$date,'days',freq=T,las=2,main="total")
abline(v=dates, col='red')
hist(money$date[money$km==2],'days',freq=T,las=2,main="potential")
abline(v=dates, col='red')
hist(money$date[money$km==3],'days',freq=T,las=2,ylim=c(0,6000),main="neighbor")
abline(v=dates, col='red')
hist(money$date[money$km==4],'days',freq=T,las=2,ylim=c(0,6000),main="worker")
abline(v=dates, col='red')取銷量前10名商品的名字
##### 成本、效益模擬假設 ##### Loading & Preparing Data
Cost Effect Functions: $$ P_2(x) = L(x|m_{p2},b_{p2},a_{p2}) ; ; ; M_2(x) = L(x|m_{m2},b_{m2},a_{m2}) \
P_3(x) = L(x|m_{p3},b_{p3},a_{p3}) ; ; ; M_3(x) = L(x|m_{m3},b_{m3},a_{m3}) \
P_4(x) = L(x|m_{p4},b_{p4},a_{p4}) ; ; ; M_4(x) = L(x|m_{m4},b_{m4},a_{m4}) \ $$
Customers’ Expected Profit \[E[\pi] = Gm(Rev * Buy)\]
Tool’s Expected Marginal Profit \[E[\pi(x)] = Gm[ Rev(1+\Delta M) * (Buy+\Delta P) - Rev * Buy ] - x\]
##
## 1 2 3 4 5
## 8512 9835 1152 2588 6444
K = data.frame(
DpM = c( 0.20, 0.1, 0.2),
DpB = c( 20, 20, 30),
DpA = c( 20, 25, 30),
DmM = c( 0.10, 0.40, 0.25),
DmB = c( 10, 40, 30),
DmA = c( 30, 40, 20)
)
# rownames(K) = c("T2", "T3","T4")
gm = 0.25
X = seq(10, 200, 2)
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
df = do.call(rbind, lapply(1:3, function(i) {
sapply(X, function(x) {
dp = DP(x,K$DpM[i],K$DpB[i],K$DpA[i])
dp = ifelse(B$Buy[B$km == i+1]+dp>1, 1-B$Buy[B$km == i+1], dp)
dm = DP(x,K$DmM[i],K$DmB[i],K$DmA[i])
eR = gm*(B$Rev[B$km == i+1]*(1+dm) * (B$Buy[B$km == i+1] + dp) - B$Buy[B$km == i+1] *
B$Rev[B$km == i+1]) - x
c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
}))
df %>% gather('key','value',-i,-x) %>%
mutate(Instrument = paste0('I',i+1)) %>%
ggplot(aes(x=x, y=value, col=Instrument)) +
geom_hline(yintercept=0, linetype='dashed', col='blue') +
geom_line(size=1.5,alpha=0.5) +
xlab('工具選項(成本)') + ylab('預期報償') +
ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()利用購物籃分析,找出會帶來高獲利品項銷售的關聯規則。
options(scipen=10, digits=3)
pacman::p_load(arules, arulesViz)
load("C:/BAR_group6/final/data/tf0.rdata")tr做購物籃分析之前,需要將訂單裡面的品項製作成一個transactions物件(tr)
## transactions in sparse format with
## 119422 transactions (rows) and
## 23789 items (columns)
使用arules::apriori()這個方法找出品項間的關聯法則(Association Rules);通常我們會先放寬限制條件,先找一組可能用到的法則。
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.0001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 11
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23789 item(s), 119422 transaction(s)] done [0.47s].
## sorting and recoding items ... [10166 item(s)] done [0.01s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.61s].
## writing ... [9795 rule(s)] done [0.14s].
## creating S4 object ... done [0.04s].
## set of 9795 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8 9
## 1143 3362 2429 1385 874 448 136 18
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 3.00 4.00 3.95 5.00 9.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.00010 Min. :0.250 Min. : 4 Min. : 12
## 1st Qu.:0.00012 1st Qu.:0.433 1st Qu.: 75 1st Qu.: 14
## Median :0.00017 Median :0.632 Median : 264 Median : 20
## Mean :0.00023 Mean :0.638 Mean : 560 Mean : 28
## 3rd Qu.:0.00023 3rd Qu.:0.857 3rd Qu.:1106 3rd Qu.: 27
## Max. :0.00586 Max. :1.000 Max. :5331 Max. :700
##
## mining info:
## data ntransactions support confidence
## tr 119422 0.0001 0.25
然後我們可以設定條件,找到會帶來高營收品項(rhs)的關聯規則(lhs => rhs):
support : lhs品項被購買的基礎機率confidence : lhs品項被購買時rhs被購買的機率lift : lhs品項被購買時,rhs被購買所增加機率的倍數count : 交易筆數(交易筆數如果太少,分析就沒有實質意義)## lhs rhs support confidence lift count
## [1] {4716114000312} => {4716114000329} 0.001273 0.553 231.6 152
## [2] {4716114000329} => {4716114000312} 0.001273 0.533 231.6 152
## [3] {4710154015138} => {4710154015206} 0.000996 0.374 52.1 119
## [4] {4713754987614} => {4713754987607} 0.001139 0.304 78.5 136
## [5] {4713754987607} => {4713754987614} 0.001139 0.294 78.5 136
## [6] {4710011402026} => {4710011402019} 0.002822 0.674 90.2 337
## [7] {4710088414328} => {4710088414311} 0.001792 0.466 86.2 214
## [8] {4710011401142} => {4710011406123} 0.001532 0.413 50.3 183
## [9] {4710085172702} => {4710085172696} 0.002428 0.540 62.0 290
## [10] {4710254049323} => {4710254049521} 0.002010 0.431 55.6 240
## [11] {4710011409056} => {4710011406123} 0.002629 0.414 50.4 314
## [12] {4710011409056} => {4710011401128} 0.004446 0.700 51.0 531
## [13] {4710085120093} => {4710085172696} 0.003743 0.498 57.2 447
## [14] {4710011401135} => {4710011401128} 0.005862 0.753 54.9 700
## [15] {4710011401142,
## 4710011409056} => {4710011401128} 0.001197 0.745 54.3 143
## [16] {4710011401135,
## 4710011401142} => {4710011406123} 0.000888 0.484 59.0 106
## [17] {4710011401135,
## 4710011401142} => {4710011401128} 0.001390 0.758 55.3 166
## [18] {4710011401142,
## 4710011405133} => {4710011401128} 0.001248 0.687 50.1 149
## [19] {4710011401128,
## 4710011401142} => {4710011406123} 0.001013 0.457 55.6 121
## [20] {4710085120093,
## 4710085172702} => {4710085172696} 0.001348 0.654 75.2 161
## [21] {4710085120093,
## 4710085172702} => {4710085120628} 0.001281 0.622 54.7 153
## [22] {4710085172696,
## 4710085172702} => {4710085120628} 0.001491 0.614 54.0 178
## [23] {4710085120628,
## 4710085172702} => {4710085172696} 0.001491 0.605 69.5 178
## [24] {4710011401135,
## 4710011409056} => {4710011406123} 0.001599 0.472 57.5 191
## [25] {4710011401135,
## 4710011409056} => {4710011401128} 0.002721 0.802 58.5 325
## [26] {4710011405133,
## 4710011409056} => {4710011406123} 0.001474 0.493 60.1 176
## [27] {4710011405133,
## 4710011409056} => {4710011401128} 0.002278 0.762 55.6 272
## [28] {4710011406123,
## 4710011409056} => {4710011401128} 0.001993 0.758 55.3 238
## [29] {4710011401128,
## 4710011409056} => {4710011406123} 0.001993 0.448 54.6 238
## [30] {4710085120093,
## 4710085172696} => {4710085120628} 0.002135 0.570 50.2 255
## [31] {4710085120093,
## 4710085120628} => {4710085172696} 0.002135 0.539 61.9 255
## [32] {4710011401135,
## 4710011405133} => {4710011406123} 0.001633 0.444 54.1 195
## [33] {4710011401135,
## 4710011405133} => {4710011401128} 0.002839 0.772 56.3 339
## [34] {4710011401135,
## 4710011406123} => {4710011401128} 0.002453 0.803 58.6 293
## [35] {4710011401128,
## 4710011401135} => {4710011406123} 0.002453 0.419 51.0 293
## [36] {4710011405133,
## 4710011406123} => {4710011401128} 0.002227 0.702 51.2 266
## [37] {4710011401128,
## 4710011405133} => {4710011406123} 0.002227 0.429 52.3 266
## [38] {4710011401135,
## 4710011401142,
## 4710011409056} => {4710011401128} 0.000946 0.856 62.5 113
## [39] {4710085120093,
## 4710085172696,
## 4710085172702} => {4710085120628} 0.000879 0.652 57.4 105
## [40] {4710085120093,
## 4710085120628,
## 4710085172702} => {4710085172696} 0.000879 0.686 78.8 105
## [41] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.000996 0.527 64.2 119
## [42] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011401128} 0.001574 0.832 60.7 188
## [43] {4710011401135,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.001340 0.838 61.1 160
## [44] {4710011401128,
## 4710011401135,
## 4710011409056} => {4710011406123} 0.001340 0.492 60.0 160
## [45] {4710011405133,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.001164 0.790 57.6 139
## [46] {4710011401128,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.001164 0.511 62.3 139
## [47] {4710011401135,
## 4710011405133,
## 4710011406123} => {4710011401128} 0.001357 0.831 60.6 162
## [48] {4710011401128,
## 4710011401135,
## 4710011405133} => {4710011406123} 0.001357 0.478 58.2 162
## [49] {4710011401135,
## 4710011405133,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.000862 0.866 63.1 103
## [50] {4710011401128,
## 4710011401135,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.000862 0.548 66.8 103
## Joining, by = "cust"
以族群3為例,相較其他族群來說最常購買前三個,在購買其他產品時,也會推薦相對應關聯性高的其他產品,以達到精準個人化的購物體驗