Zrm(list=ls(all=T))
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr, vcd, Matrix,plotly)Z = read_csv("data/ta_feng_all_months_merged.csv") %>%
data.frame %>% setNames(c(
"date","cust","age","area","cat","prod","qty","cost","price"))##
## -- Column specification --------------------------------------------------------
## cols(
## TRANSACTION_DT = col_character(),
## CUSTOMER_ID = col_character(),
## AGE_GROUP = col_character(),
## PIN_CODE = col_character(),
## PRODUCT_SUBCLASS = col_double(),
## PRODUCT_ID = col_character(),
## AMOUNT = col_double(),
## ASSET = col_double(),
## SALES_PRICE = col_double()
## )
nrow(Z)## [1] 817741
四個月的交易總數
Z$date = as.Date(Z$date, format="%m/%d/%Y")
par(cex=0.8)
#hist(Z$date,'weeks',freq=T,las=2)age.group = c("<25","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)#par(mfrow=c(1,2),cex=0.7)
#table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
#table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2)# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z) ## [1] 817182
把每一天、每一為顧客的交易項目彙總為一張訂單
Z$tid = group_indices(Z, date, cust) # same customer same day## Warning: The `...` argument of `group_keys()` is deprecated as of dplyr 1.0.0.
## Please `group_by()` first
# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)## cust cat prod tid
## 32256 2007 23789 119422
XX = Z %>% group_by(tid) %>% summarise(
date = min(date), # 交易日期
cust = min(cust), # 顧客 ID
age = min(age), # 顧客 年齡級別
area = min(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X) # 119422 ## [1] 119422
# Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
# Remove Outliers
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328summary(X) ## tid date cust age
## Min. : 1 Min. :2000-11-01 Length:119328 Length:119328
## 1st Qu.: 29855 1st Qu.:2000-11-29 Class :character Class :character
## Median : 59705 Median :2001-01-01 Mode :character Mode :character
## Mean : 59712 Mean :2000-12-31
## 3rd Qu.: 89581 3rd Qu.:2001-02-02
## Max. :119422 Max. :2001-02-28
## area items pieces total
## Length:119328 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 227.0
## Mode :character Median : 5.000 Median : 6.000 Median : 510.0
## Mean : 6.802 Mean : 9.222 Mean : 851.6
## 3rd Qu.: 9.000 3rd Qu.:12.000 3rd Qu.: 1080.0
## Max. :62.000 Max. :94.000 Max. :15345.0
## gross
## Min. :-1645.0
## 1st Qu.: 21.0
## Median : 68.0
## Mean : 130.9
## 3rd Qu.: 168.0
## Max. : 3389.0
par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")Ad0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>% group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = min(age), # age group
area = min(area), # area code
) %>% data.frame
nrow(A) # 32241## [1] 32241
par(mfrow=c(1,2),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2) > 顧客大多集中在南港及汐止,且消費族群多為30~45歲
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)## cust tid items
## 32241 119328 817182
共32241位顧客,119328筆交易資料
#畫出年齡與地區兩類別所形成的馬賽克圖。以z115地區為例:跟所有地區年齡分佈相比,住在z115地區的人,24與49歲以上的人顯著的多;34~39歲的人顯著得少。
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~area+age, A0)#從這張泡泡圖可以推測,44歲、49歲為主力顧客
A0 %>% filter(age!="a99") %>% # 濾掉沒有年齡資料的顧客('a99')
group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Days = mean(r), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(x=avg.Days, y=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均客單價") + xlab("近期消費") -> aa
ggplotly(aa)我這邊想分四群來進行分析
cats = Z0 %>% group_by(cat) %>% summarise(
noProd = n_distinct(prod),#每類產品的編號(可看有幾類產品)
totalQty = sum(qty),#每一個類別的產品的售出總數量
totalRev = sum(price),#獲利
totalGross = sum(price) - sum(cost),#毛利
grossMargin = totalGross/totalRev,#毛利率
avgPrice = totalRev/totalQty#平均售價
)#畫出營收貢獻最大的前40個品類
par(mfrow=c(2,1), cex=0.7)
cats$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>%
barplot(names=1:40,las=2,main="acc. percentage of reveune")
abline(h=seq(0,1,0.1),col='dark red')
cats$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>%
barplot(names=1:40,las=2,main="acc. percentage of gross profit")
abline(h=seq(0,1,0.1),col='dark orange')top20 = tapply(Z0$qty,Z0$cat,sum) %>% sort %>% tail(20) %>% names
#跟整體顧客的購買習慣相比,39歲的族群購買100205、100312、560201這幾類顯著的多
#意思就是可以從這張圖看到各年齡(族群)的偏好購買類別
MOSA(~age+cat, Z0[Z0$cat %in% top20,])#畫出週間的交易數量
X0$wday = format(X0$date, "%u")
par(cex=0.7, mar=c(2,3,2,1))
table(X0$wday) %>% barplot(main="No. Transactions in Week Days")#從這張圖可以看出各年齡層喜歡在星期幾去做消費
#跟所有年齡的顧客相比,34與39歲的顧客喜歡在禮拜日買東西
MOSA(~age+wday, X0)#可以看出哪個類別在平日賣得比較好,哪些在假日賣得比較好,以訂定行銷策略
df = Z0 %>% filter(cat %in% top20) %>% mutate(wday = format(date, '%u'))
MOSA(~cat+wday, df) >可以分群後再畫一次