library(readr)
library(dplyr)
library(purrr)
library(ggplot2)
library(lubridate)
library(pROC)
這一份資料是從2000/11 ~ 2001/2的交易資料
TRANSACTION_DT: 交易日期CUSTOMER_ID: 顧客IDAGE_GROUP: 年齡分群,從<25 ~ >65,5歲為一區隔PIN_CODE: 地區編碼PRODUCT_SUBCLASS: 產品類別PRODUCT_ID: 產品IDAMOUNT: 購買產品數量ASSET: 成本SALES_PRICE: 交易金額data <- read_csv("data/ta_feng_all_months_merged.csv") %>%
setNames(c("date", "customer", "age", "area", "category", "product", "qty", "cost", "price"))
## Parsed with column specification:
## cols(
## TRANSACTION_DT = col_character(),
## CUSTOMER_ID = col_character(),
## AGE_GROUP = col_character(),
## PIN_CODE = col_character(),
## PRODUCT_SUBCLASS = col_integer(),
## PRODUCT_ID = col_character(),
## AMOUNT = col_integer(),
## ASSET = col_integer(),
## SALES_PRICE = col_integer()
## )
將資料讀取進來,並將colnames換成較簡易的名稱。
先看個欄位的NA值有多少,只有AGE_GRUOP有22362個NA,大概佔所有 observation 2%左右。 可以有兩種處理方式,第一種把NA值全部拿掉,第二個預測NA的可能族群
sapply(data, function(x) sum(is.na(x)))
## date customer age area category product qty cost
## 0 0 22362 0 0 0 0 0
## price
## 0
將date格式從char轉為Date,age group的na用文字unknown標籤取代
data$date <- mdy(data$date)
data$age[is.na(data$age)] <- "Unknown"
data$age <- factor(data$age, levels = c("Unknown", "<25", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", ">65"), labels = c("Unknown", "a20", "a25", "a30", "a35", "a40", "a45", "a50", "a55", "a60", "a65")) %>% as.character()
summary(data)
## date customer age
## Min. :2000-11-01 Length:817741 Length:817741
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area category product qty
## Length:817741 Min. :100101 Length:817741 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284950 Mean : 1.382
## 3rd Qu.:520314 3rd Qu.: 1.000
## Max. :780510 Max. :1200.000
## cost price
## Min. : 0.0 Min. : 1.0
## 1st Qu.: 35.0 1st Qu.: 42.0
## Median : 62.0 Median : 76.0
## Mean : 112.1 Mean : 131.9
## 3rd Qu.: 112.0 3rd Qu.: 132.0
## Max. :432000.0 Max. :444000.0
qty、cost、price找這三欄outlier threshold值會是多少
sapply(data[, 7:9], quantile, prob = c(.99, .999, .9995))
## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
超過99.95%的資料我們不要
grocery <- subset(data, qty <= 24 & cost <= 3799 & price <= 3999)
假設同一天同一位顧客當天的交易都是同一筆,並賦予交易序號
grocery$tid <- group_indices(grocery, date, customer) # same customer same day
head(arrange(grocery, tid), 10)
## # A tibble: 10 x 10
## date customer age area category produ… qty cost price tid
## <date> <chr> <chr> <chr> <int> <chr> <int> <int> <int> <int>
## 1 2000-11-01 00038317 a65 115 130315 47149… 2 56 48 1
## 2 2000-11-01 00038317 a65 115 120105 47190… 1 28 28 1
## 3 2000-11-01 00045902 a55 115 100304 47101… 1 24 28 2
## 4 2000-11-01 00045902 a55 115 130204 47100… 1 114 119 2
## 5 2000-11-01 00045902 a55 115 100511 47105… 6 210 313 2
## 6 2000-11-01 00045902 a55 115 100113 47102… 1 112 95 2
## 7 2000-11-01 00045957 a50 115 110217 47102… 1 180 133 3
## 8 2000-11-01 00046855 a35 115 110401 47100… 1 43 39 4
## 9 2000-11-01 00046855 a35 115 110117 47100… 1 77 89 4
## 10 2000-11-01 00046855 a35 115 110411 47100… 3 51 57 4
處理完後檢視看看資料
grocery %>%
select(customer, category, product, tid) %>%
sapply(n_distinct)
## customer category product tid
## 32256 2007 23789 119422
summary(grocery)
## date customer age
## Min. :2000-11-01 Length:817181 Length:817181
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area category product qty
## Length:817181 Min. :100101 Length:817181 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284784 Mean : 1.358
## 3rd Qu.:520311 3rd Qu.: 1.000
## Max. :780510 Max. :24.000
## cost price tid
## Min. : 0.0 Min. : 1.0 Min. : 1
## 1st Qu.: 35.0 1st Qu.: 42.0 1st Qu.: 28783
## Median : 62.0 Median : 76.0 Median : 59391
## Mean : 106.2 Mean : 125.5 Mean : 58845
## 3rd Qu.: 112.0 3rd Qu.: 132.0 3rd Qu.: 87391
## Max. :3798.0 Max. :3999.0 Max. :119422
grocery %>%
group_by(age) %>%
summarize(count = n()) %>%
ggplot(aes(x = age, y = count, fill = age)) +
geom_bar(stat = "identity")
前五名的商品
top6product <- grocery %>%
count(product) %>%
arrange(desc(n)) %>%
top_n(6)
## Selecting by n
grocery <- grocery %>%
mutate(unitPrice = round(price / qty))
grocery %>%
semi_join(top6product, by = "product") %>%
group_by(date, product) %>%
summarize(unitPrice = mean(unitPrice),
sum_qty = sum(qty)) %>%
arrange(date, product) %>%
ggplot(aes(date, unitPrice, color = factor(product))) +
geom_line() +
geom_line(aes(x = date, y = sum_qty), linetype = "dotdash") +
facet_wrap(~ product, scales = "free_y") +
theme(legend.position = "none")
transaction <- grocery %>%
group_by(tid) %>%
summarise(
date = date[1], # 交易日期
customer = customer[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
tcount = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
items = n_distinct(category),# 產品種類數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
)
建立完交易資料後,一樣處理outlier。
sapply(transaction[, 6:9], quantile, prob = c(.999, .9995, .9999))
## tcount pieces items total
## 99.9% 54 81.0000 42.0000 9009.579
## 99.95% 62 94.2895 47.0000 10611.579
## 99.99% 82 133.0000 61.1158 16044.401
transaction <- subset(transaction, tcount <= 62 & pieces < 95 & total < 16000) # 119328
par(cex = 0.8)
hist(transaction$date, "weeks", freq = T, las = 2, col = "lightblue", main = "No. Transaction per Week")
library(d3heatmap)
table(transaction$age, format(transaction$date, "%u")) %>%
{./rowSums(.)} %>%
as.data.frame.matrix() %>%
d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))
可以看出消費頻率比較高的集中在週日與30~40歲壯年人口,整體來說假日的消費頻率比較高。
table(transaction$area, format(transaction$date, "%u")) %>%
{./rowSums(.)} %>%
as.data.frame.matrix() %>%
d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))
table(transaction$age, transaction$area) %>%
{./rowSums(.)} %>%
as.data.frame.matrix() %>%
d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))
transaction %>%
group_by(area, date) %>%
summarize(num_tran = n()) %>%
ggplot(aes(x = date, y = num_tran)) +
geom_bar(aes(x = date, y = num_tran, color = area), stat = "identity", alpha = 0.8) +
facet_wrap(~area) +
labs(y = "# Transactions", x = "Date")
我們假設今天為拿到資料的隔天2001/3/1要來分析與做行銷策略。
today <- max(grocery$date) + 1
customer <- transaction %>%
mutate(days = as.integer(today - date)) %>%
group_by(customer) %>%
summarise(
recency = min(days), # 最近一次購買距今天數
senior = max(days), # 第一次購買距今天數
coming = as.integer(!recency == senior), # 第一次來買有沒有再來買一次
freq = n(), # 消費次數
money = round(mean(total), 2), # 平均購買金額
revenue = sum(total), # 公司在他身上拿了多少
net = sum(gross), # 公司淨賺他多少
items = items[1], # 不同產品種類數
age = age[1],
area = area[1],
since = min(date) # 第一次購買日期
) %>%
arrange(customer)
視覺化RFM,看分佈
par(mfrow = c(2, 2))
hist(customer$recency, main = "recency")
hist(log10(customer$freq), main = "log_frequency")
hist(log10(customer$money), main = "log_money")
hist(customer$senior, main = "senior")
map(list("grocery" = grocery, "transaction" = transaction, "customer" = customer), function(x) colSums(is.na(x)))
## $grocery
## date customer age area category product qty
## 0 0 0 0 0 0 0
## cost price tid unitPrice
## 0 0 0 0
##
## $transaction
## tid date customer age area tcount pieces items
## 0 0 0 0 0 0 0 0
## total gross
## 0 0
##
## $customer
## customer recency senior coming freq money revenue net
## 0 0 0 0 0 0 0 0
## items age area since
## 0 0 0 0
先用階層式分群來分析顧客資料
linkage algo 用 complete方式
#rfm_dist <- dist(scale(customer[, c("recency", "coming", "freq", "money", "items")]))
#rfm_hc <- hclust(rfm_dist, method = "complete");rm(rfm_dist)
load("rfm_hc.RData")
plot(rfm_hc)
rfm_cluster <- cutree(rfm_hc, h = 11)
table(rfm_cluster)
## rfm_cluster
## 1 2 3 4
## 29466 2612 103 60
customer <- mutate(customer, cluster = rfm_cluster)
先分4群看看,看起來會是比較好的結果,接下來畫RFM以及分群結果
group_by(customer, cluster) %>%
summarise(
recency = mean(recency),
freq = mean(freq),
money = mean(money),
size = n()
) %>%
mutate(revenue = size * money / 1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x = freq, y = money)) +
geom_point(aes(size = revenue, col = recency), alpha=0.5) +
scale_size(range = c(4, 40)) +
scale_color_gradient(low = "green", high = "red") +
scale_x_log10() + scale_y_log10() +
geom_text(aes(label = size )) +
theme_bw() + guides(size = F) +
labs(title = "Customer Segements",
subtitle = "(bubble_size:revenue_contribution; text:group_size)",
color = "Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
接下來我們好奇這4群顧客變數分別平均會是多少,看他們的特性。
customer %>%
select(-c(customer, age, area, since)) %>%
group_by(cluster) %>%
summarise_all(funs(mean(.)))
## # A tibble: 4 x 9
## cluster recency senior coming freq money revenue net items
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 36.7 81.4 0.650 3.68 792. 2777. 416. 5.77
## 2 2 46.7 72.9 0.419 2.18 3121. 6174. 1074. 18.6
## 3 3 2.28 118. 1 49.6 593. 30062. 4274. 5.25
## 4 4 54.0 62.0 0.15 1.27 7734. 9435. 1773. 29.1
par(cex = 0.7)
customer %>%
select(-c(1, 4, 10, 11, 12, 13)) %>%
mutate_all(scale) %>%
split(customer$cluster) %>%
sapply(colMeans) %>%
barplot(beside = T, col = rainbow(7))
legend('topleft',legend=colnames(customer[, -c(1, 4, 10, 11, 12, 13)]), fill = rainbow(7))
tot_withinss <- map_dbl(1:10, function(k){
model <- kmeans(x = customer %>% select(-c("customer", "age", "area", "since", "cluster")), centers = k)
model$tot.withinss
})
elbow_df <- data.frame(
k = 1:10 ,
tot_withinss = tot_withinss
)
# Plot the elbow plot
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
geom_line() +
scale_x_continuous(breaks = 1:10)
由上圖可知,k=3或4模型的複雜度增加沒有顯著增長,最好的分群方式會是分3或4群為主,將資料用kmeans model分群。
rfm_km.3 <- customer %>%
select(-c("customer", "age", "area", "since", "cluster")) %>%
kmeans(centers = 3)
table(rfm_km.3$cluster)
##
## 1 2 3
## 717 25238 6286
customer %>%
mutate(cluster = rfm_km.3$cluster) %>%
group_by(cluster) %>%
summarise(
recency = mean(recency),
freq = mean(freq),
money = mean(money),
size = n()
) %>%
mutate(revenue = size * money / 1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x = freq, y = money)) +
geom_point(aes(size = revenue, col = recency), alpha=0.5) +
scale_size(range = c(4,40)) +
scale_color_gradient(low="green", high="red") +
scale_x_log10() + scale_y_log10() +
geom_text(aes(label = size )) +
theme_bw() + guides(size = F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
rfm_km.4 <- customer %>%
select(-c("customer", "age", "area", "since", "cluster")) %>%
kmeans(centers = 4)
table(rfm_km.4$cluster)
##
## 1 2 3 4
## 1775 175 22659 7632
customer %>%
mutate(cluster = rfm_km.4$cluster) %>%
group_by(cluster) %>%
summarise(
recency = mean(recency),
freq = mean(freq),
money = mean(money),
size = n()
) %>%
mutate(revenue = size * money / 1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x = freq, y = money)) +
geom_point(aes(size = revenue, col = recency), alpha=0.5) +
scale_size(range = c(4,40)) +
scale_color_gradient(low="green", high="red") +
scale_x_log10() + scale_y_log10() +
geom_text(aes(label = size )) +
theme_bw() + guides(size = F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
set.seed(2020)
rfm_km.7 <- customer %>%
select(-c("customer", "age", "area", "since", "cluster")) %>%
kmeans(centers = 7, nstart = 20)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1612050)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1612050)
table(rfm_km.7$cluster)
##
## 1 2 3 4 5 6 7
## 875 30 8761 4684 15407 246 2238
customer %>%
mutate(cluster = rfm_km.7$cluster) %>%
group_by(cluster) %>%
summarise(
recency = mean(recency),
freq = mean(freq),
money = mean(money),
size = n()
) %>%
mutate(revenue = size * money / 1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x = freq, y = money)) +
geom_point(aes(size = revenue, col = recency), alpha=0.5) +
scale_size(range = c(4,40)) +
scale_color_gradient(low="green", high="red") +
scale_x_log10() + scale_y_log10() +
geom_text(aes(label = size )) +
theme_bw() + guides(size = F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
將顧客分結果join回去交易資料
customer_group <- customer %>% select(customer, cluster)
transaction_cluster <- transaction %>% left_join(customer_group, by = "customer")
grocery_cluster <- grocery %>% left_join(customer_group, by = "customer")
觀看分群顧客最喜歡哪些產品,能帶來較大獲利
grocery_cluster %>%
filter(!is.na(cluster)) %>%
mutate(gross = price - cost) %>%
group_by(cluster, product) %>%
summarize(
tcount = n(),
sum_total = sum(price),
sum_gross = sum(gross)
) %>%
arrange(desc(sum_gross)) %>%
top_n(5) %>%
ungroup() %>%
ggplot(aes(x = product, y = sum_gross)) +
geom_col(aes(fill = product), alpha = 0.8) +
facet_wrap(~ cluster, scales = "free") +
theme(axis.text.x = element_text(angle = 25))
## Selecting by sum_gross
table(customer$age, customer$cluster) %>%
{./rowSums(.)} %>%
as.data.frame.matrix() %>%
d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))
取出2001-02-01之前的資料,然後做跟上面一樣的事情
grocery.x <- subset(grocery, date < ymd("2001-02-01"))
tail(grocery.x, 10)
## # A tibble: 10 x 11
## date customer age area category product qty cost price tid
## <date> <chr> <chr> <chr> <int> <chr> <int> <int> <int> <int>
## 1 2001-01-31 01564846 a35 221 130208 471313… 1 61 79 87915
## 2 2001-01-31 01813319 a30 Othe… 100514 471062… 1 138 147 88087
## 3 2001-01-31 01866926 a35 110 100414 471126… 1 105 139 88149
## 4 2001-01-31 01469271 a40 115 530301 471018… 1 171 204 87878
## 5 2001-01-31 02000558 a25 115 100507 471005… 2 32 42 88230
## 6 2001-01-31 02005409 a25 221 560402 871204… 3 1959 1980 88235
## 7 2001-01-31 01759372 a20 115 100324 471070… 1 160 189 88052
## 8 2001-01-31 02107356 a40 115 100309 471074… 1 79 95 88315
## 9 2001-01-31 01804744 a35 115 300711 471148… 1 32 48 88082
## 10 2001-01-31 01846683 Unkn… Unkn… 500501 205438… 1 78 72 88120
## # ... with 1 more variable: unitPrice <dbl>
transaction.x <- grocery.x %>%
group_by(tid) %>%
summarise(
date = date[1], # 交易日期
customer = customer[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
tcount = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
items = n_distinct(category),# 產品種類數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
)
sapply(transaction.x[, 6:10], quantile, prob = c(.999, .9995, .9999))
## tcount pieces items total gross
## 99.9% 56.0000 84.0000 43 9378.684 1883.228
## 99.95% 64.0000 98.0000 49 11261.751 2317.087
## 99.99% 85.6456 137.6456 66 17699.325 3389.646
transaction.x <- subset(transaction.x, tcount <= 64 & pieces <= 98 & total <= 11261)
max_date <- max(transaction.x$date)
customer.x <- transaction.x %>%
mutate(days = as.integer(max_date - date)) %>%
group_by(customer) %>%
summarise(
recency = min(days), # 最近一次購買距今天數
senior = max(days), # 第一次購買距今天數
coming = as.integer(!recency == senior), # 第一次來買有沒有再來買一次
freq = n(), # 消費次數
money = round(mean(total), 2), # 平均購買金額
revenue = sum(total), # 公司在他身上拿了多少
net = sum(gross), # 公司淨賺他多少
items = items[1], # 不同產品種類數
age = age[1],
area = area[1],
since = min(date) # 第一次購買日期
) %>%
arrange(customer)
Y: 2月是否有購買
先準備每個顧客2月的購買金額,若沒有則為NA。
feb_customer <- transaction %>%
filter(date >= ymd("2001-02-01")) %>%
group_by(customer) %>%
summarize(feb_amount = sum(total))
將2月顧客的消費金額left_join回去11~1月的顧客資料,若2月沒有購買feb_amount會顯示NA,新增一個欄位buy是2月有購買不是NA為TRUE。
customer.x <- left_join(customer.x, feb_customer, by = "customer")
customer.x$buy <- !is.na(customer.x$feb_amount)
tapply(customer.x$buy, customer.x$age, mean) %>%
barplot(xlab = "Age", ylab = "Mean of Buy")
abline(h = mean(customer.x$buy), col = 'red')
tapply(customer.x$buy, customer.x$area, mean) %>%
barplot(xlab = "Area", ylab = "Mean of Buy")
abline(h = mean(customer.x$buy), col = 'red')
因為transaction.x處理過outlier,確保原grocery.x的資料沒有outlier。
grocery.x <- filter(grocery.x, customer %in% customer.x$customer)
library(caTools)
set.seed(2019)
spl.x <- sample.split(customer.x$buy, SplitRatio = 0.7)
c(nrow(customer.x), sum(spl.x), sum(!spl.x))
## [1] 28584 20008 8576
檢查training&testing分佈是否一致
cbind(customer.x, spl.x) %>%
ggplot(aes(x = log(feb_amount), fill = spl.x)) +
geom_density(alpha = 0.5)
## Warning: Removed 15342 rows containing non-finite values (stat_density).
buy_train <- customer.x[spl.x, ]
buy_test <- customer.x[!spl.x, ]
buy.model <- glm(buy ~ money + recency * freq + items, buy_train, family = binomial)
summary(buy.model)
##
## Call:
## glm(formula = buy ~ money + recency * freq + items, family = binomial,
## data = buy_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3823 -0.8991 -0.7509 1.0989 1.8540
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.057e+00 5.063e-02 -20.872 < 2e-16 ***
## money -9.185e-05 2.141e-05 -4.289 1.79e-05 ***
## recency -4.046e-03 1.111e-03 -3.641 0.000272 ***
## freq 4.492e-01 1.434e-02 31.334 < 2e-16 ***
## items 7.315e-03 3.409e-03 2.146 0.031881 *
## recency:freq -1.739e-03 5.407e-04 -3.216 0.001298 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23360 on 20002 degrees of freedom
## AIC: 23372
##
## Number of Fisher Scoring iterations: 6
套用logistic function發現只有revenue不顯著。
混淆矩陣
buy.pred <- predict(buy.model, buy_test, type = "response")
cm <- table(actual = buy_test$buy, predict = buy.pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3830 773
## TRUE 1839 2134
cm %>% {sum(diag(.))/sum(.)}
## [1] 0.6954291
ROC AUC=0.7334
ROC <- roc(buy_test$buy, buy.pred)
plot(ROC, col = "red")
auc(ROC)
## Area under the curve: 0.7443
buy_train$buy <- buy_train$buy %>% as.integer() %>% as.factor()
buy_test$buy <- buy_test$buy %>% as.integer() %>% as.factor()
library(rpart)
buy_tree <- rpart(buy ~ ., data = buy_train[, -c(1, 10, 11, 12, 13)], method = "class", control = rpart.control(cp = 0, maxdepth = 10))
library(rpart.plot)
# rpart.plot(buy_tree_pruned)
rpart.plot(buy_tree, type = 3, box.palette = c("red", "green"), fallen.leaves = TRUE)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
buy.tree.pred <- predict(buy_tree, buy_test, type = "class")
table(buy.tree.pred, buy_test$buy)
##
## buy.tree.pred 0 1
## 0 3588 1684
## 1 1015 2289
mean(buy.tree.pred == buy_test$buy)
## [1] 0.6852845
修剪決策樹,發現準確度有微微上升至70%
buy_tree_pruned <- prune(buy_tree, cp = 0.0015)
buy_test$pred <- predict(buy_tree_pruned, buy_test, type = "class")
mean(buy_test$pred == buy_test$buy)
## [1] 0.7017257
試試看隨機森林的準確度
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
buy_forest <- randomForest(buy ~ ., data = buy_train[, -c(1, 10, 11, 12, 13)])
buy_test$pred <- predict(buy_forest, buy_test, type = "class")
mean(buy_test$pred == buy_test$buy)
## [1] 0.6983442
只篩選有回購的顧客,並對金額取log10
customer.y <- subset(customer.x, buy) %>%
mutate(money = money + 1, revenue = revenue + 1, feb_amount = feb_amount + 1) %>%
mutate_at(c("money", "revenue", "feb_amount"), log10)
spl.y <- sample(nrow(customer.y), nrow(customer.y) * 0.7)
amount_train <- customer.y[spl.y, ]
amount_test <- customer.y[-spl.y, ]
lm.model <- lm(feb_amount ~ money + recency * freq + revenue, amount_train)
summary(lm.model)
##
## Call:
## lm(formula = feb_amount ~ money + recency * freq + revenue, data = amount_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94471 -0.23435 0.04971 0.28524 1.60731
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1825476 0.0350875 33.703 < 2e-16 ***
## money 0.3917323 0.0322272 12.155 < 2e-16 ***
## recency 0.0020201 0.0003540 5.707 1.18e-08 ***
## freq 0.0222892 0.0016726 13.326 < 2e-16 ***
## revenue 0.1674661 0.0305376 5.484 4.27e-08 ***
## recency:freq -0.0006890 0.0001207 -5.709 1.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4274 on 9263 degrees of freedom
## Multiple R-squared: 0.271, Adjusted R-squared: 0.2706
## F-statistic: 688.8 on 5 and 9263 DF, p-value: < 2.2e-16
plot(amount_train$feb_amount, predict(lm.model), col = 'pink', cex = 0.65)
abline(0, 1,col='red')
篩選出2000-12-01~20001-02-28
B <- transaction %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(today, date, units = "days"))) %>%
group_by(customer) %>%
summarise(
recency = min(days), # 最近一次購買距今天數
senior = max(days), # 第一次購買距今天數
coming = as.integer(!recency == senior), # 第一次來買有沒有再來買一次
freq = n(), # 消費次數
money = round(mean(total), 2), # 平均購買金額
revenue = sum(total), # 公司在他身上拿了多少
net = sum(gross), # 公司淨賺他多少
items = items[1], # 不同產品種類數
age = age[1],
area = area[1],
since = min(date) # 第一次購買日期
)
nrow(B) #28531
## [1] 28531
預測3月購買機率與金額
B$buy <- predict(buy.model, B, type="response")
B2 <- B %>% mutate_at(c("money","revenue"), log10)
B$rev <- 10 ^ predict(lm.model, B2); rm(B2)
par(mfrow = c(1,2), cex = 0.8)
hist(B$buy)
hist(log(B$rev,10))
effect = .5 # 回購機率
cost = 200 # 成本假設
Target <- B %>% inner_join(customer_group, by = "customer")
Target$ExpReturn <- (effect - Target$buy) * Target$rev - cost
Target %>%
filter(Target$ExpReturn > 0) %>%
group_by(cluster) %>%
summarise(
No.Target = n(),
AvgROI = mean(ExpReturn),
TotalROI = sum(ExpReturn)
) %>%
filter(cluster == 4)
## # A tibble: 1 x 4
## cluster No.Target AvgROI TotalROI
## <int> <int> <dbl> <dbl>
## 1 4 46 607. 27919.
target4 <- filter(Target, cluster == 4)
random <- sample(1:nrow(target4), nrow(target4))
c("no.target" = round(length(random)),
"AvgROI" = mean(target4[random, ]$rev - 100),
"TotalROI" = sum(target4[random, ]$rev - 100))
## no.target AvgROI TotalROI
## 48.000 2679.282 128605.553
group3 <- Target %>%
filter(cluster == 3) %>%
mutate(ExpReturn = rev * 1.2 * 0.5 - 150) %>%
group_by(cluster) %>%
summarize(
No.Target = round(n() * 0.5),
AvgROI = mean(ExpReturn),
TotalROI = sum(ExpReturn)
)