R商業分析期末報告

期末影片

https://www.youtube.com/watch?v=t9VmmugshPE&feature=youtu.be

載入套件

library(readr)
library(dplyr)
library(purrr)
library(ggplot2)
library(lubridate)
library(pROC)

讀取資料

這一份資料是從2000/11 ~ 2001/2的交易資料

TRANSACTION_DT: 交易日期
CUSTOMER_ID: 顧客ID
AGE_GROUP: 年齡分群，從<25 ~ >65，5歲為一區隔
PIN_CODE: 地區編碼
PRODUCT_SUBCLASS: 產品類別
PRODUCT_ID: 產品ID
AMOUNT: 購買產品數量
ASSET: 成本
SALES_PRICE: 交易金額

data <- read_csv("data/ta_feng_all_months_merged.csv") %>% 
  setNames(c("date", "customer", "age", "area", "category", "product", "qty", "cost", "price"))

## Parsed with column specification:
## cols(
##   TRANSACTION_DT = col_character(),
##   CUSTOMER_ID = col_character(),
##   AGE_GROUP = col_character(),
##   PIN_CODE = col_character(),
##   PRODUCT_SUBCLASS = col_integer(),
##   PRODUCT_ID = col_character(),
##   AMOUNT = col_integer(),
##   ASSET = col_integer(),
##   SALES_PRICE = col_integer()
## )

將資料讀取進來，並將colnames換成較簡易的名稱。

Data Cleaning

格式處理

先看個欄位的NA值有多少，只有AGE_GRUOP有22362個NA，大概佔所有 observation 2%左右。可以有兩種處理方式，第一種把NA值全部拿掉，第二個預測NA的可能族群

sapply(data, function(x) sum(is.na(x)))

##     date customer      age     area category  product      qty     cost 
##        0        0    22362        0        0        0        0        0 
##    price 
##        0

將date格式從char轉為Date，age group的na用文字unknown標籤取代

data$date <- mdy(data$date)
data$age[is.na(data$age)] <- "Unknown"
data$age <- factor(data$age, levels = c("Unknown", "<25", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", ">65"), labels = c("Unknown", "a20", "a25", "a30", "a35", "a40", "a45", "a50", "a55", "a60", "a65")) %>% as.character()
summary(data)

##       date              customer             age           
##  Min.   :2000-11-01   Length:817741      Length:817741     
##  1st Qu.:2000-11-28   Class :character   Class :character  
##  Median :2001-01-01   Mode  :character   Mode  :character  
##  Mean   :2000-12-30                                        
##  3rd Qu.:2001-01-30                                        
##  Max.   :2001-02-28                                        
##      area              category        product               qty          
##  Length:817741      Min.   :100101   Length:817741      Min.   :   1.000  
##  Class :character   1st Qu.:110106   Class :character   1st Qu.:   1.000  
##  Mode  :character   Median :130106   Mode  :character   Median :   1.000  
##                     Mean   :284950                      Mean   :   1.382  
##                     3rd Qu.:520314                      3rd Qu.:   1.000  
##                     Max.   :780510                      Max.   :1200.000  
##       cost              price         
##  Min.   :     0.0   Min.   :     1.0  
##  1st Qu.:    35.0   1st Qu.:    42.0  
##  Median :    62.0   Median :    76.0  
##  Mean   :   112.1   Mean   :   131.9  
##  3rd Qu.:   112.0   3rd Qu.:   132.0  
##  Max.   :432000.0   Max.   :444000.0

處理outlier

qty、cost、price找這三欄outlier threshold值會是多少

sapply(data[, 7:9], quantile, prob = c(.99, .999, .9995))

##        qty   cost   price
## 99%      6  858.0 1014.00
## 99.9%   14 2722.0 3135.82
## 99.95%  24 3799.3 3999.00

超過99.95%的資料我們不要

grocery <- subset(data, qty <= 24 & cost <= 3799 & price <= 3999)

假設同一天同一位顧客當天的交易都是同一筆，並賦予交易序號

grocery$tid <-  group_indices(grocery, date, customer) # same customer same day
head(arrange(grocery, tid), 10)

## # A tibble: 10 x 10
##    date       customer age   area  category produ…   qty  cost price   tid
##    <date>     <chr>    <chr> <chr>    <int> <chr>  <int> <int> <int> <int>
##  1 2000-11-01 00038317 a65   115     130315 47149…     2    56    48     1
##  2 2000-11-01 00038317 a65   115     120105 47190…     1    28    28     1
##  3 2000-11-01 00045902 a55   115     100304 47101…     1    24    28     2
##  4 2000-11-01 00045902 a55   115     130204 47100…     1   114   119     2
##  5 2000-11-01 00045902 a55   115     100511 47105…     6   210   313     2
##  6 2000-11-01 00045902 a55   115     100113 47102…     1   112    95     2
##  7 2000-11-01 00045957 a50   115     110217 47102…     1   180   133     3
##  8 2000-11-01 00046855 a35   115     110401 47100…     1    43    39     4
##  9 2000-11-01 00046855 a35   115     110117 47100…     1    77    89     4
## 10 2000-11-01 00046855 a35   115     110411 47100…     3    51    57     4

處理完後檢視看看資料

grocery %>% 
  select(customer, category, product, tid) %>% 
  sapply(n_distinct)

## customer category  product      tid 
##    32256     2007    23789   119422

summary(grocery)

##       date              customer             age           
##  Min.   :2000-11-01   Length:817181      Length:817181     
##  1st Qu.:2000-11-28   Class :character   Class :character  
##  Median :2001-01-01   Mode  :character   Mode  :character  
##  Mean   :2000-12-30                                        
##  3rd Qu.:2001-01-30                                        
##  Max.   :2001-02-28                                        
##      area              category        product               qty        
##  Length:817181      Min.   :100101   Length:817181      Min.   : 1.000  
##  Class :character   1st Qu.:110106   Class :character   1st Qu.: 1.000  
##  Mode  :character   Median :130106   Mode  :character   Median : 1.000  
##                     Mean   :284784                      Mean   : 1.358  
##                     3rd Qu.:520311                      3rd Qu.: 1.000  
##                     Max.   :780510                      Max.   :24.000  
##       cost            price             tid        
##  Min.   :   0.0   Min.   :   1.0   Min.   :     1  
##  1st Qu.:  35.0   1st Qu.:  42.0   1st Qu.: 28783  
##  Median :  62.0   Median :  76.0   Median : 59391  
##  Mean   : 106.2   Mean   : 125.5   Mean   : 58845  
##  3rd Qu.: 112.0   3rd Qu.: 132.0   3rd Qu.: 87391  
##  Max.   :3798.0   Max.   :3999.0   Max.   :119422

經營現況

每個年齡層分布數量

grocery %>%
  group_by(age) %>%
  summarize(count = n()) %>% 
  ggplot(aes(x = age, y = count, fill = age)) + 
  geom_bar(stat = "identity")

產品資料分析

前五名的商品

top6product <- grocery %>% 
  count(product) %>% 
  arrange(desc(n)) %>% 
  top_n(6)

## Selecting by n

grocery <- grocery %>% 
  mutate(unitPrice = round(price / qty))

前6名產品價格波動

grocery %>% 
  semi_join(top6product, by = "product") %>% 
  group_by(date, product) %>% 
  summarize(unitPrice = mean(unitPrice), 
            sum_qty = sum(qty)) %>% 
  arrange(date, product) %>% 
  ggplot(aes(date, unitPrice, color = factor(product))) +
    geom_line() +
    geom_line(aes(x = date, y = sum_qty), linetype = "dotdash") +
    facet_wrap(~ product, scales = "free_y") +
    theme(legend.position = "none")

交易資料整理

transaction <- grocery %>% 
  group_by(tid) %>% 
  summarise(
    date = date[1],              # 交易日期  
    customer = customer[1],      # 顧客 ID
    age = age[1],                # 顧客 年齡級別
    area = area[1],              # 顧客 居住區別
    tcount = n(),                # 交易項目(總)數
    pieces = sum(qty),           # 產品(總)件數
    items = n_distinct(category),# 產品種類數
    total = sum(price),          # 交易(總)金額
    gross = sum(price - cost)    # 毛利
  )

建立完交易資料後，一樣處理outlier。

sapply(transaction[, 6:9], quantile, prob = c(.999, .9995, .9999))

##        tcount   pieces   items     total
## 99.9%      54  81.0000 42.0000  9009.579
## 99.95%     62  94.2895 47.0000 10611.579
## 99.99%     82 133.0000 61.1158 16044.401

transaction <- subset(transaction, tcount <= 62 & pieces < 95 & total < 16000) # 119328

par(cex = 0.8)
hist(transaction$date, "weeks", freq = T, las = 2, col = "lightblue", main = "No. Transaction per Week")

依週、顧客年齡畫消費頻率熱圖

library(d3heatmap)
table(transaction$age, format(transaction$date, "%u")) %>% 
  {./rowSums(.)} %>% 
  as.data.frame.matrix() %>% 
  d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))

可以看出消費頻率比較高的集中在週日與30~40歲壯年人口，整體來說假日的消費頻率比較高。

依週、地區畫消費頻率熱圖

table(transaction$area, format(transaction$date, "%u")) %>% 
  {./rowSums(.)} %>% 
  as.data.frame.matrix() %>% 
  d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))

地區、年齡消費頻率熱圖

table(transaction$age, transaction$area) %>% 
  {./rowSums(.)} %>% 
  as.data.frame.matrix() %>% 
  d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))

各個區域的交易發生時間

transaction %>%
  group_by(area, date) %>%
  summarize(num_tran = n()) %>% 
  ggplot(aes(x = date, y = num_tran)) + 
  geom_bar(aes(x = date, y = num_tran, color = area), stat = "identity", alpha = 0.8) +
  facet_wrap(~area) + 
  labs(y = "# Transactions", x = "Date")

顧客資料整理

我們假設今天為拿到資料的隔天2001/3/1要來分析與做行銷策略。

today <- max(grocery$date) + 1
customer <- transaction %>% 
  mutate(days = as.integer(today - date)) %>% 
  group_by(customer) %>% 
  summarise(
    recency = min(days),                        # 最近一次購買距今天數
    senior = max(days),                         # 第一次購買距今天數
    coming = as.integer(!recency == senior),    # 第一次來買有沒有再來買一次
    freq = n(),                                 # 消費次數
    money = round(mean(total), 2),              # 平均購買金額
    revenue = sum(total),                       # 公司在他身上拿了多少
    net = sum(gross),                           # 公司淨賺他多少
    items = items[1],                           # 不同產品種類數
    age = age[1],
    area = area[1],
    since = min(date)                           # 第一次購買日期
  ) %>% 
  arrange(customer)

視覺化RFM，看分佈

par(mfrow = c(2, 2))
hist(customer$recency, main = "recency")
hist(log10(customer$freq), main = "log_frequency")
hist(log10(customer$money), main = "log_money")
hist(customer$senior, main = "senior")

檢查每個dataframe有沒有NA

map(list("grocery" = grocery, "transaction" = transaction, "customer" = customer), function(x) colSums(is.na(x)))

## $grocery
##      date  customer       age      area  category   product       qty 
##         0         0         0         0         0         0         0 
##      cost     price       tid unitPrice 
##         0         0         0         0 
## 
## $transaction
##      tid     date customer      age     area   tcount   pieces    items 
##        0        0        0        0        0        0        0        0 
##    total    gross 
##        0        0 
## 
## $customer
## customer  recency   senior   coming     freq    money  revenue      net 
##        0        0        0        0        0        0        0        0 
##    items      age     area    since 
##        0        0        0        0

集群分析

先用階層式分群來分析顧客資料

hierarchical clustering

linkage algo 用 complete方式

#rfm_dist <- dist(scale(customer[, c("recency", "coming", "freq", "money", "items")]))
#rfm_hc <- hclust(rfm_dist, method = "complete");rm(rfm_dist)
load("rfm_hc.RData")
plot(rfm_hc)

rfm_cluster <- cutree(rfm_hc, h = 11)
table(rfm_cluster)

## rfm_cluster
##     1     2     3     4 
## 29466  2612   103    60

customer <- mutate(customer, cluster = rfm_cluster)

先分4群看看，看起來會是比較好的結果，接下來畫RFM以及分群結果

顧客分隔

group_by(customer, cluster) %>% 
  summarise(
    recency = mean(recency), 
    freq = mean(freq), 
    money = mean(money), 
    size = n()
  ) %>% 
  mutate(revenue = size * money / 1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x = freq, y = money)) +
  geom_point(aes(size = revenue, col = recency), alpha=0.5) +
  scale_size(range = c(4, 40)) +
  scale_color_gradient(low = "green", high = "red") +
  scale_x_log10() + scale_y_log10() + 
  geom_text(aes(label = size )) +
  theme_bw() + guides(size = F) +
  labs(title = "Customer Segements",
       subtitle = "(bubble_size:revenue_contribution; text:group_size)",
       color = "Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

接下來我們好奇這4群顧客變數分別平均會是多少，看他們的特性。

customer %>% 
  select(-c(customer, age, area, since)) %>% 
  group_by(cluster) %>% 
  summarise_all(funs(mean(.)))

## # A tibble: 4 x 9
##   cluster recency senior coming  freq money revenue   net items
##     <int>   <dbl>  <dbl>  <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1       1   36.7    81.4  0.650  3.68  792.   2777.  416.  5.77
## 2       2   46.7    72.9  0.419  2.18 3121.   6174. 1074. 18.6 
## 3       3    2.28  118.   1     49.6   593.  30062. 4274.  5.25
## 4       4   54.0    62.0  0.15   1.27 7734.   9435. 1773. 29.1

群組特性圖

par(cex = 0.7)
customer %>% 
  select(-c(1, 4, 10, 11, 12, 13)) %>% 
  mutate_all(scale) %>% 
  split(customer$cluster) %>% 
  sapply(colMeans) %>% 
  barplot(beside = T, col = rainbow(7))
legend('topleft',legend=colnames(customer[, -c(1, 4, 10, 11, 12, 13)]), fill = rainbow(7))

kmeans

tot_withinss <- map_dbl(1:10,  function(k){
  model <- kmeans(x = customer %>% select(-c("customer", "age", "area", "since", "cluster")), centers = k)
  model$tot.withinss
})

elbow_df <- data.frame(
  k = 1:10 ,
  tot_withinss = tot_withinss
)

# Plot the elbow plot
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() +
  scale_x_continuous(breaks = 1:10)

由上圖可知，k=3或4模型的複雜度增加沒有顯著增長，最好的分群方式會是分3或4群為主，將資料用kmeans model分群。

k=3

rfm_km.3 <- customer %>% 
  select(-c("customer", "age", "area", "since", "cluster")) %>% 
  kmeans(centers = 3)
table(rfm_km.3$cluster)

## 
##     1     2     3 
##   717 25238  6286

customer %>% 
  mutate(cluster = rfm_km.3$cluster) %>% 
  group_by(cluster) %>% 
  summarise(
    recency = mean(recency), 
    freq = mean(freq), 
    money = mean(money), 
    size = n()
  ) %>% 
  mutate(revenue = size * money / 1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x = freq, y = money)) +
  geom_point(aes(size = revenue, col = recency), alpha=0.5) +
  scale_size(range = c(4,40)) +
  scale_color_gradient(low="green", high="red") +
  scale_x_log10() + scale_y_log10() + 
  geom_text(aes(label = size )) +
  theme_bw() + guides(size = F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

k=4

rfm_km.4 <- customer %>% 
  select(-c("customer", "age", "area", "since", "cluster")) %>% 
  kmeans(centers = 4)
table(rfm_km.4$cluster)

## 
##     1     2     3     4 
##  1775   175 22659  7632

customer %>% 
  mutate(cluster = rfm_km.4$cluster) %>% 
  group_by(cluster) %>% 
  summarise(
    recency = mean(recency), 
    freq = mean(freq), 
    money = mean(money), 
    size = n()
  ) %>% 
  mutate(revenue = size * money / 1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x = freq, y = money)) +
  geom_point(aes(size = revenue, col = recency), alpha=0.5) +
  scale_size(range = c(4,40)) +
  scale_color_gradient(low="green", high="red") +
  scale_x_log10() + scale_y_log10() + 
  geom_text(aes(label = size )) +
  theme_bw() + guides(size = F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

k=7

library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

set.seed(2020)
rfm_km.7 <- customer %>% 
  select(-c("customer", "age", "area", "since", "cluster")) %>% 
  kmeans(centers = 7, nstart = 20)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1612050)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1612050)

table(rfm_km.7$cluster)

## 
##     1     2     3     4     5     6     7 
##   875    30  8761  4684 15407   246  2238

customer %>% 
  mutate(cluster = rfm_km.7$cluster) %>% 
  group_by(cluster) %>% 
  summarise(
    recency = mean(recency), 
    freq = mean(freq), 
    money = mean(money), 
    size = n()
  ) %>% 
  mutate(revenue = size * money / 1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x = freq, y = money)) +
  geom_point(aes(size = revenue, col = recency), alpha=0.5) +
  scale_size(range = c(4,40)) +
  scale_color_gradient(low="green", high="red") +
  scale_x_log10() + scale_y_log10() + 
  geom_text(aes(label = size )) +
  theme_bw() + guides(size = F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

分群資料探索

將顧客分結果join回去交易資料

customer_group <- customer %>% select(customer, cluster)
transaction_cluster <- transaction %>% left_join(customer_group, by = "customer")
grocery_cluster <- grocery %>% left_join(customer_group, by = "customer")

分群顧客利潤最高產品

觀看分群顧客最喜歡哪些產品，能帶來較大獲利

grocery_cluster %>% 
  filter(!is.na(cluster)) %>% 
  mutate(gross = price - cost) %>% 
  group_by(cluster, product) %>% 
  summarize(
    tcount = n(),
    sum_total = sum(price),
    sum_gross = sum(gross)
  ) %>% 
  arrange(desc(sum_gross)) %>% 
  top_n(5) %>% 
  ungroup() %>% 
  ggplot(aes(x = product, y = sum_gross)) +
    geom_col(aes(fill = product), alpha = 0.8) +
    facet_wrap(~ cluster, scales = "free") +
    theme(axis.text.x = element_text(angle = 25))

## Selecting by sum_gross

table(customer$age, customer$cluster) %>% 
  {./rowSums(.)} %>% 
  as.data.frame.matrix() %>% 
  d3heatmap(F, F, col = colorRamp(c("seagreen", "lightyellow", "red")))

資料切割

取出2001-02-01之前的資料，然後做跟上面一樣的事情

grocery.x <- subset(grocery, date < ymd("2001-02-01"))
tail(grocery.x, 10)

## # A tibble: 10 x 11
##    date       customer age   area  category product   qty  cost price   tid
##    <date>     <chr>    <chr> <chr>    <int> <chr>   <int> <int> <int> <int>
##  1 2001-01-31 01564846 a35   221     130208 471313…     1    61    79 87915
##  2 2001-01-31 01813319 a30   Othe…   100514 471062…     1   138   147 88087
##  3 2001-01-31 01866926 a35   110     100414 471126…     1   105   139 88149
##  4 2001-01-31 01469271 a40   115     530301 471018…     1   171   204 87878
##  5 2001-01-31 02000558 a25   115     100507 471005…     2    32    42 88230
##  6 2001-01-31 02005409 a25   221     560402 871204…     3  1959  1980 88235
##  7 2001-01-31 01759372 a20   115     100324 471070…     1   160   189 88052
##  8 2001-01-31 02107356 a40   115     100309 471074…     1    79    95 88315
##  9 2001-01-31 01804744 a35   115     300711 471148…     1    32    48 88082
## 10 2001-01-31 01846683 Unkn… Unkn…   500501 205438…     1    78    72 88120
## # ... with 1 more variable: unitPrice <dbl>

11~1月的交易資料

transaction.x <- grocery.x %>% 
  group_by(tid) %>% 
  summarise(
    date = date[1],              # 交易日期  
    customer = customer[1],      # 顧客 ID
    age = age[1],                # 顧客 年齡級別
    area = area[1],              # 顧客 居住區別
    tcount = n(),                # 交易項目(總)數
    pieces = sum(qty),           # 產品(總)件數
    items = n_distinct(category),# 產品種類數
    total = sum(price),          # 交易(總)金額
    gross = sum(price - cost)    # 毛利
  )

sapply(transaction.x[, 6:10], quantile, prob = c(.999, .9995, .9999))

##         tcount   pieces items     total    gross
## 99.9%  56.0000  84.0000    43  9378.684 1883.228
## 99.95% 64.0000  98.0000    49 11261.751 2317.087
## 99.99% 85.6456 137.6456    66 17699.325 3389.646

transaction.x <- subset(transaction.x, tcount <= 64 & pieces <= 98 & total <= 11261)

11~1月的顧客資料

max_date <- max(transaction.x$date)
customer.x <- transaction.x %>% 
  mutate(days = as.integer(max_date - date)) %>% 
  group_by(customer) %>% 
  summarise(
    recency = min(days),                        # 最近一次購買距今天數
    senior = max(days),                         # 第一次購買距今天數
    coming = as.integer(!recency == senior),    # 第一次來買有沒有再來買一次
    freq = n(),                                 # 消費次數
    money = round(mean(total), 2),              # 平均購買金額
    revenue = sum(total),                       # 公司在他身上拿了多少
    net = sum(gross),                           # 公司淨賺他多少
    items = items[1],                           # 不同產品種類數
    age = age[1],
    area = area[1],
    since = min(date)                           # 第一次購買日期
  ) %>% 
  arrange(customer)

準備 predictor Y

Y: 2月是否有購買

先準備每個顧客2月的購買金額，若沒有則為NA。

feb_customer <- transaction %>% 
  filter(date >= ymd("2001-02-01")) %>% 
  group_by(customer) %>% 
  summarize(feb_amount = sum(total))

將2月顧客的消費金額left_join回去11~1月的顧客資料，若2月沒有購買feb_amount會顯示NA，新增一個欄位buy是2月有購買不是NA為TRUE。

customer.x <- left_join(customer.x, feb_customer, by = "customer")
customer.x$buy <- !is.na(customer.x$feb_amount)

依年齡分平均購買比例

tapply(customer.x$buy, customer.x$age, mean) %>% 
  barplot(xlab = "Age", ylab = "Mean of Buy")
abline(h = mean(customer.x$buy), col = 'red')

依地區分平均購買比例

tapply(customer.x$buy, customer.x$area, mean) %>% 
  barplot(xlab = "Area", ylab = "Mean of Buy")
abline(h = mean(customer.x$buy), col = 'red')

因為transaction.x處理過outlier，確保原grocery.x的資料沒有outlier。

grocery.x <- filter(grocery.x, customer %in% customer.x$customer)

建立模型

預測「是否購買」

準備訓練與測試資料

library(caTools)
set.seed(2019)
spl.x <-  sample.split(customer.x$buy, SplitRatio = 0.7)
c(nrow(customer.x), sum(spl.x), sum(!spl.x))

## [1] 28584 20008  8576

檢查training&testing分佈是否一致

cbind(customer.x, spl.x) %>% 
  ggplot(aes(x = log(feb_amount), fill = spl.x)) + 
  geom_density(alpha = 0.5)

## Warning: Removed 15342 rows containing non-finite values (stat_density).

buy_train <- customer.x[spl.x, ]
buy_test <- customer.x[!spl.x, ]

logistic model

buy.model <- glm(buy ~ money + recency * freq + items, buy_train, family = binomial)
summary(buy.model)

## 
## Call:
## glm(formula = buy ~ money + recency * freq + items, family = binomial, 
##     data = buy_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3823  -0.8991  -0.7509   1.0989   1.8540  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.057e+00  5.063e-02 -20.872  < 2e-16 ***
## money        -9.185e-05  2.141e-05  -4.289 1.79e-05 ***
## recency      -4.046e-03  1.111e-03  -3.641 0.000272 ***
## freq          4.492e-01  1.434e-02  31.334  < 2e-16 ***
## items         7.315e-03  3.409e-03   2.146 0.031881 *  
## recency:freq -1.739e-03  5.407e-04  -3.216 0.001298 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23360  on 20002  degrees of freedom
## AIC: 23372
## 
## Number of Fisher Scoring iterations: 6

套用logistic function發現只有revenue不顯著。

混淆矩陣

buy.pred <- predict(buy.model, buy_test, type = "response")
cm <- table(actual = buy_test$buy, predict = buy.pred > 0.5); cm

##        predict
## actual  FALSE TRUE
##   FALSE  3830  773
##   TRUE   1839 2134

cm %>% {sum(diag(.))/sum(.)}

## [1] 0.6954291

ROC AUC=0.7334

ROC <- roc(buy_test$buy, buy.pred)
plot(ROC, col = "red")

auc(ROC)

## Area under the curve: 0.7443

decision tree

buy_train$buy <- buy_train$buy %>% as.integer() %>% as.factor()
buy_test$buy <- buy_test$buy %>% as.integer() %>% as.factor()

library(rpart)
buy_tree <- rpart(buy ~ ., data = buy_train[, -c(1, 10, 11, 12, 13)], method = "class", control = rpart.control(cp = 0, maxdepth = 10))

library(rpart.plot)
# rpart.plot(buy_tree_pruned)
rpart.plot(buy_tree, type = 3, box.palette = c("red", "green"), fallen.leaves = TRUE)

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

buy.tree.pred <- predict(buy_tree, buy_test, type = "class")
table(buy.tree.pred, buy_test$buy)

##              
## buy.tree.pred    0    1
##             0 3588 1684
##             1 1015 2289

mean(buy.tree.pred == buy_test$buy)

## [1] 0.6852845

修剪決策樹，發現準確度有微微上升至70%

buy_tree_pruned <- prune(buy_tree, cp = 0.0015)
buy_test$pred <- predict(buy_tree_pruned, buy_test, type = "class")
mean(buy_test$pred == buy_test$buy)

## [1] 0.7017257

Random Forest

試試看隨機森林的準確度

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

buy_forest <- randomForest(buy ~ ., data = buy_train[, -c(1, 10, 11, 12, 13)])
buy_test$pred <- predict(buy_forest, buy_test, type = "class")
mean(buy_test$pred == buy_test$buy)

## [1] 0.6983442

預測「購買金額」

只篩選有回購的顧客，並對金額取log10

customer.y <- subset(customer.x, buy) %>% 
  mutate(money = money + 1, revenue = revenue + 1, feb_amount = feb_amount + 1) %>% 
  mutate_at(c("money", "revenue", "feb_amount"), log10)
spl.y <- sample(nrow(customer.y), nrow(customer.y) * 0.7)
amount_train <- customer.y[spl.y, ]
amount_test <- customer.y[-spl.y, ]

lm.model <- lm(feb_amount ~ money + recency * freq + revenue, amount_train)
summary(lm.model)

## 
## Call:
## lm(formula = feb_amount ~ money + recency * freq + revenue, data = amount_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94471 -0.23435  0.04971  0.28524  1.60731 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1825476  0.0350875  33.703  < 2e-16 ***
## money         0.3917323  0.0322272  12.155  < 2e-16 ***
## recency       0.0020201  0.0003540   5.707 1.18e-08 ***
## freq          0.0222892  0.0016726  13.326  < 2e-16 ***
## revenue       0.1674661  0.0305376   5.484 4.27e-08 ***
## recency:freq -0.0006890  0.0001207  -5.709 1.17e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4274 on 9263 degrees of freedom
## Multiple R-squared:  0.271,  Adjusted R-squared:  0.2706 
## F-statistic: 688.8 on 5 and 9263 DF,  p-value: < 2.2e-16

plot(amount_train$feb_amount, predict(lm.model), col = 'pink', cex = 0.65)
abline(0, 1,col='red')

行銷規劃

篩選出2000-12-01~20001-02-28

B <- transaction %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(today, date, units = "days"))) %>% 
  group_by(customer) %>% 
  summarise(
    recency = min(days),                        # 最近一次購買距今天數
    senior = max(days),                         # 第一次購買距今天數
    coming = as.integer(!recency == senior),    # 第一次來買有沒有再來買一次
    freq = n(),                                 # 消費次數
    money = round(mean(total), 2),              # 平均購買金額
    revenue = sum(total),                       # 公司在他身上拿了多少
    net = sum(gross),                           # 公司淨賺他多少
    items = items[1],                           # 不同產品種類數
    age = age[1],
    area = area[1],
    since = min(date)                           # 第一次購買日期
  )
nrow(B) #28531

## [1] 28531

預測3月購買機率與金額

B$buy <- predict(buy.model, B, type="response")
B2 <- B %>% mutate_at(c("money","revenue"), log10)
B$rev <- 10 ^ predict(lm.model, B2); rm(B2)

購買機率、購買金額的機率分佈

par(mfrow = c(1,2), cex = 0.8)
hist(B$buy)
hist(log(B$rev,10))

effect = .5 # 回購機率
cost = 200   # 成本假設

Target <- B %>% inner_join(customer_group, by = "customer")
Target$ExpReturn <- (effect - Target$buy) * Target$rev - cost
Target %>% 
  filter(Target$ExpReturn > 0) %>%
  group_by(cluster) %>% 
  summarise(
    No.Target = n(),
    AvgROI = mean(ExpReturn),
    TotalROI = sum(ExpReturn)
  ) %>% 
  filter(cluster == 4)

## # A tibble: 1 x 4
##   cluster No.Target AvgROI TotalROI
##     <int>     <int>  <dbl>    <dbl>
## 1       4        46   607.   27919.

target4 <- filter(Target, cluster == 4)
random <- sample(1:nrow(target4), nrow(target4))
c("no.target" = round(length(random)), 
  "AvgROI" = mean(target4[random, ]$rev - 100),
  "TotalROI" = sum(target4[random, ]$rev - 100))

##  no.target     AvgROI   TotalROI 
##     48.000   2679.282 128605.553

第三群增加金額

group3 <- Target %>% 
  filter(cluster == 3) %>% 
  mutate(ExpReturn = rev * 1.2 * 0.5 - 150) %>% 
  group_by(cluster) %>% 
  summarize(
    No.Target = round(n() * 0.5),
    AvgROI = mean(ExpReturn),
    TotalROI = sum(ExpReturn)
  )