Group4:
B034020012 謝雨靜
B034020027 陳韻卉
B044012015 王譯苓
M064111025 黃威豪
M064111039 王 欣
行傳所 孟祥瑄


packages = c(
  "dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
  "arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)


A. RFM分群

load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
k = 8; set.seed(777)
A$group = kmeans(scale(A[,2:6]), k)$cluster; table(A$group)
df = group_by(A, group) %>% summarise(
  avg_frequency = mean(f),
  avg_monetary = mean(m),
  total_revenue = sum(rev),
  group_size = n(),
  avg_recency = mean(r),
  profit = sum(raw)
  ) %>% ungroup %>% 
  mutate(
    pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
    pc_profit = round(100*profit/sum(profit),1),                # % profit contrib. 
    dummy = 2001  # to fit googleViz's data format
  ) %>% data.frame
head(df)
plot( gvisMotionChart(
  df, "group", "dummy",
  options=list(width=800, height=600) 
  ))


【QUIZ】 如果我們把X,Y軸分別改成log(avg_monetary)和log(avg_recency),根據圖上的顯示 …



B. 顧客產品矩陣

n_distinct(Z$cust)  # 32256
n_distinct(Z$prod)  # 23789

操作矩陣運算之前,通常我會載入這兩個套件

library(Matrix)
library(slam)

製作顧客產品矩陣其實很快、也很容易

mx = xtabs(~ cust + prod, Z, sparse=T)

顧客產品矩陣通常是一個很稀疏的矩陣

dim(mx)
mean(mx > 0)

有一些產品沒什麼人買

table(colSums(mx) < 10)       # 12588 

刪去購買次數小於10的產品,然後刪去沒有購買產品的顧客

mx = mx[, colSums(mx) > 10]   # 
mx = mx[rowSums(mx) > 0, ]    #
dim(mx)                       # 32256>32066 23789>10675

檢查一下矩陣裡面的值分布

max(mx)                       # 49
table(mx@x) %>% prop.table %>% round(4) %>% head(10)


【QUIZ】 如果mx[i, j] = 3,這表示 ….



C. 依購買頻率分群

C1. 改變(稀疏)矩陣格式

稀疏矩陣有很多種格式,不同的工具會使用不同的格式

library(slam)
tmx = as(mx,"dgTMatrix")
tmx = simple_triplet_matrix(
  1+tmx@i, 1+tmx@j, tmx@x, dimnames=mx@Dimnames)
dim(tmx)
C2. 估計各產品的平均重要性 (Average TF-IDF Scores)

我們借用文字分析裡面估計單字在文章之中的重要性的方法(TF-IDF),計算各產品在所有顧客之間的平均重要性

tfidf = tapply(tmx$v/row_sums(tmx)[tmx$i], tmx$j, mean) *
  log2(nrow(tmx)/col_sums(tmx > 0))
summary(tfidf)
hist(tfidf)
C3. 篩選產品、重新排列

篩去平均重要性比較低的產品、然後將產品依被購買次數降冪排列

tmx = tmx[, tfidf > mean(tfidf)]    # remove products with low tfidf score
tmx = tmx[, order(-col_sums(tmx))]  # order product by frequency
dim(tmx)
C4. 依(購買頻率最高的)產品對顧客分群
nop= 400  # no. product
k = 200   # no. cluster
set.seed(111); kg = kmeans(tmx[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
C5. 各群組平均屬性

將分群結果併入顧客資料框(A)

df = inner_join(A, data.frame(
  cust = as.integer(tmx$dimnames$cust), kg))

計算各群組的平均屬性

df = data.frame(
  aggregate(. ~ kg, df[,c(2:7,11)], mean),
  size = as.vector(table(kg))
  )

head(df) 
C6. 互動式泡泡圖
df$dummy = 2001                        # dummy column for googleViz
plot( gvisMotionChart(
  subset(df[,c(1,4,5,6,8,2,3,7,9)], 
         size >= 20 & size <= 5000     # range of group size
         ), 
  "kg", "dummy", options=list(width=800, height=600) ) )
C7. 各群組的代表性產品 (Signature Product)
Sig = function(gx, P=1000, H=10) {
  print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
  bx = tmx[,1:P]
  data.frame(n = col_sums(bx[kg==gx,])) %>%      # frequency
    mutate(
      share = round(100*n/col_sums(bx),2),       # %prod sold to this cluster
      conf = round(100*n/sum(kg==gx),2),         # %buy this product, given cluster
      base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust 
      lift = round(conf/base,1),                 # conf/base  
      name = colnames(bx)                        # name of prod
    ) %>% arrange(desc(lift)) %>% head(H)
  }
Sig(1)
Sig(19)[,1]%>% sort(T)
Sig(19)[Sig(19)$n==57,]

【QUIZ】 從互動式泡泡圖看來 …

  • 平均客單價、平均購買次數、平均營收貢獻、平均獲利貢獻最大的分別是哪一個族群?
    • 平均客單價:G190
    • 平均購買次數:G186
    • 平均營收貢獻:G19
    • 平均獲利貢獻:G19
  • 它們的特徵產品分別是什麼?
    • 8712045008539
    • 4710063031106
    • 4710628079017
    • 4710628079017

【QUIZ】 在以上這個段落裡面 …

  • 我們分群的區隔變數是什麼?
    • 所有的產品為區隔變數
  • 我們在泡泡圖裡面觀察的變數是哪一些?
    • r(最後一次購買距今天數)
    • f(購買頻率)
    • m(客單價)
    • raw(獲利貢獻)
    • rev(營收貢獻)
    • group_size(族群大小)
    • s(第一次購買距今天數)
  • 它們是相同的嗎?
    • 不同。
  • 這個分析的程序,跟Airlines CRM那一個例子有甚麼不同?
    資料型態 常態化 篩選 區隔變數相同?
    Airlines Yes,讓變數之間的權重一樣 No,保留所有的顧客做分群,由分群結果看特徵 Yes,利用現有變數分群,再用「相同」變數解讀不同分群特徵
    顧客產品矩陣 屬於稀鬆矩陣 No,矩陣本身為數量的計數所以不須常態化

    Yes,有以下2種情形:

    • 顧客購買太少產品
    • 產品被購買的次數過少,所提供資訊太少,故刪除
    NO,用產品購買頻率分群,用分群結果與顧客交易資料合併,看群的特徵
  • 我們從這個例子學到什麼?
    • 利用xtabs可以容易地將顧客產品矩陣製作出來。
    • 用購買產品頻率與用顧客購買習慣所分群出來的結果會不一樣。
    • 可以透過此方法從產品的角度出發去做行銷決策。
    • 再由各群組的代表性產品、互動泡泡圖來獲得更多資訊。


D. 使用尺度縮減方法抽取顧客(產品)的特徵向量

D1. 巨大尺度縮減 (SVD, Sigular Value Decomposition)
library(irlba)
if(LOAD) {
  load("data/svd.rdata")
} else {
  smx = mx
  smx@x = pmin(smx@x, 2)            # cap at 2, similar to normalization  
  t0 = Sys.time()
  svd = irlba(smx, 
              nv=400,               # length of feature vector
              maxit=800, work=800)    
  print(Sys.time() - t0)            # 1.8795 mins
  save(svd, file = "data/svd.rdata")
}
D2. 依顧客向量對顧客分群
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
D3. 互動式泡泡圖 (Google Motion Chart)
# clustster summary
df = left_join(A, data.frame(         
  cust = as.integer(smx@Dimnames$cust), kg)) %>% 
  group_by(kg) %>% summarise(
    avg_frequency = mean(f),
    avg_monetary = mean(m),
    avg_revenue_contr = mean(rev),
    group_size = n(),
    avg_recency = mean(r),
    avg_gross_profit = mean(raw)) %>% 
  ungroup %>% 
  mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>% 
  data.frame

# Google Motion Chart
plot( gvisMotionChart(
  subset(df, group_size >= 20 & group_size <= 5000),     
  "kg", "dummy", options=list(width=800, height=600) ) )
D4. 互動式泡泡圖 (ggplot + plotly)
filter(df, group_size >= 20 & group_size <= 5000)$group_size %>% 
  sqrt %>% range    # for bubble size adjustment
library(ggplot2)
library(plotly)

p = df %>% filter(group_size >= 20 & group_size <= 5000) %>% 
  ggplot(aes(x=avg_frequency, y=avg_monetary)) +
  geom_point(aes(size=group_size, col=avg_revenue_contr),alpha=0.7) +
  geom_text(aes(label=kg), alpha=0) +
  scale_size(range=c(1.5,12)) +
  #scale_color_gradient(low="green",high="magenta") +
  scale_colour_gradientn(
    colours = rev(c("red","yellow","green","lightblue","darkblue"))) +
  theme_bw() + guides(size=F) + labs(
    title="顧客集群(依購買產品)",
    color="平均營收貢獻", size="集群人數") +
  xlab("平均購買次數") + 
  ylab("平均購買金額")
plotly_build(p)
D5. 群組的代表性產品 (Signature Product)
Sig(138)


E. 購物籃分析 Baskets Analysis

dim(mx)   # 32066 cust * 10675 prod
E1. 準備資料 (for Association Rule Analysis)
library(arules)
library(arulesViz)
bx = subset(Z, prod %in% as.numeric(colnames(mx)), 
            select=c("cust","prod"))  # select product items
bx = split(bx$prod, bx$cust)          # split by customer
bx = as(bx, "transactions")           # data for arules package
E2. Top20 熱賣產品
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)
E3. 關聯規則和Apriori演算法

關聯規則(A => B)

rules = apriori(bx, parameter=list(supp=0.005, conf=0.6))
summary(rules)
E4. 檢視關聯規則

關聯規則 (A => B):

options(digits=4)
inspect(rules)
# install.packages(
#   "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
#   repos=NULL)

# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
#               marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
#               shading="lift",
#               colors=c("red", "green"))
# 
E5. 互動圖表顯示
plot(rules,colors=c("red","green"),engine="htmlwidget",
     marker=list(opacity=.6,size=8))
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
     colors=c("red", "green"))
E6. 篩選產品、互動式關聯圖
r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") 
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 


F. 產品推薦 Product Recommendation

F1. 篩選顧客、產品

太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選

library(recommenderlab)
rx = mx[, colSums(mx > 0) >= 50]
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
dim(rx)
F2. 選擇產品評分方式

可以選擇要用

做模型。

rx = as(rx, "realRatingMatrix")  # realRatingMatrix
bx = binarize(rx, minRating=1)   # binaryRatingMatrix
dim(bx)
F3. 設定模型(準確性)驗證方式
set.seed(4321)
scheme = evaluationScheme(     
  bx, method="split", train = .75,  given=5)
F4. 設定推薦方法(參數)
algorithms = list(            
  AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
  AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
  RANDOM = list(name="RANDOM", param=NULL),
  POPULAR = list(name="POPULAR", param=NULL),
  UBCF = list(name="UBCF", param=NULL),
  IBCF = list(name="IBCF", param=NULL) )
F5. 建模、預測、驗證(準確性)
if(LOAD) {
  load("results.rdata")
} else {
  t0 = Sys.time()
  results = evaluate(            
    scheme, algorithms, type="topNList",
    n=c(5, 10, 15, 20))
  print(Sys.time() - t0)
  save(results, file="results.rdata")
}
F6. 模型準確性比較
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)
F7. 儲存產品推薦模型
save(results, file="data/results.rdata")






---
title: "Product：產品銷售資訊"
author: "Group4, 2018/08/01"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

<ul class="nav">
  <li><a href="#A">RFM分群</a></li>
  <li><a href="#B">顧客產品矩陣</a></li>
  <li><a href="#C">依購買頻率分群</a></li>
  <li><a href="#D">尺度縮減</a></li>
  <li><a href="#E">購物籃分析</a></li>
  <li><a href="#F">產品推薦</a></li>
</ul>

<div class="comment">
  Group4:<br>
  B034020012 謝雨靜<br>
  B034020027 陳韻卉<br>
  B044012015 王譯苓<br>
  M064111025 黃威豪<br>
  M064111039 王  欣<br>
  行傳所     孟祥瑄<br>
</div>


<br>

```{r}
packages = c(
  "dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
  "arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```

```{r warning=F, message=F, cache=F, error=F}
rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)
```
<br><hr>

<div id="A">
### A. RFM分群
</div>

```{r}
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
```

```{r}
k = 8; set.seed(777)
A$group = kmeans(scale(A[,2:6]), k)$cluster; table(A$group)
```

```{r}
df = group_by(A, group) %>% summarise(
  avg_frequency = mean(f),
  avg_monetary = mean(m),
  total_revenue = sum(rev),
  group_size = n(),
  avg_recency = mean(r),
  profit = sum(raw)
  ) %>% ungroup %>% 
  mutate(
    pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
    pc_profit = round(100*profit/sum(profit),1),                # % profit contrib. 
    dummy = 2001  # to fit googleViz's data format
  ) %>% data.frame
```

```{r}
head(df)
```

```{r}
plot( gvisMotionChart(
  df, "group", "dummy",
  options=list(width=800, height=600) 
  ))
```
<br>

<div class="comment">
**【QUIZ】** 如果我們把X，Y軸分別改成log(avg_monetary)和log(avg_recency)，根據圖上的顯示 ...

+ 你認為這家店目前最需要對那一個族群？
    + Group 5
+ 做哪一個動作？
    + 我們將Group 5定義為核心顧客，假設能取得這些顧客購買習慣、偏好、搜尋活動的相關資料，強化對這些顧客的了解，而為了增加其購買單價跟提高購買次數，採取以下兩種行銷方案。
    + 一、在店內進行「產品大容量，折數越划算」之方式，刺激顧客雖購買價格較高但性價比也較高的產品，以提高客單價。
    + 二、推出不同主題檔期：分析這些顧客產品購買紀錄找出較熱門的產品，並配合產品使用週期，來推出不同檔期的不同促銷季！同時也要大量放出優惠消息讓這些核心顧客接觸到，達成顧客來店頻率增加。

+ 為什麼？
    + 在泡泡圖上我們可以看出在avg_monetary和avg_recency 兩軸上，此族群皆較屬於中間的位置，雖然Group 2在相較於Group 5有絕對優勢(avg_monetary較高 & avg_recency較低)，可是Group 5族群大小為所有族群中最大的，故本組認為其開發潛力極高。
    
  </div>



<br><hr>

<div id="B">
### B. 顧客產品矩陣
</div>


```{r}
n_distinct(Z$cust)  # 32256
n_distinct(Z$prod)  # 23789
```

操作矩陣運算之前，通常我會載入這兩個套件
```{r}
library(Matrix)
library(slam)
```

製作顧客產品矩陣其實很快、也很容易
```{r}
mx = xtabs(~ cust + prod, Z, sparse=T)
```

顧客產品矩陣通常是一個很稀疏的矩陣
```{r}
dim(mx)
mean(mx > 0)
```

有一些產品沒什麼人買
```{r}
table(colSums(mx) < 10)       # 12588 
```

刪去購買次數小於10的產品，然後刪去沒有購買產品的顧客
```{r}
mx = mx[, colSums(mx) > 10]   # 
mx = mx[rowSums(mx) > 0, ]    #
dim(mx)                       # 32256>32066 23789>10675
```

檢查一下矩陣裡面的值分布
```{r}
max(mx)                       # 49
table(mx@x) %>% prop.table %>% round(4) %>% head(10)
```

<br>

<div class="comment">
**【QUIZ】** 如果`mx[i, j] = 3`，這表示 ....

+ $Customer_i$總共買$Prodict_j$買了三次
    + 顧客產品矩陣只能辨識有購買產品的行為，無法解讀數量，故矩陣內的值為購買次數。

</div>
<br><hr>
<div id="C">
### C. 依購買頻率分群 
</div>

##### C1. 改變(稀疏)矩陣格式
稀疏矩陣有很多種格式，不同的工具會使用不同的格式
```{r}
library(slam)
tmx = as(mx,"dgTMatrix")
tmx = simple_triplet_matrix(
  1+tmx@i, 1+tmx@j, tmx@x, dimnames=mx@Dimnames)
dim(tmx)
```

##### C2. 估計各產品的平均重要性 (Average TF-IDF Scores)
我們借用文字分析裡面估計單字在文章之中的重要性的方法(TF-IDF)，計算各產品在所有顧客之間的平均重要性
```{r}
tfidf = tapply(tmx$v/row_sums(tmx)[tmx$i], tmx$j, mean) *
  log2(nrow(tmx)/col_sums(tmx > 0))
summary(tfidf)
```

+ TF(term-frequency)：一個文字在一篇文章出現的次數。
+ IDF：在幾篇文章出現。
+ 去排除高頻率購買可是重要性不高的產品。
```{r fig.height=3, fig.width=7.2}
hist(tfidf)
```

##### C3. 篩選產品、重新排列
篩去平均重要性比較低的產品、然後將產品依被購買次數降冪排列
```{r}
tmx = tmx[, tfidf > mean(tfidf)]    # remove products with low tfidf score
tmx = tmx[, order(-col_sums(tmx))]  # order product by frequency
dim(tmx)
```

##### C4. 依(購買頻率最高的)產品對顧客分群
```{r}
nop= 400  # no. product
k = 200   # no. cluster
set.seed(111); kg = kmeans(tmx[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
```

##### C5. 各群組平均屬性
將分群結果併入顧客資料框(`A`)
```{r}
df = inner_join(A, data.frame(
  cust = as.integer(tmx$dimnames$cust), kg))
```

計算各群組的平均屬性
```{r}
df = data.frame(
  aggregate(. ~ kg, df[,c(2:7,11)], mean),
  size = as.vector(table(kg))
  )

head(df) 
```

##### C6. 互動式泡泡圖
```{r}
df$dummy = 2001                        # dummy column for googleViz
plot( gvisMotionChart(
  subset(df[,c(1,4,5,6,8,2,3,7,9)], 
         size >= 20 & size <= 5000     # range of group size
         ), 
  "kg", "dummy", options=list(width=800, height=600) ) )
```

##### C7. 各群組的代表性產品 (Signature Product)
```{r}
Sig = function(gx, P=1000, H=10) {
  print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
  bx = tmx[,1:P]
  data.frame(n = col_sums(bx[kg==gx,])) %>%      # frequency
    mutate(
      share = round(100*n/col_sums(bx),2),       # %prod sold to this cluster
      conf = round(100*n/sum(kg==gx),2),         # %buy this product, given cluster
      base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust 
      lift = round(conf/base,1),                 # conf/base  
      name = colnames(bx)                        # name of prod
    ) %>% arrange(desc(lift)) %>% head(H)
  }
```

```{r}
Sig(1)
Sig(19)[,1]%>% sort(T)
Sig(19)[Sig(19)$n==57,]
```

+ G族群一共買了`n`個P產品
+ P產品有`share%`是賣給G族群
+ G族群每人平均購買`conf/100`個P產品
+ 所有顧客之中，每人平均購買`base/100`個P產品
+ 跟所有顧客相比，G族群購買P產品的機率上升了`lift-1`倍 

<div class="comment">

**【QUIZ】** 從互動式泡泡圖看來 ...

+ 平均客單價、平均購買次數、平均營收貢獻、平均獲利貢獻最大的分別是哪一個族群？
    + 平均客單價：G190
    + 平均購買次數：G186
    + 平均營收貢獻：G19
    + 平均獲利貢獻：G19
+ 它們的特徵產品分別是什麼？
    + 8712045008539
    + 4710063031106
    + 4710628079017
    + 4710628079017

**【QUIZ】** 在以上這個段落裡面 ...

+ 我們分群的區隔變數是什麼？
    + 所有的產品為區隔變數
+ 我們在泡泡圖裡面觀察的變數是哪一些？
    + r(最後一次購買距今天數)
    + f(購買頻率)
    + m(客單價)
    + raw(獲利貢獻)
    + rev(營收貢獻)
    + group_size(族群大小)
    + s(第一次購買距今天數)
+ 它們是相同的嗎？
    + 不同。
+ 這個分析的程序，跟Airlines CRM那一個例子有甚麼不同？
<table class="tg">
  <tr>
    <th class="tg-b44r"></th>
    <th class="tg-b44r">資料型態</th>
    <th class="tg-b44r">常態化</th>
    <th class="tg-b44r">篩選</th>
    <th class="tg-b44r">區隔變數相同?</th>
  </tr>
  <tr>
    <td class="tg-b44r">Airlines</td>
    <td class="tg-yw4l"></td>
    <td class="tg-yw4l">Yes,讓變數之間的權重一樣</td>
    <td class="tg-yw4l">No,保留所有的顧客做分群，由分群結果看特徵</td>
    <td class="tg-yw4l">Yes,利用現有變數分群，再用「相同」變數解讀不同分群特徵
    </td>
  </tr>
  <tr>
    <td class="tg-b44r">顧客產品矩陣</td>
    <td class="tg-yw4l">屬於稀鬆矩陣</td>
    <td class="tg-yw4l">No,矩陣本身為數量的計數所以不須常態化</td>
    <td class="tg-yw4l">
    Yes,有以下2種情形：
    
    + 顧客購買太少產品
    + 產品被購買的次數過少，所提供資訊太少，故刪除
    </td>
    <td class="tg-yw4l">NO,用產品購買頻率分群，用分群結果與顧客交易資料合併，看群的特徵</td>
  </tr>
</table>    
+ 我們從這個例子學到什麼？
    + 利用xtabs可以容易地將顧客產品矩陣製作出來。
    + 用購買產品頻率與用顧客購買習慣所分群出來的結果會不一樣。
    + 可以透過此方法從產品的角度出發去做行銷決策。
    + 再由各群組的代表性產品、互動泡泡圖來獲得更多資訊。
    
</div>

<br><hr>



<div id="D">
### D. 使用尺度縮減方法抽取顧客(產品)的特徵向量
</div>
 

##### D1. 巨大尺度縮減 (SVD, Sigular Value Decomposition)
```{r}
library(irlba)
if(LOAD) {
  load("data/svd.rdata")
} else {
  smx = mx
  smx@x = pmin(smx@x, 2)            # cap at 2, similar to normalization  
  t0 = Sys.time()
  svd = irlba(smx, 
              nv=400,               # length of feature vector
              maxit=800, work=800)    
  print(Sys.time() - t0)            # 1.8795 mins
  save(svd, file = "data/svd.rdata")
}
```

##### D2. 依顧客向量對顧客分群
```{r}
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
```

##### D3. 互動式泡泡圖 (Google Motion Chart)
```{R}
# clustster summary
df = left_join(A, data.frame(         
  cust = as.integer(smx@Dimnames$cust), kg)) %>% 
  group_by(kg) %>% summarise(
    avg_frequency = mean(f),
    avg_monetary = mean(m),
    avg_revenue_contr = mean(rev),
    group_size = n(),
    avg_recency = mean(r),
    avg_gross_profit = mean(raw)) %>% 
  ungroup %>% 
  mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>% 
  data.frame

# Google Motion Chart
plot( gvisMotionChart(
  subset(df, group_size >= 20 & group_size <= 5000),     
  "kg", "dummy", options=list(width=800, height=600) ) )
```

##### D4. 互動式泡泡圖 (ggplot + plotly)
```{r}
filter(df, group_size >= 20 & group_size <= 5000)$group_size %>% 
  sqrt %>% range    # for bubble size adjustment
```

```{r fig.height=7, fig.width=8}
library(ggplot2)
library(plotly)

p = df %>% filter(group_size >= 20 & group_size <= 5000) %>% 
  ggplot(aes(x=avg_frequency, y=avg_monetary)) +
  geom_point(aes(size=group_size, col=avg_revenue_contr),alpha=0.7) +
  geom_text(aes(label=kg), alpha=0) +
  scale_size(range=c(1.5,12)) +
  #scale_color_gradient(low="green",high="magenta") +
  scale_colour_gradientn(
    colours = rev(c("red","yellow","green","lightblue","darkblue"))) +
  theme_bw() + guides(size=F) + labs(
    title="顧客集群(依購買產品)",
    color="平均營收貢獻", size="集群人數") +
  xlab("平均購買次數") + 
  ylab("平均購買金額")
plotly_build(p)
```

##### D5. 群組的代表性產品 (Signature Product)
```{r}
Sig(138)
```
<br><hr>


<div id="E">
### E. 購物籃分析 Baskets Analysis 
</div>


```{r}
dim(mx)   # 32066 cust * 10675 prod
```

##### E1. 準備資料 (for Association Rule Analysis)
```{r}
library(arules)
library(arulesViz)
bx = subset(Z, prod %in% as.numeric(colnames(mx)), 
            select=c("cust","prod"))  # select product items
bx = split(bx$prod, bx$cust)          # split by customer
bx = as(bx, "transactions")           # data for arules package
```

##### E2. Top20 熱賣產品
```{r fig.height=3, fig.width=7.2}
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)
```

##### E3. 關聯規則和Apriori演算法

關聯規則(A => B)

+ support: A被購買的機率 (A的基礎機率)
+ confidence: A被購買時，B被購買的機率
+ lift: A被購買時，B被購買的機率增加的倍數 (與B的基礎機率相比)
+ 一般來講support、confidence和lift越高的關聯規則越重要
+ support、confidence和lift設的越低(高)，找到的關聯規則越多(少)
+ 建議一開始把標準設低，先找到多一點規則，之後再用subset篩選出特定的規則來看


```{r}
rules = apriori(bx, parameter=list(supp=0.005, conf=0.6))
summary(rules)
```

##### E4. 檢視關聯規則

關聯規則 (A => B)：

+ support: A被購買的機率 (A的基礎機率)
+ confidence: A被購買時，B被購買的機率
+ lift: A被購買時，B被購買的機率增加的倍數 (與B的基礎機率相比)

```{r}
options(digits=4)
inspect(rules)
```

```{r}
# install.packages(
#   "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
#   repos=NULL)

# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
#               marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
#               shading="lift",
#               colors=c("red", "green"))
# 
```

##### E5. 互動圖表顯示
```{r}
plot(rules,colors=c("red","green"),engine="htmlwidget",
     marker=list(opacity=.6,size=8))
```

```{r}
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
     colors=c("red", "green"))
```

##### E6. 篩選產品、互動式關聯圖
```{r}
r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") 
```

+ 泡泡大小：support: A被購買的機率 (A的基礎機率)
+ 泡泡顏色：lift: A被購買時，B被購買的機率增加的倍數 (與B的基礎機率相比)


```{r}
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 
```
<br><hr>

<div id="F">
### F. 產品推薦 Product Recommendation
</div>


##### F1. 篩選顧客、產品
太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法，所以我們先對顧客和產品做一次篩選
```{r}
library(recommenderlab)
rx = mx[, colSums(mx > 0) >= 50]
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
dim(rx)
```

##### F2. 選擇產品評分方式
可以選擇要用

+ 購買次數 (realRatingMatrix) 或
+ 是否購買 (binaryRatingMatrix)

做模型。
```{r}
rx = as(rx, "realRatingMatrix")  # realRatingMatrix
bx = binarize(rx, minRating=1)   # binaryRatingMatrix
dim(bx)
```

##### F3. 設定模型(準確性)驗證方式
```{r}
set.seed(4321)
scheme = evaluationScheme(     
  bx, method="split", train = .75,  given=5)
```

##### F4. 設定推薦方法(參數)
```{r}
algorithms = list(            
  AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
  AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
  RANDOM = list(name="RANDOM", param=NULL),
  POPULAR = list(name="POPULAR", param=NULL),
  UBCF = list(name="UBCF", param=NULL),
  IBCF = list(name="IBCF", param=NULL) )
```

##### F5. 建模、預測、驗證(準確性)
```{r}
if(LOAD) {
  load("results.rdata")
} else {
  t0 = Sys.time()
  results = evaluate(            
    scheme, algorithms, type="topNList",
    n=c(5, 10, 15, 20))
  print(Sys.time() - t0)
  save(results, file="results.rdata")
}
```

##### F6. 模型準確性比較
```{r fig.height=5, fig.width=5}
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)
```

##### F7. 儲存產品推薦模型
```{r}
save(results, file="data/results.rdata")
```

<br><br><hr><br><br><br>

<style>

.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

.qiz {
  line-height: 1.75;
  background: #f0f0f0;
  border-left: 12px solid #ccffcc;
  padding: 4px;
  padding-left: 10px;
  color: #009900;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #0066ff;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}


h3{
  color: #008800;
  background: #e6ffe6;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #f8f8f8;
  line-height: 1.5;
  font-weight: bold;
}

.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:black;}
.tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:black;}
.tg .tg-b44r{background-color:#cbcefb;vertical-align:top}
.tg .tg-yw4l{vertical-align:top}

.comment {
    border-radius: 10px;
    border: 2px solid #ca8eff;
    padding: 20px; 
}

.nav {
    list-style-type: none;
    margin: 0;
    padding: 0;
    width: 200px;
    background-color: #f1f1f1;
    position: fixed;
    left: 5px;
}

li a {
    display: block;
    color: #000;
    padding: 8px 16px;
    text-decoration: none;
}

li a.active {
    background-color: #4CAF50;
    color: white;
}

li a:hover:not(.active) {
    background-color: #555;
    color: white;
}

</style>

