Preparing Data

Libraries
Sys.setlocale("LC_ALL","C")
[1] "C"
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)

新增變數

新增CHR
X$CHR=as.numeric(X$date)
X$CHR1=X$CHR>=11309 & X$CHR<=11315
X$CHR2=as.numeric(X$CHR1)
計算maxweekday
# table(X$cust) %>% sort %>% tail
maxWeekday = function(x){
  y = table(x) %>% t %>% as.matrix
  result = which.max(y) %>% 
    colnames(y)[.] %>% 
    as.Date %>% 
    weekdays
  return(result)
}
計算其餘9個變數
d0 = max(X$date)
NewVar = group_by(X, cust) %>% summarise(
  prodPrice = mean(total / pieces), # 商品平均單價
  weekday = maxWeekday(date), # 顧客最常禮拜幾來買
  CHR = sum(CHR2)>0, # chistmas
  k = (1 + as.integer(difftime(max(date), min(date), units="days"))) / n(), # 個人平均購買週期
  areaEFH = ((first(area)=="E") | (first(area)=="F") | (first(area)=="H")), # 是否住在EFH區域
  areaACDEF = ((first(area)=="A") | (first(area)=="C") | (first(area)=="D") | (first(area)=="E") | (first(area)=="F")),# 是否住在ACDEF區域
  mon1 = sum(date<="2000-11-30"),  # 顧客第一個月來購買的次數
  mon2 = sum(date<="2000-12-31" & date>"2000-11-30"), #第二個月
  mon3 = sum(date<="2001-01-31" & date>"2000-12-31")  #及第三個月來購買的次數
  ) %>% data.frame    # 28584
將新增的變數和資料A合併
A = merge(A, NewVar, all.x = T)
把weekday轉換成有順序性的facotr變數
A$weekday = factor(A$weekday
    , ordered=TRUE
    , levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

新增變數(2)

計算年齡區間
A$ageGroup = sapply(seq(1,nrow(A),1), function(x){
  if((A[x, "age"] == "A") | (A[x, "age"] == "B") | (A[x, "age"] == "C")) 
    A$ageGroup = 1 #35歲以下者
  else if((A[x, "age"] == "D") | (A[x, "age"] == "E") | (A[x, "age"] == "F")) 
    A$ageGroup = 2 #35~50歲
  else
    A$ageGroup = 3 #50歲以上者
})
A$ageGroup = as.factor(A$ageGroup)
計算Weekday Percentage: W1 ~ W7
X = X %>% mutate(wday = format(date, "%w"))
# table(X$wday)
# 製作顧客星期矩陣
mx = xtabs(~ cust + wday, X)
# dim(mx)
# mx[1:5,]
mx = mx / rowSums(mx)
# mx[1:5,]
將矩陣併入入data frame
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>% 
  right_join(A, by='cust')

新增變數(3)

計算顧客為weekend或weekday shopper
A$weekend = sapply(seq(1,nrow(A),1), function(x){
  w15 = A[x,c("W1","W2","W3","W4","W5")] %>% sum
  w67 = A[x,c("W6","W7")] %>% sum
  if(w67>w15) 
    A$weekend = 1 #偏好週末消費
  else if(w67==w15)
    A$weekend = 2 #無特別偏好
  else
    A$weekend = 3 #偏好平日者
})
A$weekend = as.factor(A$weekend)
顧客產品矩陣
library(Matrix)
library(slam)
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
cpm = xtabs(~ cust + prod, Z, sparse=T)

刪去購買次數小於10的產品

table(colSums(cpm) < 10)       # 12862

FALSE  TRUE 
 9652 12862 
cpm = cpm[, colSums(cpm) > 10]   
cpm = cpm[, order(-colSums(cpm))]   # order product by frequency
dim(cpm)                       # 28584  9149
[1] 28584  9149

是否有購買前5大、10大、20大熱銷產品

hot5 = cpm[,1:5] %>% rowSums
A$hot5 = hot5>0
hot10 = cpm[,1:10] %>% rowSums
A$hot10 = hot10>0
hot20 = cpm[,1:20] %>% rowSums
A$hot20 = hot20>0
前20個熱銷產品分別購買次數

確認cpm和A的顧客順序及大小是否有相同

cpm = cpm %>% as.matrix
(A$cust == rownames(cpm)) %>% sum
[1] 28584

合併兩個資料並將欄位名稱變更

A = cbind(A,cpm[,1:20])
colnames(A)[33:52] = sapply(seq(33,52,1), function(x){
  names = paste0("p",colnames(A)[x])
})
summary
summary(A)
      cust                W1               W2               W3               W4       
 Min.   :    1069   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
 1st Qu.: 1060898   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
 Median : 1654100   Median :0.0000   Median :0.0000   Median :0.0000   Median :0.000  
 Mean   : 1461070   Mean   :0.2310   Mean   :0.1399   Mean   :0.1176   Mean   :0.103  
 3rd Qu.: 1945003   3rd Qu.:0.3484   3rd Qu.:0.2000   3rd Qu.:0.1333   3rd Qu.:0.000  
 Max.   :20002000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
                                                                                      
       W5               W6               W7               r               s               f         
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
 Median :0.0000   Median :0.0000   Median :0.0000   Median :21.00   Median :68.00   Median : 2.000  
 Mean   :0.1188   Mean   :0.1136   Mean   :0.1762   Mean   :32.12   Mean   :61.27   Mean   : 3.089  
 3rd Qu.:0.1354   3rd Qu.:0.1111   3rd Qu.:0.2500   3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :92.00   Max.   :92.00   Max.   :60.000  
                                                                                                    
       m                rev             raw               age            area          amount     
 Min.   :    8.0   Min.   :    8   Min.   : -742.0   D      :5832   E      :9907   Min.   :    8  
 1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   C      :5238   F      :7798   1st Qu.:  454  
 Median :  709.5   Median : 1566   Median :  218.0   E      :4514   C      :3169   Median :  993  
 Mean   : 1012.4   Mean   : 2711   Mean   :  420.8   F      :3308   G      :3052   Mean   : 1499  
 3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0   B      :2802   D      :1778   3rd Qu.: 1955  
 Max.   :10634.0   Max.   :99597   Max.   :15565.0   G      :1940   H      :1295   Max.   :28089  
                                                     (Other):4950   (Other):1585   NA's   :15342  
    buy            prodPrice            weekday        CHR                k           areaEFH       
 Mode :logical   Min.   :   5.00   Sunday   :6761   Mode :logical   Min.   : 1.000   Mode :logical  
 FALSE:15342     1st Qu.:  63.36   Monday   :3463   FALSE:27066     1st Qu.: 1.000   FALSE:9584     
 TRUE :13242     Median :  86.44   Tuesday  :2900   TRUE :1518      Median : 5.571   TRUE :19000    
                 Mean   : 115.97   Wednesday:2727                   Mean   : 8.550                  
                 3rd Qu.: 123.20   Thursday :3878                   3rd Qu.:14.000                  
                 Max.   :3939.00   Friday   :3617                   Max.   :45.000                  
                                   Saturday :5238                                                   
 areaACDEF            mon1             mon2              mon3        ageGroup  weekend  
 Mode :logical   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000   1: 9445   1: 5337  
 FALSE:5130      1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.: 0.000   2:13654   2: 3205  
 TRUE :23454     Median : 1.000   Median : 1.0000   Median : 1.000   3: 5485   3:20042  
                 Mean   : 1.112   Mean   : 0.9339   Mean   : 1.043                      
                 3rd Qu.: 1.000   3rd Qu.: 1.0000   3rd Qu.: 1.000                      
                 Max.   :26.000   Max.   :21.0000   Max.   :25.000                      
                                                                                        
    hot5           hot10           hot20         p4711271000014   p4714981010038    p4719090900065   
 Mode :logical   Mode :logical   Mode :logical   Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000  
 FALSE:19029     FALSE:16425     FALSE:13336     1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000  
 TRUE :9555      TRUE :12159     TRUE :15248     Median :0.0000   Median : 0.0000   Median :0.00000  
                                                 Mean   :0.1837   Mean   : 0.1631   Mean   :0.07679  
                                                 3rd Qu.:0.0000   3rd Qu.: 0.0000   3rd Qu.:0.00000  
                                                 Max.   :5.0000   Max.   :16.0000   Max.   :6.00000  
                                                                                                     
 p4711080010112     p4710908131589   p4713985863121  p4710114128038    p4710421090059   
 Min.   : 0.00000   Min.   :0.0000   Min.   :0.000   Min.   :0.00000   Min.   : 0.0000  
 1st Qu.: 0.00000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.00000   1st Qu.: 0.0000  
 Median : 0.00000   Median :0.0000   Median :0.000   Median :0.00000   Median : 0.0000  
 Mean   : 0.06014   Mean   :0.0543   Mean   :0.053   Mean   :0.05153   Mean   : 0.0515  
 3rd Qu.: 0.00000   3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.: 0.0000  
 Max.   :21.00000   Max.   :8.0000   Max.   :9.000   Max.   :5.00000   Max.   :13.0000  
                                                                                        
 p4710291112172   p4712425010712    p4710583996008    p4710011401128    p4710088410139   
 Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   :0.0508   Mean   :0.05076   Mean   :0.04894   Mean   :0.04807   Mean   :0.04803  
 3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :5.0000   Max.   :5.00000   Max.   :7.00000   Max.   :8.00000   Max.   :5.00000  
                                                                                         
 p4719090900058     p4710094097768     p4710088410610    p4710085120628    p4710265849066   
 Min.   : 0.00000   Min.   : 0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median : 0.00000   Median : 0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   : 0.04628   Mean   : 0.04429   Mean   :0.04394   Mean   :0.04121   Mean   :0.04058  
 3rd Qu.: 0.00000   3rd Qu.: 0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :13.00000   Max.   :17.00000   Max.   :6.00000   Max.   :7.00000   Max.   :4.00000  
                                                                                            
 p4710018004605    p4710908131534   
 Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median :0.00000  
 Mean   :0.03985   Mean   :0.03778  
 3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :6.00000   Max.   :5.00000  
                                    



Spliting the Data

A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR = A[spl, ]
TS = A[!spl, ]
# 把"cust"及"amount"欄位拿掉
TR1 = subset(TR, select = -c(cust, amount))
TS1 = subset(TS, select = -c(cust, amount))



Turn on Parallel Processing
#做平行運算
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4



CART Classification Model

CV Control for Classification
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)
CV: rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR1, method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(.cp = seq(0.0002,0.001,0.0001)))
Sys.time() - t0
Time difference of 41.4574 secs
plot(cv.rpart);cv.rpart
CART 

20008 samples
   49 predictor
    2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 2 times) 
Summary of sample sizes: 18007, 18007, 18007, 18007, 18007, 18007, ... 
Resampling results across tuning parameters:

  cp     ROC        Sens       Spec     
  2e-04  0.6945636  0.7386622  0.5807015
  3e-04  0.7063938  0.7728375  0.5760073
  4e-04  0.7161350  0.7787498  0.5792441
  5e-04  0.7234962  0.7929951  0.5694282
  6e-04  0.7256986  0.7994671  0.5675938
  7e-04  0.7275238  0.8028204  0.5660290
  8e-04  0.7258432  0.8104117  0.5597687
  9e-04  0.7236823  0.8129257  0.5577191
  1e-03  0.7224932  0.8109231  0.5605789

ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 7e-04.

Classification Tree, Final Model
#Create a CART model
rpart1 = rpart(buy ~ ., TR1, method="class", cp=0.0008)
PredictCART1 = predict(rpart1, TS, type="prob")[,2] 
compute AUC
colAUC(PredictCART1,TS$buy,T) #0.7433228
                [,1]
no vs. yes 0.7433228

CART model without cp Classification Model

# CART model
rpart2 = rpart(buy ~ ., data = TR1, method="class")
# Make predictions
PredictCART2 = predict(rpart2, newdata = TS1, type = "prob")[,2]
compute AUC
colAUC(PredictCART2, TS$buy,T) #0.6983998
                [,1]
no vs. yes 0.6983998

GLM Classification Model

glm2 = glm(buy ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -age -areaEFH -areaACDEF -weekday -weekend -hot20 -hot10 -hot5 -ageGroup - CHR -mon1 -mon2 -mon3 , TR1[,1:30], family=binomial())
summary(glm2)

Call:
glm(formula = buy ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 - age - 
    areaEFH - areaACDEF - weekday - weekend - hot20 - hot10 - 
    hot5 - ageGroup - CHR - mon1 - mon2 - mon3, family = binomial(), 
    data = TR1[, 1:30])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.5126  -0.8816  -0.7090   1.0349   2.3789  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.034e+00  1.083e-01  -9.551  < 2e-16 ***
r           -2.108e-02  2.319e-03  -9.091  < 2e-16 ***
s            1.821e-02  2.341e-03   7.776 7.47e-15 ***
f            2.252e-01  2.215e-02  10.170  < 2e-16 ***
m           -3.110e-06  2.780e-05  -0.112  0.91093    
rev          4.874e-05  1.958e-05   2.489  0.01281 *  
raw         -2.850e-04  8.663e-05  -3.290  0.00100 ** 
areaB       -3.508e-02  1.321e-01  -0.266  0.79059    
areaC       -2.072e-01  1.044e-01  -1.985  0.04716 *  
areaD        3.703e-02  1.109e-01   0.334  0.73850    
areaE        2.606e-01  9.672e-02   2.695  0.00705 ** 
areaF        1.773e-01  9.738e-02   1.821  0.06859 .  
areaG       -4.751e-02  1.043e-01  -0.456  0.64869    
areaH       -1.878e-01  1.215e-01  -1.545  0.12234    
prodPrice   -7.209e-04  1.491e-04  -4.836 1.32e-06 ***
k           -2.034e-02  5.019e-03  -4.054 5.04e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23270  on 19992  degrees of freedom
AIC: 23302

Number of Fisher Scoring iterations: 6
compute AUC
predGLM =  predict(glm2, TS1, type="response")
colAUC(predGLM, TS$buy,T) #0.7578518
                [,1]
no vs. yes 0.7578518

TS$buy %>% table
.
  no  yes 
4603 3973 
confuse matrix
cm = table(actual = TS1$buy, predict = predGLM > 0.5); cm
      predict
actual FALSE TRUE
   no   3822  781
   yes  1759 2214
compute accuracy
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts 
[1] 0.7038246
Interaction Terms
  • 根據CART分法產生了對f做分組的想法
    • 分成 1,2,3,4,>=5 共五組
A$f_group = ifelse(A$f == 1,"1",
                   ifelse(A$f == 2,"2",
                   ifelse(A$f == 3,"3",
                   ifelse(A$f == 4,"4",">=5"))))
  • 常理上f(frequecy)高不代表他就是常貴客,也許他只是最初很常來買,但已經很久沒再來了,因此還要考慮到r(recency),如果f高r低,代表這類顧客常來且最近有來,是我們的常貴客。
  • 因此我們再將r做分組,讓f_group和r_group做交互。r的分組則用百分位數分五組。
A <- A %>% mutate(r_group = ntile(r, 5))
#將f_group和r_group轉為factor
A[,c("f_group","r_group")]  <- lapply(A[,c("f_group","r_group")], factor)
#A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR_inter = A[spl, ]
TS_inter = A[!spl, ]
# 把"cust"及"amount"欄位拿掉
TR1_inter = subset(TR_inter, select = -c(cust, amount))
TS1_inter = subset(TS_inter, select = -c(cust, amount))
  • 再分出一組新的TR和TS
glm2 = glm(buy ~ s+m+rev+k+prodPrice+area+mon2+f_group:r_group, TR1_inter, family=binomial())
# glm2 = glm(buy ~ area, TR1, family = binomial) 
# area E,F,H 很顯著
summary(glm2)

Call:
glm(formula = buy ~ s + m + rev + k + prodPrice + area + mon2 + 
    f_group:r_group, family = binomial(), data = TR1_inter)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3354  -0.8615  -0.6828   1.0047   2.4482  

Coefficients: (1 not defined because of singularities)
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -1.364e+00  4.408e-01  -3.094 0.001978 ** 
s                    7.616e-03  2.169e-03   3.511 0.000446 ***
m                   -4.748e-05  2.634e-05  -1.803 0.071425 .  
rev                  2.735e-05  1.091e-05   2.506 0.012199 *  
k                   -1.251e-02  5.477e-03  -2.284 0.022383 *  
prodPrice           -7.031e-04  1.500e-04  -4.688 2.76e-06 ***
areaB               -4.415e-02  1.329e-01  -0.332 0.739755    
areaC               -2.230e-01  1.051e-01  -2.121 0.033905 *  
areaD                1.829e-02  1.118e-01   0.164 0.869989    
areaE                2.408e-01  9.731e-02   2.474 0.013359 *  
areaF                1.609e-01  9.803e-02   1.641 0.100784    
areaG               -6.900e-02  1.050e-01  -0.657 0.511122    
areaH               -1.786e-01  1.214e-01  -1.471 0.141207    
mon2                 1.021e-01  2.268e-02   4.502 6.73e-06 ***
f_group1:r_group1    4.344e-01  4.364e-01   0.995 0.319596    
f_group2:r_group1    8.794e-01  4.394e-01   2.001 0.045375 *  
f_group3:r_group1    1.289e+00  4.273e-01   3.015 0.002567 ** 
f_group4:r_group1    1.669e+00  4.220e-01   3.954 7.68e-05 ***
f_group>=5:r_group1  2.372e+00  4.090e-01   5.800 6.62e-09 ***
f_group1:r_group2    3.630e-01  4.290e-01   0.846 0.397384    
f_group2:r_group2    8.930e-01  4.328e-01   2.064 0.039063 *  
f_group3:r_group2    1.278e+00  4.212e-01   3.035 0.002406 ** 
f_group4:r_group2    1.518e+00  4.196e-01   3.618 0.000297 ***
f_group>=5:r_group2  2.160e+00  4.108e-01   5.256 1.47e-07 ***
f_group1:r_group3    1.601e-01  4.195e-01   0.382 0.702662    
f_group2:r_group3    8.393e-01  4.232e-01   1.983 0.047317 *  
f_group3:r_group3    1.207e+00  4.170e-01   2.895 0.003792 ** 
f_group4:r_group3    1.312e+00  4.198e-01   3.125 0.001779 ** 
f_group>=5:r_group3  1.717e+00  4.141e-01   4.147 3.37e-05 ***
f_group1:r_group4   -1.017e-01  4.079e-01  -0.249 0.803046    
f_group2:r_group4    5.604e-01  4.109e-01   1.364 0.172627    
f_group3:r_group4    9.261e-01  4.138e-01   2.238 0.025216 *  
f_group4:r_group4    1.011e+00  4.266e-01   2.370 0.017795 *  
f_group>=5:r_group4  1.206e+00  4.300e-01   2.805 0.005028 ** 
f_group1:r_group5   -2.899e-01  4.018e-01  -0.721 0.470678    
f_group2:r_group5    2.614e-01  4.069e-01   0.642 0.520654    
f_group3:r_group5    3.510e-01  4.272e-01   0.822 0.411258    
f_group4:r_group5    1.009e-01  5.208e-01   0.194 0.846315    
f_group>=5:r_group5         NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23266  on 19970  degrees of freedom
AIC: 23342

Number of Fisher Scoring iterations: 5
pred =  predict(glm2, TS1_inter, type="response")
prediction from a rank-deficient fit may be misleading
colAUC(pred, TS$buy)    # 0.7466507
                [,1]
no vs. yes 0.7558844
TS$buy %>% table
.
  no  yes 
4603 3973 
#刪除f_group,r_group 避免影響後面
A$f_group <- NULL
A$r_group <- NULL

kmeans集群分析後再做預測

RFM分群參考:

集群分析的模型與預測方法參考:

移除目標變數與類別變數
#colnames(TR1)
LTR = TR1[,c(8:13,17,20)]
LTS = TS1[,c(8:13,17,20)]
#LTR = TR1[,c(1:13,16,18:24)]
#LTS = TS1[,c(1:13,16,18:24)]
#LTR = TR1[,c(2:13,16,18:24,27:49)]
#LTS = TS1[,c(2:13,16,18:24,27:49)]
區隔變數常態化
library(caret)
preproc = preProcess(LTR)
NTR = predict(preproc, LTR)
NTS = predict(preproc, LTS)
新增group欄位表示顧客屬於哪群
#summary(NTR)
set.seed(144)
km <- kmeans(NTR, 7)
table(km$cluster)

   1    2    3    4    5    6    7 
 303 2251 4159 1354 6052 5701  188 
NTR$group = km$cluster
視覺化分群結果
  • 整理每群資料
df = group_by(NTR, group) %>% summarise(
  avg_frequency = mean(f),
  avg_monetary = mean(m),
  total_revenue = sum(rev),
  group_size = n(),
  avg_recency = mean(r)
  #avg_hot5 = mean(hot5),
  #avg_hot10 = mean(hot10),
  #avg_hot20 = mean(hot20)
  #profit = sum(raw)
  ) %>% ungroup %>% 
  mutate(
    #pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
    #pc_profit = round(100*profit/sum(profit),1),                # % profit contrib. 
    dummy = 2001  # to fit googleViz's data format
  ) %>% data.frame

googlevis 視覺化

plot( gvisMotionChart(
  df, "group", "dummy",
  options=list(width=800, height=600) 
  ))

kcca 分群模型

library(flexclust)
Loading required package: grid
Loading required package: modeltools
Loading required package: stats4
km.kcca = as.kcca(km, NTR[,1:8]) #NTR不取group
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
CTR = predict(km.kcca)
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
CTS = predict(km.kcca, newdata=NTS)
依集群分析結果切割資料
tapply(TR1$buy, CTR, mean)
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
 1  2  3  4  5  6  7 
NA NA NA NA NA NA NA 
#從googlevis看第2群是高f低r群,實際上第四個月的buy比例也確實高
分群模型
  • 對每群做個別的train與predict
#colnames(TR1)
M = lapply(split(TR1[,c(8:13,17,20,16)], CTR), function(x) 
  glm(buy~., data=x, family=binomial) )
#sapply(M, coef)
#glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
分群測試AUC
Pred = lapply(1:7, function(i) 
  predict(M[[i]], TS1[CTS==i,], type='response') )
 sapply(1:7, function(i) 
   colAUC(Pred[[i]], TS1$buy[CTS==i]))
[1] 0.8928571 0.7162432 0.6366921 0.6616960 0.6222446 0.6843932 0.6673993
#length(Pred[[1]])
#length(TS1$buy[CTS==1])
分群整體測試AUC
# table( do.call(c, split(TS1$buy,CTS)), do.call(c, Pred) > 0.5 ) %>%
#   {sum(diag(.))/sum(.)}
a = cbind(TS1$buy[CTS==1],Pred[[1]])
b = cbind(TS1$buy[CTS==2],Pred[[2]])
c = cbind(TS1$buy[CTS==3],Pred[[3]])
d = cbind(TS1$buy[CTS==4],Pred[[4]])
e = cbind(TS1$buy[CTS==5],Pred[[5]])
f = cbind(TS1$buy[CTS==6],Pred[[6]])
g = cbind(TS1$buy[CTS==7],Pred[[7]])
#h = cbind(TS1$buy[CTS==8],Pred[[8]])
CF = rbind(a,b,c,d,e,f,g)
colAUC(CF[,2],CF[,1])
             [,1]
1 vs. 2 0.7564442

Classification ModelEnsembling

# GLM + CART1
result = (predGLM + PredictCART1)/2
colAUC(result, TS$buy,T) #0.7584242
                [,1]
no vs. yes 0.7584242

# GLM + CART2
result = (predGLM + PredictCART2)/2
colAUC(result, TS$buy,T) # 0.7584827 higher
                [,1]
no vs. yes 0.7584827





Regression Model

Preparing the Data
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2) %>% subset(., select = -c(cust, buy))
TS2 = subset(A2, !spl2) %>% subset(., select = -c(cust, buy))

linear model

lm1 = lm(amount ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6  -areaACDEF-area -weekday  -hot5-hot10-hot20-ageGroup-mon1-mon2 ,TR2)
summary(lm1)

Call:
lm(formula = amount ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 - 
    areaACDEF - area - weekday - hot5 - hot10 - hot20 - ageGroup - 
    mon1 - mon2, data = TR2)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8497 -0.2264  0.0478  0.2769  1.5540 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     1.335e+00  4.704e-02  28.376  < 2e-16 ***
r              -1.546e-03  5.262e-04  -2.937  0.00332 ** 
s               2.038e-03  5.155e-04   3.954 7.73e-05 ***
f               1.170e-02  2.705e-03   4.325 1.54e-05 ***
m               5.367e-01  4.345e-02  12.350  < 2e-16 ***
rev            -3.019e-02  4.133e-02  -0.731  0.46505    
raw             6.186e-05  8.795e-06   7.033 2.16e-12 ***
ageB            7.682e-02  2.492e-02   3.082  0.00206 ** 
ageC            1.194e-01  2.289e-02   5.218 1.85e-07 ***
ageD            1.262e-01  2.259e-02   5.588 2.37e-08 ***
ageE            1.321e-01  2.312e-02   5.712 1.15e-08 ***
ageF            1.054e-01  2.416e-02   4.365 1.29e-05 ***
ageG            8.057e-02  2.635e-02   3.058  0.00223 ** 
ageH            7.178e-02  3.106e-02   2.311  0.02083 *  
ageI            6.980e-02  3.196e-02   2.184  0.02899 *  
ageJ           -1.970e-02  2.806e-02  -0.702  0.48279    
ageK            1.130e-01  3.768e-02   2.999  0.00272 ** 
prodPrice      -3.198e-04  5.026e-05  -6.364 2.07e-10 ***
CHRTRUE        -4.254e-02  1.746e-02  -2.437  0.01484 *  
k              -5.485e-03  1.073e-03  -5.111 3.26e-07 ***
areaEFHTRUE    -1.471e-02  1.029e-02  -1.430  0.15289    
mon3            1.524e-02  4.845e-03   3.145  0.00166 ** 
weekend2       -3.722e-03  1.755e-02  -0.212  0.83209    
weekend3       -4.709e-03  1.262e-02  -0.373  0.70897    
p4711271000014 -1.239e-02  9.222e-03  -1.343  0.17922    
p4714981010038 -1.616e-02  8.396e-03  -1.925  0.05428 .  
p4719090900065 -1.011e-02  1.227e-02  -0.824  0.40996    
p4711080010112  1.684e-03  9.343e-03   0.180  0.85697    
p4710908131589  1.263e-02  1.461e-02   0.864  0.38752    
p4713985863121  2.320e-02  1.386e-02   1.674  0.09424 .  
p4710114128038  1.809e-02  1.445e-02   1.252  0.21066    
p4710421090059 -1.361e-02  1.319e-02  -1.031  0.30247    
p4710291112172  3.389e-02  1.499e-02   2.262  0.02374 *  
p4712425010712  3.541e-02  1.513e-02   2.340  0.01929 *  
p4710583996008  1.054e-02  1.377e-02   0.766  0.44376    
p4710011401128 -1.081e-03  1.237e-02  -0.087  0.93037    
p4710088410139  4.659e-03  1.527e-02   0.305  0.76035    
p4719090900058  2.458e-02  1.613e-02   1.524  0.12764    
p4710094097768  1.946e-02  1.103e-02   1.765  0.07760 .  
p4710088410610  1.353e-02  1.377e-02   0.982  0.32598    
p4710085120628  6.767e-04  1.616e-02   0.042  0.96661    
p4710265849066 -1.987e-02  1.750e-02  -1.135  0.25628    
p4710018004605  1.555e-02  1.562e-02   0.996  0.31935    
p4710908131534 -7.272e-04  1.635e-02  -0.044  0.96453    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4179 on 9225 degrees of freedom
Multiple R-squared:  0.305, Adjusted R-squared:  0.3017 
F-statistic: 94.14 on 43 and 9225 DF,  p-value: < 2.2e-16

compute R-squared

r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3049726 0.2732476

CART Regression Model

Cross-Validation
library(caret)
library(e1071)
library(rpart)
# Define cross-validation experiment
# k=10
numFolds = trainControl( method = "cv", number = 10 ) 
# pick the posiible values for cp
cpGrid = expand.grid( .cp = seq(0.001,0.1,0.001)) 
# Perform the cross validation
cv = train(amount ~ ., data = TR2, method = "rpart", trControl = numFolds, tuneGrid = cpGrid )
# method="rpart" : we wanto to validate a CART model
plot(cv);cv
CART 

9269 samples
  49 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 8343, 8343, 8340, 8344, 8342, 8342, ... 
Resampling results across tuning parameters:

  cp     RMSE       Rsquared   MAE      
  0.001  0.4280068  0.2711698  0.3286290
  0.002  0.4236615  0.2828408  0.3262958
  0.003  0.4242213  0.2806896  0.3266302
  0.004  0.4262393  0.2737030  0.3284240
  0.005  0.4267641  0.2717313  0.3289417
  0.006  0.4269306  0.2712072  0.3292381
  0.007  0.4269095  0.2712740  0.3293730
  0.008  0.4289031  0.2646008  0.3304202
  0.009  0.4298855  0.2611819  0.3311835
  0.010  0.4328875  0.2505936  0.3339821
  0.011  0.4328975  0.2505476  0.3338670
  0.012  0.4333520  0.2489805  0.3344444
  0.013  0.4348697  0.2436760  0.3355859
  0.014  0.4367141  0.2372062  0.3372679
  0.015  0.4368929  0.2365914  0.3373805
  0.016  0.4374052  0.2348192  0.3375237
  0.017  0.4374052  0.2348192  0.3375237
  0.018  0.4374052  0.2348192  0.3375237
  0.019  0.4374052  0.2348192  0.3375237
  0.020  0.4374052  0.2348192  0.3375237
  0.021  0.4374052  0.2348192  0.3375237
  0.022  0.4374052  0.2348192  0.3375237
  0.023  0.4374052  0.2348192  0.3375237
  0.024  0.4374052  0.2348192  0.3375237
  0.025  0.4374052  0.2348192  0.3375237
  0.026  0.4374052  0.2348192  0.3375237
  0.027  0.4374052  0.2348192  0.3375237
  0.028  0.4374052  0.2348192  0.3375237
  0.029  0.4374052  0.2348192  0.3375237
  0.030  0.4386075  0.2305706  0.3383498
  0.031  0.4415920  0.2202157  0.3402422
  0.032  0.4435006  0.2135814  0.3417392
  0.033  0.4441417  0.2112573  0.3422859
  0.034  0.4441417  0.2112573  0.3422859
  0.035  0.4441417  0.2112573  0.3422859
  0.036  0.4441417  0.2112573  0.3422859
  0.037  0.4441417  0.2112573  0.3422859
  0.038  0.4441417  0.2112573  0.3422859
  0.039  0.4466367  0.2024459  0.3438247
  0.040  0.4466367  0.2024459  0.3438247
  0.041  0.4466367  0.2024459  0.3438247
  0.042  0.4466367  0.2024459  0.3438247
  0.043  0.4466367  0.2024459  0.3438247
  0.044  0.4466367  0.2024459  0.3438247
  0.045  0.4466367  0.2024459  0.3438247
  0.046  0.4466367  0.2024459  0.3438247
  0.047  0.4479206  0.1978418  0.3448467
  0.048  0.4527458  0.1803652  0.3490769
  0.049  0.4542481  0.1749889  0.3503303
  0.050  0.4555422  0.1704642  0.3515812
  0.051  0.4574262  0.1635717  0.3535963
  0.052  0.4581809  0.1606468  0.3543684
  0.053  0.4581809  0.1606468  0.3543684
  0.054  0.4581809  0.1606468  0.3543684
  0.055  0.4581809  0.1606468  0.3543684
  0.056  0.4581809  0.1606468  0.3543684
  0.057  0.4581809  0.1606468  0.3543684
  0.058  0.4581809  0.1606468  0.3543684
  0.059  0.4581809  0.1606468  0.3543684
  0.060  0.4581809  0.1606468  0.3543684
  0.061  0.4581809  0.1606468  0.3543684
  0.062  0.4581809  0.1606468  0.3543684
  0.063  0.4581809  0.1606468  0.3543684
  0.064  0.4581809  0.1606468  0.3543684
  0.065  0.4581809  0.1606468  0.3543684
  0.066  0.4581809  0.1606468  0.3543684
  0.067  0.4581809  0.1606468  0.3543684
  0.068  0.4581809  0.1606468  0.3543684
  0.069  0.4581809  0.1606468  0.3543684
  0.070  0.4581809  0.1606468  0.3543684
  0.071  0.4581809  0.1606468  0.3543684
  0.072  0.4581809  0.1606468  0.3543684
  0.073  0.4581809  0.1606468  0.3543684
  0.074  0.4581809  0.1606468  0.3543684
  0.075  0.4581809  0.1606468  0.3543684
  0.076  0.4581809  0.1606468  0.3543684
  0.077  0.4581809  0.1606468  0.3543684
  0.078  0.4581809  0.1606468  0.3543684
  0.079  0.4581809  0.1606468  0.3543684
  0.080  0.4581809  0.1606468  0.3543684
  0.081  0.4581809  0.1606468  0.3543684
  0.082  0.4581809  0.1606468  0.3543684
  0.083  0.4581809  0.1606468  0.3543684
  0.084  0.4581809  0.1606468  0.3543684
  0.085  0.4581809  0.1606468  0.3543684
  0.086  0.4581809  0.1606468  0.3543684
  0.087  0.4581809  0.1606468  0.3543684
  0.088  0.4581809  0.1606468  0.3543684
  0.089  0.4581809  0.1606468  0.3543684
  0.090  0.4581809  0.1606468  0.3543684
  0.091  0.4581809  0.1606468  0.3543684
  0.092  0.4581809  0.1606468  0.3543684
  0.093  0.4581809  0.1606468  0.3543684
  0.094  0.4581809  0.1606468  0.3543684
  0.095  0.4581809  0.1606468  0.3543684
  0.096  0.4581809  0.1606468  0.3543684
  0.097  0.4581809  0.1606468  0.3543684
  0.098  0.4581809  0.1606468  0.3543684
  0.099  0.4581809  0.1606468  0.3543684
  0.100  0.4581809  0.1606468  0.3543684

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.002.

Create a CART model

rpart2 = rpart(amount ~ ., data = TR2, cp = 0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) -  TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2521828

model ensembling

predLM = predict(lm1, newdata=TS2)
predCART = predict(rpart2, newdata=TS2)
result = (predLM + predCART)/2
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((result -  TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2740926




Stop Parallel Processing
stopCluster(clust)








---
title: "Data Preparation, Ta-Feng"
author: "Group 4 B034020012謝雨靜  B034020027陳韻卉 B044012015王譯苓  M064111025  黃威豪  M064111039王  欣  行傳所    孟祥瑄"
output: html_notebook

---

<br>

### Preparing Data

##### Libraries 
```{r echo=T, message=F, cache=F, warning=F}
Sys.setlocale("LC_ALL","C")
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
```

### 新增變數

* CHR：聖誕節前一週有沒有來買
* maxWeekday：顧客最常禮拜幾來買
* prodPrice：商品平均單價
* weekday：顧客最常禮拜幾來買
* k：顧客個人平均購買週期
* areaEFH：顧客是否住在ACDEF區域
* areaACDEF：顧客是否住在ACDEF區域
* mon1：顧客第一個月來購買的次數
* mon2：顧客第二個月來購買的次數
* mon3：顧客第三個月來購買的次數

##### 新增CHR
```{r}
X$CHR=as.numeric(X$date)

X$CHR1=X$CHR>=11309 & X$CHR<=11315
X$CHR2=as.numeric(X$CHR1)
```


##### 計算maxweekday
```{r}
# table(X$cust) %>% sort %>% tail
maxWeekday = function(x){
  y = table(x) %>% t %>% as.matrix
  result = which.max(y) %>% 
    colnames(y)[.] %>% 
    as.Date %>% 
    weekdays
  return(result)
}
```


##### 計算其餘9個變數
```{r}
d0 = max(X$date)
NewVar = group_by(X, cust) %>% summarise(
  prodPrice = mean(total / pieces), # 商品平均單價
  weekday = maxWeekday(date), # 顧客最常禮拜幾來買
  CHR = sum(CHR2)>0, # chistmas
  k = (1 + as.integer(difftime(max(date), min(date), units="days"))) / n(), # 個人平均購買週期
  areaEFH = ((first(area)=="E") | (first(area)=="F") | (first(area)=="H")), # 是否住在EFH區域
  areaACDEF = ((first(area)=="A") | (first(area)=="C") | (first(area)=="D") | (first(area)=="E") | (first(area)=="F")),# 是否住在ACDEF區域
  mon1 = sum(date<="2000-11-30"),  # 顧客第一個月來購買的次數
  mon2 = sum(date<="2000-12-31" & date>"2000-11-30"), #第二個月
  mon3 = sum(date<="2001-01-31" & date>"2000-12-31")  #及第三個月來購買的次數
  ) %>% data.frame    # 28584
```


##### 將新增的變數和資料A合併
```{r}
A = merge(A, NewVar, all.x = T)
```


##### 把weekday轉換成有順序性的facotr變數
```{r}
A$weekday = factor(A$weekday
    , ordered=TRUE
    , levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
```

### 新增變數(2)

* ageGroup：將年齡區分為更寬的range
    * 35歲以下為1
    * 35~50以下為2
    * 50以上為3

* Weekday Percentage：顧客每日來買的機率


##### 計算年齡區間
```{r}
A$ageGroup = sapply(seq(1,nrow(A),1), function(x){
  if((A[x, "age"] == "A") | (A[x, "age"] == "B") | (A[x, "age"] == "C")) 
    A$ageGroup = 1 #35歲以下者
  else if((A[x, "age"] == "D") | (A[x, "age"] == "E") | (A[x, "age"] == "F")) 
    A$ageGroup = 2 #35~50歲
  else
    A$ageGroup = 3 #50歲以上者
})
A$ageGroup = as.factor(A$ageGroup)
```


##### 計算Weekday Percentage: W1 ~ W7
```{r}
X = X %>% mutate(wday = format(date, "%w"))
# table(X$wday)

# 製作顧客星期矩陣
mx = xtabs(~ cust + wday, X)
# dim(mx)
# mx[1:5,]
mx = mx / rowSums(mx)
# mx[1:5,]
```

##### 將矩陣併入入data frame
```{r}
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>% 
  right_join(A, by='cust')
```

### 新增變數(3)

* 顧客偏向週末或平日消費？

    * 若偏好週末消費則為 1 
    * 若無特別偏好則為 2
    * 若偏好平日消費則為3
    
* 顧客產品矩陣
    * 是否有購買前5
    * 前10
    * 或前20熱門產品

##### 計算顧客為weekend或weekday shopper
```{r}
A$weekend = sapply(seq(1,nrow(A),1), function(x){
  w15 = A[x,c("W1","W2","W3","W4","W5")] %>% sum
  w67 = A[x,c("W6","W7")] %>% sum
  if(w67>w15) 
    A$weekend = 1 #偏好週末消費
  else if(w67==w15)
    A$weekend = 2 #無特別偏好
  else
    A$weekend = 3 #偏好平日者
})
A$weekend = as.factor(A$weekend)
```

##### 顧客產品矩陣
```{r}
library(Matrix)
library(slam)

Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
cpm = xtabs(~ cust + prod, Z, sparse=T)
```

_刪去購買次數小於10的產品_
```{r}
table(colSums(cpm) < 10)       # 12862
cpm = cpm[, colSums(cpm) > 10]   
cpm = cpm[, order(-colSums(cpm))]   # order product by frequency
dim(cpm)                       # 28584  9149
```

_是否有購買前5大、10大、20大熱銷產品_
```{r}
hot5 = cpm[,1:5] %>% rowSums
A$hot5 = hot5>0

hot10 = cpm[,1:10] %>% rowSums
A$hot10 = hot10>0

hot20 = cpm[,1:20] %>% rowSums
A$hot20 = hot20>0
```

##### 前20個熱銷產品分別購買次數
_確認cpm和A的顧客順序及大小是否有相同_
```{r}
cpm = cpm %>% as.matrix
(A$cust == rownames(cpm)) %>% sum
```

_合併兩個資料並將欄位名稱變更_
```{r}
A = cbind(A,cpm[,1:20])

colnames(A)[33:52] = sapply(seq(33,52,1), function(x){
  names = paste0("p",colnames(A)[x])
})
```


##### summary
```{r}
summary(A)
```
<br><br><hr>



### Spliting the Data
```{r}
A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR = A[spl, ]
TS = A[!spl, ]

# 把"cust"及"amount"欄位拿掉
TR1 = subset(TR, select = -c(cust, amount))
TS1 = subset(TS, select = -c(cust, amount))
```


<br><br><hr>

##### Turn on Parallel Processing
```{r}
#做平行運算
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
```
<br><br><hr>


### CART Classification Model 

##### CV Control for Classification
```{r}
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)

```

##### CV: `rpart()`, Classification Tree 
```{r}
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR1, method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(.cp = seq(0.0002,0.001,0.0001)))
Sys.time() - t0
```

```{r fig.height=3, fig.width=7}
plot(cv.rpart);cv.rpart
```


##### Classification Tree, Final Model
```{r}
#Create a CART model
rpart1 = rpart(buy ~ ., TR1, method="class", cp=0.0008)
PredictCART1 = predict(rpart1, TS, type="prob")[,2] 
```

##### compute AUC
```{r}
colAUC(PredictCART1,TS$buy,T) #0.7433228
```


### CART model without cp Classification Model 
```{r}
# CART model
rpart2 = rpart(buy ~ ., data = TR1, method="class")

# Make predictions
PredictCART2 = predict(rpart2, newdata = TS1, type = "prob")[,2]
```

##### compute AUC
```{r}
colAUC(PredictCART2, TS$buy,T) #0.6983998
```


### GLM Classification Model 
```{r}
glm2 = glm(buy ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -age -areaEFH -areaACDEF -weekday -weekend -hot20 -hot10 -hot5 -ageGroup - CHR -mon1 -mon2 -mon3 , TR1[,1:30], family=binomial())

summary(glm2)
```

##### compute AUC
```{r}
predGLM =  predict(glm2, TS1, type="response")
colAUC(predGLM, TS$buy,T) #0.7578518
TS$buy %>% table
```

##### confuse matrix
```{r}
cm = table(actual = TS1$buy, predict = predGLM > 0.5); cm
```

##### compute accuracy
```{r}
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts 
```

##### Interaction Terms 

* 根據CART分法產生了對f做分組的想法
    * 分成 1,2,3,4,>=5 共五組
```{r}
A$f_group = ifelse(A$f == 1,"1",
                   ifelse(A$f == 2,"2",
                   ifelse(A$f == 3,"3",
                   ifelse(A$f == 4,"4",">=5"))))
```


+ 常理上f(frequecy)高不代表他就是常貴客，也許他只是最初很常來買，但已經很久沒再來了，因此還要考慮到r(recency)，如果f高r低，代表這類顧客常來且最近有來，是我們的常貴客。
+ 因此我們再將r做分組，讓f_group和r_group做交互。r的分組則用百分位數分五組。

```{r}
A <- A %>% mutate(r_group = ntile(r, 5))

#將f_group和r_group轉為factor
A[,c("f_group","r_group")]  <- lapply(A[,c("f_group","r_group")], factor)
```

```{r}
#A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR_inter = A[spl, ]
TS_inter = A[!spl, ]

# 把"cust"及"amount"欄位拿掉
TR1_inter = subset(TR_inter, select = -c(cust, amount))
TS1_inter = subset(TS_inter, select = -c(cust, amount))
```

+ 再分出一組新的TR和TS

```{r}
glm2 = glm(buy ~ s+m+rev+k+prodPrice+area+mon2+f_group:r_group, TR1_inter, family=binomial())

# glm2 = glm(buy ~ area, TR1, family = binomial) 
# area E,F,H 很顯著
summary(glm2)
```

```{r}
pred =  predict(glm2, TS1_inter, type="response")
colAUC(pred, TS$buy)    # 0.7466507
TS$buy %>% table
```

```{r}
#刪除f_group,r_group 避免影響後面
A$f_group <- NULL
A$r_group <- NULL
```

### kmeans集群分析後再做預測

RFM分群參考: 

+ http://rpubs.com/tonychuo/ProdInfo

集群分析的模型與預測方法參考: 

+ http://rpubs.com/tonychuo/AS6-0(腦腫瘤)
+ http://rpubs.com/tonychuo/AS6-3(股票)

##### 移除目標變數與類別變數

```{r}
#colnames(TR1)
LTR = TR1[,c(8:13,17,20)]
LTS = TS1[,c(8:13,17,20)]

#LTR = TR1[,c(1:13,16,18:24)]
#LTS = TS1[,c(1:13,16,18:24)]
#LTR = TR1[,c(2:13,16,18:24,27:49)]
#LTS = TS1[,c(2:13,16,18:24,27:49)]
```

##### 區隔變數常態化

```{r}
library(caret)
preproc = preProcess(LTR)
NTR = predict(preproc, LTR)
NTS = predict(preproc, LTS)
```

##### 新增group欄位表示顧客屬於哪群

```{r}
#summary(NTR)
set.seed(144)
km <- kmeans(NTR, 7)
table(km$cluster)
NTR$group = km$cluster
```

##### 視覺化分群結果

* 整理每群資料
```{r}
df = group_by(NTR, group) %>% summarise(
  avg_frequency = mean(f),
  avg_monetary = mean(m),
  total_revenue = sum(rev),
  group_size = n(),
  avg_recency = mean(r)
  #avg_hot5 = mean(hot5),
  #avg_hot10 = mean(hot10),
  #avg_hot20 = mean(hot20)
  #profit = sum(raw)
  ) %>% ungroup %>% 
  mutate(
    #pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
    #pc_profit = round(100*profit/sum(profit),1),                # % profit contrib. 
    dummy = 2001  # to fit googleViz's data format
  ) %>% data.frame
```

### googlevis 視覺化

```{r}

plot( gvisMotionChart(
  df, "group", "dummy",
  options=list(width=800, height=600) 
  ))
```

### kcca 分群模型

* 用相同的分群規則對training data和testing data做分群
```{r}
library(flexclust)
km.kcca = as.kcca(km, NTR[,1:8]) #NTR不取group
CTR = predict(km.kcca)
CTS = predict(km.kcca, newdata=NTS)
```

##### 依集群分析結果切割資料

```{r}
tapply(TR1$buy, CTR, mean)
#從googlevis看第2群是高f低r群，實際上第四個月的buy比例也確實高
```

##### 分群模型

+ 對每群做個別的train與predict
```{r}
#colnames(TR1)
M = lapply(split(TR1[,c(8:13,17,20,16)], CTR), function(x) 
  glm(buy~., data=x, family=binomial) )
#sapply(M, coef)

#glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
```

##### 分群測試AUC

```{r}
Pred = lapply(1:7, function(i) 
  predict(M[[i]], TS1[CTS==i,], type='response') )
 sapply(1:7, function(i) 
   colAUC(Pred[[i]], TS1$buy[CTS==i]))
#length(Pred[[1]])
#length(TS1$buy[CTS==1])
```

##### 分群整體測試AUC
```{r}
# table( do.call(c, split(TS1$buy,CTS)), do.call(c, Pred) > 0.5 ) %>%
#   {sum(diag(.))/sum(.)}

a = cbind(TS1$buy[CTS==1],Pred[[1]])
b = cbind(TS1$buy[CTS==2],Pred[[2]])
c = cbind(TS1$buy[CTS==3],Pred[[3]])
d = cbind(TS1$buy[CTS==4],Pred[[4]])
e = cbind(TS1$buy[CTS==5],Pred[[5]])
f = cbind(TS1$buy[CTS==6],Pred[[6]])
g = cbind(TS1$buy[CTS==7],Pred[[7]])
#h = cbind(TS1$buy[CTS==8],Pred[[8]])

CF = rbind(a,b,c,d,e,f,g)
colAUC(CF[,2],CF[,1])
```


### Classification ModelEnsembling
```{r}
# GLM + CART1
result = (predGLM + PredictCART1)/2
colAUC(result, TS$buy,T) #0.7584242

# GLM + CART2
result = (predGLM + PredictCART2)/2
colAUC(result, TS$buy,T) # 0.7584827 higher
```


<br><br><br><hr>

### Regression Model
##### Preparing the Data
```{r}
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
```

```{r}
TR2 = subset(A2, spl2) %>% subset(., select = -c(cust, buy))
TS2 = subset(A2, !spl2) %>% subset(., select = -c(cust, buy))
```


### linear model
```{r}

lm1 = lm(amount ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6  -areaACDEF-area -weekday  -hot5-hot10-hot20-ageGroup-mon1-mon2 ,TR2)

summary(lm1)
```

###compute R-squared
```{r}
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
```


### CART Regression Model


##### Cross-Validation
```{r}
library(caret)
library(e1071)
library(rpart)

# Define cross-validation experiment
# k=10
numFolds = trainControl( method = "cv", number = 10 ) 
# pick the posiible values for cp
cpGrid = expand.grid( .cp = seq(0.001,0.1,0.001)) 


# Perform the cross validation
cv = train(amount ~ ., data = TR2, method = "rpart", trControl = numFolds, tuneGrid = cpGrid )
# method="rpart" : we wanto to validate a CART model
plot(cv);cv
```
###Create a CART model
```{r}

rpart2 = rpart(amount ~ ., data = TR2, cp = 0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) -  TS2$amount)^2)
1 - (SSE/SST)
```


### model ensembling
```{r}
predLM = predict(lm1, newdata=TS2)
predCART = predict(rpart2, newdata=TS2)
result = (predLM + predCART)/2

SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((result -  TS2$amount)^2)
1 - (SSE/SST)
```


<br><br><br><hr>


##### Stop Parallel Processing
```{r}
stopCluster(clust)
```








<br><br><br><br><hr><br><br><br>

<style>
.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #008800;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #ffffe0;
  line-height: 2;
  font-weight: bold;
}

em{
  color: #0000c0;
  background: #f0f0f0;
  }
</style>

