Data Preparation, Ta-Feng (Group 4)

Preparing Data

Libraries
Sys.setlocale("LC_ALL","C")
[1] "C"
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
Flag purchases made during the week before Christmas
X$CHR = as.numeric(X$date)                 # days since 1970-01-01
X$CHR1 = X$CHR >= 11309 & X$CHR <= 11315   # 2000-12-18 ~ 2000-12-24, the week before Christmas
X$CHR2 = as.numeric(X$CHR1)
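The hard-coded day numbers are easy to misread. As a sanity check, the same flag can be built from Date literals, since day 11309 is 2000-12-18 (a sketch; chk is a hypothetical name):
chk = X$date >= as.Date("2000-12-18") & X$date <= as.Date("2000-12-24")
# all(chk == X$CHR1)   # should be TRUE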
Find each customer's most frequent shopping weekday
# table(X$cust) %>% sort %>% tail
maxWeekday = function(x){
  y = table(x) %>% t %>% as.matrix   # 1-row matrix of purchase counts per date
  result = which.max(y) %>%          # column with the highest count
    colnames(y)[.] %>%               # that date, as a string
    as.Date %>% 
    weekdays                         # name of that date's weekday
  return(result)
}
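Note that maxWeekday() returns the weekday of the customer's single busiest date, not necessarily the weekday on which most of their purchases fall. If the modal weekday itself were wanted, a direct tabulation would do (a sketch; modalWeekday is a hypothetical name):
modalWeekday = function(x) names(which.max(table(weekdays(x))))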

Add New Variables

  • prodPrice
  • weekday
  • CHR
  • k
  • areaEFH
  • areaACDEF
  • mon1
  • mon2
  • mon3
d0 = max(X$date)
NewVar = group_by(X, cust) %>% summarise(
  prodPrice = mean(total / pieces), # average unit price of the products bought
  weekday = maxWeekday(date), # most frequent shopping weekday
  CHR = sum(CHR2) > 0, # bought during the week before Christmas
  k = (1 + as.integer(difftime(max(date), min(date), units="days"))) / n(), # average purchase cycle (days per purchase)
  areaEFH = first(area) %in% c("E","F","H"), # lives in area E, F, or H
  areaACDEF = first(area) %in% c("A","C","D","E","F"), # lives in area A, C, D, E, or F
  mon1 = sum(date <= "2000-11-30"), # purchases up to Nov 2000
  mon2 = sum(date <= "2000-12-31" & date > "2000-11-30"), # purchases in Dec 2000
  mon3 = sum(date <= "2001-01-31" & date > "2000-12-31")  # purchases in Jan 2001
  ) %>% data.frame    # 28584 customers
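For example, a customer whose first and last purchases are 30 days apart and who shopped 4 times gets k = (1 + 30) / 4 = 7.75 days per purchase.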
Merge the new variables into data frame A
A = merge(A, NewVar, all.x = T)
Convert weekday into an ordered factor
A$weekday = factor(A$weekday
    , ordered=TRUE
    , levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

New Variable: Age Group

Group the ages into wider ranges:

  • under 35 → group 1
  • 35 to 50 → group 2
  • over 50 → group 3
A$ageGroup = sapply(1:nrow(A), function(x){
  if(A[x, "age"] %in% c("A", "B", "C"))       # codes A-C: under 35
    1
  else if(A[x, "age"] %in% c("D", "E", "F"))  # codes D-F: 35 to 50
    2
  else                                        # later codes: over 50
    3
})
A$ageGroup = as.factor(A$ageGroup)
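The row-by-row sapply() works, but the same grouping can be written as a single vectorized expression; the same idea applies to the weekend flag below (a sketch):
# vectorized equivalent of the loop above
A$ageGroup = factor(ifelse(A$age %in% c("A","B","C"), 1,
                    ifelse(A$age %in% c("D","E","F"), 2, 3)))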

New Variables: Weekday Percentage, W1 ~ W7

X = X %>% mutate(wday = format(date, "%w"))   # "%w": 0 = Sunday, ..., 6 = Saturday
# table(X$wday)
# customer-by-weekday count matrix
mx = xtabs(~ cust + wday, X)
# dim(mx)
# mx[1:5,]
mx = mx / rowSums(mx)   # convert each row to proportions
# mx[1:5,]
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%   # so W1 = Sunday, ..., W7 = Saturday
  right_join(A, by='cust')

New Variable: Weekend vs. Weekday Preference

Classify whether each customer tends to shop on weekends or on weekdays:

  • 1 if they prefer weekends
  • 2 if there is no clear preference
  • 3 if they prefer weekdays
A$weekend = sapply(1:nrow(A), function(x){
  # note: with "%w", W1 is Sunday, so the W6/W7 compared here are Friday/Saturday
  w15 = A[x, c("W1","W2","W3","W4","W5")] %>% sum
  w67 = A[x, c("W6","W7")] %>% sum
  if(w67 > w15) 
    1
  else if(w67 == w15)
    2
  else
    3
})
A$weekend = as.factor(A$weekend)

New Variables: Whether the Customer Bought a Top-5/10/20 Best Seller, and Purchase Counts for the Top 20

Customer-product matrix
library(Matrix)
library(slam)
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))   # transactions by known customers before Feb 2001
cpm = xtabs(~ cust + prod, Z, sparse=T)                          # sparse customer-by-product count matrix
Drop rarely purchased products

table(colSums(cpm) < 10)

FALSE  TRUE 
 9652 12862 
cpm = cpm[, colSums(cpm) > 10]      # keep products bought more than 10 times
cpm = cpm[, order(-colSums(cpm))]   # order products by purchase frequency
dim(cpm)
[1] 28584  9149
Whether the customer bought any of the top 5, 10, or 20 best sellers
hot5 = cpm[,1:5] %>% rowSums     # purchases of the 5 best sellers
A$hot5 = hot5 > 0                # TRUE if the customer bought any of them
hot10 = cpm[,1:10] %>% rowSums
A$hot10 = hot10 > 0
hot20 = cpm[,1:20] %>% rowSums
A$hot20 = hot20 > 0
Purchase counts for each of the top 20 best sellers

Check that cpm and A contain the same customers in the same order

cpm = cpm %>% as.matrix
(A$cust == rownames(cpm)) %>% sum   # 28584 matches, one per row, so the orders agree
[1] 28584

Merge the two data sets and rename the new columns

A = cbind(A, cpm[,1:20])
colnames(A)[33:52] = paste0("p", colnames(A)[33:52])   # prefix the product codes with "p"
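Renaming by the fixed positions 33:52 breaks silently if a column is ever added earlier in the pipeline; matching the product codes by name is more robust (a sketch; idx is a hypothetical helper):
idx = match(colnames(cpm)[1:20], colnames(A))      # locate the 20 product columns by name
colnames(A)[idx] = paste0("p", colnames(A)[idx])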
summary
summary(A)
      cust                W1               W2        
 Min.   :    1069   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 1060898   1st Qu.:0.0000   1st Qu.:0.0000  
 Median : 1654100   Median :0.0000   Median :0.0000  
 Mean   : 1461070   Mean   :0.2310   Mean   :0.1399  
 3rd Qu.: 1945003   3rd Qu.:0.3484   3rd Qu.:0.2000  
 Max.   :20002000   Max.   :1.0000   Max.   :1.0000  
                                                     
       W3               W4              W5               W6        
 Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
 Mean   :0.1176   Mean   :0.103   Mean   :0.1188   Mean   :0.1136  
 3rd Qu.:0.1333   3rd Qu.:0.000   3rd Qu.:0.1354   3rd Qu.:0.1111  
 Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
                                                                   
       W7               r               s               f         
 Min.   :0.0000   Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
 1st Qu.:0.0000   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
 Median :0.0000   Median :21.00   Median :68.00   Median : 2.000  
 Mean   :0.1762   Mean   :32.12   Mean   :61.27   Mean   : 3.089  
 3rd Qu.:0.2500   3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
 Max.   :1.0000   Max.   :92.00   Max.   :92.00   Max.   :60.000  
                                                                  
       m                rev             raw               age      
 Min.   :    8.0   Min.   :    8   Min.   : -742.0   D      :5832  
 1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   C      :5238  
 Median :  709.5   Median : 1566   Median :  218.0   E      :4514  
 Mean   : 1012.4   Mean   : 2711   Mean   :  420.8   F      :3308  
 3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0   B      :2802  
 Max.   :10634.0   Max.   :99597   Max.   :15565.0   G      :1940  
                                                     (Other):4950  
      area          amount         buy            prodPrice      
 E      :9907   Min.   :    8   Mode :logical   Min.   :   5.00  
 F      :7798   1st Qu.:  454   FALSE:15342     1st Qu.:  63.36  
 C      :3169   Median :  993   TRUE :13242     Median :  86.44  
 G      :3052   Mean   : 1499                   Mean   : 115.97  
 D      :1778   3rd Qu.: 1955                   3rd Qu.: 123.20  
 H      :1295   Max.   :28089                   Max.   :3939.00  
 (Other):1585   NA's   :15342                                    
      weekday        CHR                k           areaEFH       
 Sunday   :6761   Mode :logical   Min.   : 1.000   Mode :logical  
 Monday   :3463   FALSE:27066     1st Qu.: 1.000   FALSE:9584     
 Tuesday  :2900   TRUE :1518      Median : 5.571   TRUE :19000    
 Wednesday:2727                   Mean   : 8.550                  
 Thursday :3878                   3rd Qu.:14.000                  
 Friday   :3617                   Max.   :45.000                  
 Saturday :5238                                                   
 areaACDEF            mon1             mon2              mon3       
 Mode :logical   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000  
 FALSE:5130      1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
 TRUE :23454     Median : 1.000   Median : 1.0000   Median : 1.000  
                 Mean   : 1.112   Mean   : 0.9339   Mean   : 1.043  
                 3rd Qu.: 1.000   3rd Qu.: 1.0000   3rd Qu.: 1.000  
                 Max.   :26.000   Max.   :21.0000   Max.   :25.000  
                                                                    
 ageGroup  weekend      hot5           hot10           hot20        
 1: 9445   1: 5337   Mode :logical   Mode :logical   Mode :logical  
 2:13654   2: 3205   FALSE:19029     FALSE:16425     FALSE:13336    
 3: 5485   3:20042   TRUE :9555      TRUE :12159     TRUE :15248    
                                                                    
                                                                    
                                                                    
                                                                    
 p4711271000014   p4714981010038    p4719090900065   
 Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000  
 1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000  
 Median :0.0000   Median : 0.0000   Median :0.00000  
 Mean   :0.1837   Mean   : 0.1631   Mean   :0.07679  
 3rd Qu.:0.0000   3rd Qu.: 0.0000   3rd Qu.:0.00000  
 Max.   :5.0000   Max.   :16.0000   Max.   :6.00000  
                                                     
 p4711080010112     p4710908131589   p4713985863121 
 Min.   : 0.00000   Min.   :0.0000   Min.   :0.000  
 1st Qu.: 0.00000   1st Qu.:0.0000   1st Qu.:0.000  
 Median : 0.00000   Median :0.0000   Median :0.000  
 Mean   : 0.06014   Mean   :0.0543   Mean   :0.053  
 3rd Qu.: 0.00000   3rd Qu.:0.0000   3rd Qu.:0.000  
 Max.   :21.00000   Max.   :8.0000   Max.   :9.000  
                                                    
 p4710114128038    p4710421090059    p4710291112172  
 Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000  
 1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000  
 Median :0.00000   Median : 0.0000   Median :0.0000  
 Mean   :0.05153   Mean   : 0.0515   Mean   :0.0508  
 3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000  
 Max.   :5.00000   Max.   :13.0000   Max.   :5.0000  
                                                     
 p4712425010712    p4710583996008    p4710011401128   
 Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   :0.05076   Mean   :0.04894   Mean   :0.04807  
 3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :5.00000   Max.   :7.00000   Max.   :8.00000  
                                                      
 p4710088410139    p4719090900058     p4710094097768    
 Min.   :0.00000   Min.   : 0.00000   Min.   : 0.00000  
 1st Qu.:0.00000   1st Qu.: 0.00000   1st Qu.: 0.00000  
 Median :0.00000   Median : 0.00000   Median : 0.00000  
 Mean   :0.04803   Mean   : 0.04628   Mean   : 0.04429  
 3rd Qu.:0.00000   3rd Qu.: 0.00000   3rd Qu.: 0.00000  
 Max.   :5.00000   Max.   :13.00000   Max.   :17.00000  
                                                        
 p4710088410610    p4710085120628    p4710265849066   
 Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   :0.04394   Mean   :0.04121   Mean   :0.04058  
 3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :6.00000   Max.   :7.00000   Max.   :4.00000  
                                                      
 p4710018004605    p4710908131534   
 Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median :0.00000  
 Mean   :0.03985   Mean   :0.03778  
 3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :6.00000   Max.   :5.00000  
                                    



Splitting the Data

A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply with caret's naming rules for class labels
TR = A[spl, ]
TS = A[!spl, ]
# drop the "cust" and "amount" columns
TR1 = subset(TR, select = -c(cust, amount))
TS1 = subset(TS, select = -c(cust, amount))
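spl is loaded from tf2.rdata, so its construction is not shown here; since TR ends up with 20008 of the 28584 rows (about 70%), it was presumably built with something like caTools::sample.split (a sketch, not the confirmed upstream code):
# spl = sample.split(A$buy, SplitRatio = 0.7)   # TRUE = training row, stratified on buy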



Turn on Parallel Processing
# run model training in parallel
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
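detectCores() claims every logical core; on a shared machine it is common to leave one free (a sketch):
# clust = makeCluster(max(1, detectCores() - 1))   # keep one core for the OS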



CART Classification Model

CV Control for Classification
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)
CV: rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR1, method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(.cp = seq(0.0002,0.001,0.0001)))
Sys.time() - t0
Time difference of 35.16521 secs
plot(cv.rpart);cv.rpart
CART 

20008 samples
   49 predictor
    2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 2 times) 
Summary of sample sizes: 18007, 18007, 18007, 18007, 18007, 18007, ... 
Resampling results across tuning parameters:

  cp     ROC        Sens       Spec     
  2e-04  0.6945636  0.7386622  0.5807015
  3e-04  0.7063938  0.7728375  0.5760073
  4e-04  0.7161350  0.7787498  0.5792441
  5e-04  0.7234962  0.7929951  0.5694282
  6e-04  0.7256986  0.7994671  0.5675938
  7e-04  0.7275238  0.8028204  0.5660290
  8e-04  0.7258432  0.8104117  0.5597687
  9e-04  0.7236823  0.8129257  0.5577191
  1e-03  0.7224932  0.8109231  0.5605789

ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 7e-04.

Classification Tree, Final Model

# Create the CART model (cp = 0.0008, close to the CV optimum of 7e-04)
rpart1 = rpart(buy ~ ., TR1, method="class", cp=0.0008)

# draw the decision tree
library(rpart.plot)
prp(rpart1,
    faclen=0,           # do not abbreviate factor levels
    fallen.leaves=TRUE, # hang all leaves at the bottom
    shadow.col="gray",  # shade the terminal nodes
    extra=2,            # show correct classifications / observations per node
    tweak=2.5, space=0.5)

PredictCART1 = predict(rpart1, TS, type="prob")[,2]

The first split is f < 3.
The second-level splits are f < 2 and f < 5.
This shows that f (purchase frequency) has a strong influence on buy.
The rightmost terminal node is reached after only two tests, f >= 3 then f >= 5:
a customer with f >= 5 is immediately predicted buy = TRUE.

compute AUC
colAUC(PredictCART1,TS$buy)
                [,1]
no vs. yes 0.7449168

CART Classification Model without cp Tuning

# CART model
rpart2 = rpart(buy ~ ., data = TR1, method="class")
# Make predictions
PredictCART2 = predict(rpart2, newdata = TS1, type = "prob")[,2]
compute AUC
colAUC(PredictCART2, TS$buy)
                [,1]
no vs. yes 0.6983998
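With rpart's default cp of 0.01 the tree is pruned far more aggressively than with the cross-validated cp = 7e-04, which is why the AUC drops from about 0.745 to 0.698.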

GLM Classification Model

glm2 = glm(buy ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -age -areaEFH -areaACDEF -weekday -weekend
           -hot20 -hot10 -hot5 -ageGroup -CHR -mon1 -mon2 -mon3,
           TR1[,1:30], family=binomial())
summary(glm2)

Call:
glm(formula = buy ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 - age - 
    areaEFH - areaACDEF - weekday - weekend - hot20 - hot10 - 
    hot5 - ageGroup - CHR - mon1 - mon2 - mon3, family = binomial(), 
    data = TR1[, 1:30])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.5126  -0.8816  -0.7090   1.0349   2.3789  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.034e+00  1.083e-01  -9.551  < 2e-16 ***
r           -2.108e-02  2.319e-03  -9.091  < 2e-16 ***
s            1.821e-02  2.341e-03   7.776 7.47e-15 ***
f            2.252e-01  2.215e-02  10.170  < 2e-16 ***
m           -3.110e-06  2.780e-05  -0.112  0.91093    
rev          4.874e-05  1.958e-05   2.489  0.01281 *  
raw         -2.850e-04  8.663e-05  -3.290  0.00100 ** 
areaB       -3.508e-02  1.321e-01  -0.266  0.79059    
areaC       -2.072e-01  1.044e-01  -1.985  0.04716 *  
areaD        3.703e-02  1.109e-01   0.334  0.73850    
areaE        2.606e-01  9.672e-02   2.695  0.00705 ** 
areaF        1.773e-01  9.738e-02   1.821  0.06859 .  
areaG       -4.751e-02  1.043e-01  -0.456  0.64869    
areaH       -1.878e-01  1.215e-01  -1.545  0.12234    
prodPrice   -7.209e-04  1.491e-04  -4.836 1.32e-06 ***
k           -2.034e-02  5.019e-03  -4.054 5.04e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23270  on 19992  degrees of freedom
AIC: 23302

Number of Fisher Scoring iterations: 6

Compared with the earlier specification, age is removed from the model while k and prodPrice are added.

compute AUC
predGLM =  predict(glm2, TS1, type="response")
colAUC(predGLM, TS$buy)
                [,1]
no vs. yes 0.7578518
TS$buy %>% table
.
  no  yes 
4603 3973 
confusion matrix
cm = table(actual = TS1$buy, predict = predGLM > 0.5); cm
      predict
actual FALSE TRUE
   no   3822  781
   yes  1759 2214
compute accuracy
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts 
[1] 0.7038246
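For reference, always predicting the majority class "no" would score 4603 / (4603 + 3973) ≈ 0.537, so 0.704 is a substantial improvement over the baseline.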

Classification Model Ensembling

# GLM + CART1
result = (predGLM + PredictCART1)/2
colAUC(result, TS$buy)
                [,1]
no vs. yes 0.7584242
# GLM + CART2
result = (predGLM + PredictCART2)/2
colAUC(result, TS$buy) # higher
                [,1]
no vs. yes 0.7584827




Regression Model

Preparing the Data
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)   # buyers only; log10 the skewed monetary variables
TR2 = subset(A2, spl2) %>% subset(., select = -c(cust, buy))
TS2 = subset(A2, !spl2) %>% subset(., select = -c(cust, buy))

linear model

lm1 = lm(amount ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -areaACDEF -area -weekday
         -hot5 -hot10 -hot20 -ageGroup -mon1 -mon2, TR2)
summary(lm1)

Call:
lm(formula = amount ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 - 
    areaACDEF - area - weekday - hot5 - hot10 - hot20 - ageGroup - 
    mon1 - mon2, data = TR2)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8497 -0.2264  0.0478  0.2769  1.5540 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     1.335e+00  4.704e-02  28.376  < 2e-16 ***
r              -1.546e-03  5.262e-04  -2.937  0.00332 ** 
s               2.038e-03  5.155e-04   3.954 7.73e-05 ***
f               1.170e-02  2.705e-03   4.325 1.54e-05 ***
m               5.367e-01  4.345e-02  12.350  < 2e-16 ***
rev            -3.019e-02  4.133e-02  -0.731  0.46505    
raw             6.186e-05  8.795e-06   7.033 2.16e-12 ***
ageB            7.682e-02  2.492e-02   3.082  0.00206 ** 
ageC            1.194e-01  2.289e-02   5.218 1.85e-07 ***
ageD            1.262e-01  2.259e-02   5.588 2.37e-08 ***
ageE            1.321e-01  2.312e-02   5.712 1.15e-08 ***
ageF            1.054e-01  2.416e-02   4.365 1.29e-05 ***
ageG            8.057e-02  2.635e-02   3.058  0.00223 ** 
ageH            7.178e-02  3.106e-02   2.311  0.02083 *  
ageI            6.980e-02  3.196e-02   2.184  0.02899 *  
ageJ           -1.970e-02  2.806e-02  -0.702  0.48279    
ageK            1.130e-01  3.768e-02   2.999  0.00272 ** 
prodPrice      -3.198e-04  5.026e-05  -6.364 2.07e-10 ***
CHRTRUE        -4.254e-02  1.746e-02  -2.437  0.01484 *  
k              -5.485e-03  1.073e-03  -5.111 3.26e-07 ***
areaEFHTRUE    -1.471e-02  1.029e-02  -1.430  0.15289    
mon3            1.524e-02  4.845e-03   3.145  0.00166 ** 
weekend2       -3.722e-03  1.755e-02  -0.212  0.83209    
weekend3       -4.709e-03  1.262e-02  -0.373  0.70897    
p4711271000014 -1.239e-02  9.222e-03  -1.343  0.17922    
p4714981010038 -1.616e-02  8.396e-03  -1.925  0.05428 .  
p4719090900065 -1.011e-02  1.227e-02  -0.824  0.40996    
p4711080010112  1.684e-03  9.343e-03   0.180  0.85697    
p4710908131589  1.263e-02  1.461e-02   0.864  0.38752    
p4713985863121  2.320e-02  1.386e-02   1.674  0.09424 .  
p4710114128038  1.809e-02  1.445e-02   1.252  0.21066    
p4710421090059 -1.361e-02  1.319e-02  -1.031  0.30247    
p4710291112172  3.389e-02  1.499e-02   2.262  0.02374 *  
p4712425010712  3.541e-02  1.513e-02   2.340  0.01929 *  
p4710583996008  1.054e-02  1.377e-02   0.766  0.44376    
p4710011401128 -1.081e-03  1.237e-02  -0.087  0.93037    
p4710088410139  4.659e-03  1.527e-02   0.305  0.76035    
p4719090900058  2.458e-02  1.613e-02   1.524  0.12764    
p4710094097768  1.946e-02  1.103e-02   1.765  0.07760 .  
p4710088410610  1.353e-02  1.377e-02   0.982  0.32598    
p4710085120628  6.767e-04  1.616e-02   0.042  0.96661    
p4710265849066 -1.987e-02  1.750e-02  -1.135  0.25628    
p4710018004605  1.555e-02  1.562e-02   0.996  0.31935    
p4710908131534 -7.272e-04  1.635e-02  -0.044  0.96453    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4179 on 9225 degrees of freedom
Multiple R-squared:  0.305, Adjusted R-squared:  0.3017 
F-statistic: 94.14 on 43 and 9225 DF,  p-value: < 2.2e-16

compute R-squared

r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3049726 0.2732476
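Because amount was log10-transformed when A2 was built, these R² values are on the log scale; to report predictions in the original amount units they would be back-transformed (a sketch; pred.amount is a hypothetical name):
pred.amount = 10^predict(lm1, TS2)   # undo the log10 transform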

CART Regression Model

Cross-Validation
library(caret)
library(e1071)
library(rpart)
# Define the cross-validation experiment (k = 10 folds)
numFolds = trainControl(method = "cv", number = 10)
# candidate values for cp
cpGrid = expand.grid(.cp = seq(0.001, 0.1, 0.001))
# Perform the cross-validation; method = "rpart" means we validate a CART model
cv = train(amount ~ ., data = TR2, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
plot(cv);cv
CART 

9269 samples
  49 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 8342, 8342, 8344, 8341, 8343, 8343, ... 
Resampling results across tuning parameters:

  cp     RMSE       Rsquared   MAE      
  0.001  0.4292028  0.2690773  0.3282264
  0.002  0.4252379  0.2792146  0.3270657
  0.003  0.4259257  0.2767614  0.3279024
  0.004  0.4270146  0.2727301  0.3293621
  0.005  0.4276177  0.2700327  0.3302938
  0.006  0.4280014  0.2686708  0.3308160
  0.007  0.4285473  0.2667992  0.3310266
  0.008  0.4291134  0.2649987  0.3312590
  0.009  0.4305926  0.2599312  0.3320272
  0.010  0.4331756  0.2514542  0.3344812
  0.011  0.4332454  0.2511085  0.3346050
  0.012  0.4332454  0.2511085  0.3346050
  0.013  0.4352086  0.2444000  0.3362385
  0.014  0.4367615  0.2389578  0.3377461
  0.015  0.4374698  0.2363249  0.3379678
  0.016  0.4374698  0.2363249  0.3379678
  0.017  0.4374698  0.2363249  0.3379678
  0.018  0.4374698  0.2363249  0.3379678
  0.019  0.4374698  0.2363249  0.3379678
  0.020  0.4375541  0.2357503  0.3380023
  0.021  0.4375541  0.2357503  0.3380023
  0.022  0.4375541  0.2357503  0.3380023
  0.023  0.4375541  0.2357503  0.3380023
  0.024  0.4375541  0.2357503  0.3380023
  0.025  0.4375541  0.2357503  0.3380023
  0.026  0.4375541  0.2357503  0.3380023
  0.027  0.4375541  0.2357503  0.3380023
  0.028  0.4383222  0.2329409  0.3387731
  0.029  0.4383222  0.2329409  0.3387731
  0.030  0.4391150  0.2300391  0.3392694
  0.031  0.4420512  0.2195971  0.3413720
  0.032  0.4439264  0.2129179  0.3424785
  0.033  0.4445446  0.2108207  0.3428081
  0.034  0.4447775  0.2100530  0.3430843
  0.035  0.4447775  0.2100530  0.3430843
  0.036  0.4447775  0.2100530  0.3430843
  0.037  0.4447775  0.2100530  0.3430843
  0.038  0.4447775  0.2100530  0.3430843
  0.039  0.4447775  0.2100530  0.3430843
  0.040  0.4447775  0.2100530  0.3430843
  0.041  0.4447775  0.2100530  0.3430843
  0.042  0.4447775  0.2100530  0.3430843
  0.043  0.4447775  0.2100530  0.3430843
  0.044  0.4447775  0.2100530  0.3430843
  0.045  0.4447775  0.2100530  0.3430843
  0.046  0.4447775  0.2100530  0.3430843
  0.047  0.4447775  0.2100530  0.3430843
  0.048  0.4495928  0.1929573  0.3471033
  0.049  0.4550808  0.1731512  0.3517455
  0.050  0.4562046  0.1691672  0.3531857
  0.051  0.4571753  0.1655309  0.3539768
  0.052  0.4571753  0.1655309  0.3539768
  0.053  0.4571753  0.1655309  0.3539768
  0.054  0.4571753  0.1655309  0.3539768
  0.055  0.4589018  0.1591169  0.3553477
  0.056  0.4589018  0.1591169  0.3553477
  0.057  0.4589018  0.1591169  0.3553477
  0.058  0.4589018  0.1591169  0.3553477
  0.059  0.4589018  0.1591169  0.3553477
  0.060  0.4589018  0.1591169  0.3553477
  0.061  0.4589018  0.1591169  0.3553477
  0.062  0.4589018  0.1591169  0.3553477
  0.063  0.4589018  0.1591169  0.3553477
  0.064  0.4589018  0.1591169  0.3553477
  0.065  0.4589018  0.1591169  0.3553477
  0.066  0.4589018  0.1591169  0.3553477
  0.067  0.4589018  0.1591169  0.3553477
  0.068  0.4589018  0.1591169  0.3553477
  0.069  0.4589018  0.1591169  0.3553477
  0.070  0.4589018  0.1591169  0.3553477
  0.071  0.4589018  0.1591169  0.3553477
  0.072  0.4589018  0.1591169  0.3553477
  0.073  0.4589018  0.1591169  0.3553477
  0.074  0.4589018  0.1591169  0.3553477
  0.075  0.4589018  0.1591169  0.3553477
  0.076  0.4589018  0.1591169  0.3553477
  0.077  0.4589018  0.1591169  0.3553477
  0.078  0.4589018  0.1591169  0.3553477
  0.079  0.4589018  0.1591169  0.3553477
  0.080  0.4589018  0.1591169  0.3553477
  0.081  0.4589018  0.1591169  0.3553477
  0.082  0.4589018  0.1591169  0.3553477
  0.083  0.4589018  0.1591169  0.3553477
  0.084  0.4589018  0.1591169  0.3553477
  0.085  0.4589018  0.1591169  0.3553477
  0.086  0.4589018  0.1591169  0.3553477
  0.087  0.4589018  0.1591169  0.3553477
  0.088  0.4589018  0.1591169  0.3553477
  0.089  0.4589018  0.1591169  0.3553477
  0.090  0.4589018  0.1591169  0.3553477
  0.091  0.4589018  0.1591169  0.3553477
  0.092  0.4589018  0.1591169  0.3553477
  0.093  0.4589018  0.1591169  0.3553477
  0.094  0.4589018  0.1591169  0.3553477
  0.095  0.4589018  0.1591169  0.3553477
  0.096  0.4589018  0.1591169  0.3553477
  0.097  0.4589018  0.1591169  0.3553477
  0.098  0.4589018  0.1591169  0.3553477
  0.099  0.4589018  0.1591169  0.3553477
  0.100  0.4589018  0.1591169  0.3553477

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.002.

Create a CART model

rpart2 = rpart(amount ~ ., data = TR2, cp = 0.002)   # cp chosen by cross-validation; this overwrites the earlier classification rpart2
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) -  TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2521828

model ensembling

predLM = predict(lm1, newdata=TS2)
predCART = predict(rpart2, newdata=TS2)
result = (predLM + predCART)/2
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((result -  TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2740926
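The 50/50 average is the simplest blend; the weight could instead be tuned, ideally on a separate validation split rather than on TS2 to avoid leakage (a sketch; w is a hypothetical weight):
# w = 0.7
# result = w*predLM + (1 - w)*predCART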




Stop Parallel Processing
stopCluster(clust)







