1.資料準備

Sys.setlocale("LC_ALL","C")
[1] "C/C/C/C/C/zh_TW.UTF-8"
library(dplyr)
library(ggplot2)
library(caTools)
library(rpart) 
library(rpart.plot)
Loading Data
rm(list=ls(all=TRUE))
load("data/tf2.rdata")

2.從現有模型中尋找新變數

2.1 把area換算成距離
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
  • 本組認為距離的遠近可能會影響顧客來購買意願
  • 因共線性問題,無法同時將距離變數與area欄位放入迴歸模型中
  • 由迴歸結果發現在此模型下area比距離更夠有解釋力
2.2 把pieces/items加入變數來討論星期幾對於這兩個變數是否有解釋力
X = X %>% mutate(wday = format(date, "%w"))
wp=aggregate(X$pieces, by=list(Category=X$wday), FUN=sum)
wi=aggregate(X$items, by=list(Category=X$wday), FUN=sum)
barplot(wp$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #pieces與星期的關係

barplot(wi$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #items與星期的關係

  • 週末之購買件數(pieces)與購買商品種類數(items)較平日多
  • 因此最終模型選擇將pieces與items加入自變數
2.3 RFM模型

RFM模型是由喬治·卡利南(George Cullinan)於1961年所提出,他發現資料庫分析中,有三項重要的指標:最近一次消費(Recency)、消費頻率(Frequency)、與消費金額(Monetary)

#新增rscore
A <- A %>% mutate(
    rscore = cut(
    A$r,  
    breaks = c(1,4,10,12,15,21,33,48,60,72,92), 
    include.lowest = T,  
    labels = c(10,9,8,7,6,5,4,3,2,1)
  )
)
  • 1.3.1最近一次消費(Recency):指消費者至今再次購買與上次購買產品的時間差,舉例來說,將「購買日期分為10等分」,每一等分為資料庫的10%。
  • 最近消費的前10%,編碼為10
  • 10%~20%編碼為9,以此類推
  • 90%~100%編碼為1
  • 編碼等級越高的消費者,重複購買比率較高。
#新增fscore
A <- A %>% mutate(
    fscore = cut(
    A$f,  
    breaks = c(0,1,2,3,4,6,60),  
    include.lowest = T,  
    labels = c(1,5,7,8,9,10) 
  )
)
  • 1.3.2 消費頻率(Frequency):指消費者在一定期間內購買該產品的次數。
  • 次數最多的前10%,編碼為10
  • 10%~20%編碼為9,以此類推
  • 90%~100%編碼為1
  • 編碼等級越高的消費者,其消費頻率越高,忠誠度與顧客價值也越高。
#新增mscore
A <- A %>% mutate(
    mscore = cut(
    A$m,   
    breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),  
    include.lowest = T,  
    labels = c(1,2,3,4,5,6,7,8,9,10) 
  )
)
  • 1.3.3 消費金額(Monetary):指消費者在一定期間內購買該產品的總金額。
  • 金額最大的前10%,編碼為10
  • 10%~20%編碼為9,以此類推
  • 90%~100%編碼為1
  • 編碼等級越高的消費者,其消費金額越高,顧客價值也越高。
  • RFM模型參考資料
2.4 新增時間變數
A$days = (A$s - A$r) #第一次購買至最後一次購買之間隔天數
A$dayf = A$days/A$f  #間隔天數/總購買次數=平均幾天會來一次
  • 可看出A$days越大代表此顧客越依賴本商店之趨勢
cor(A$f, A$days)
[1] 0.641
ggplot(A, aes(days, f))+ geom_point(col="#6495ED")+ geom_smooth(col="red")

  • A$dayf值越小代表顧客越依賴本商店
  • 本組發現A$daysA$dayf有高度相關,但在最後的迴歸模型中只放一個變數解釋力反而下降
cor(A$dayf, A$days)
[1] 0.7567
ggplot(A, aes(days, dayf))+ geom_point(col="#6495ED")


3.資料準備

#在資料中新增pieces與items欄位
B = group_by(X, cust) %>% summarise(
  pieces = sum(pieces),
  items = sum(items)
  ) %>% data.frame    
A = merge(A, B, by="cust", all.x=T)
A$days = (A$s - A$r)
A$dayf = A$days/A$f
#新增rscore
A <- A %>% mutate(
    rscore = cut(
    A$r,  
    breaks = c(1,4,10,12,15,21,33,48,60,72,92), 
    include.lowest = T,  
    labels = c(10,9,8,7,6,5,4,3,2,1)
  )
)
#新增fscore
A <- A %>% mutate(
    fscore = cut(
    A$f,  
    breaks = c(0,1,2,3,4,6,60),  
    include.lowest = T,  
    labels = c(1,5,7,8,9,10) 
  )
)
#新增mscore
A <- A %>% mutate(
    mscore = cut(
    A$m,   
    breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),  
    include.lowest = T,  
    labels = c(1,2,3,4,5,6,7,8,9,10) 
  )
)
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
table(A$distance)

    1     2     3     4     5     6     7 
11202  7798  1778   802   783  3169  3052 
3.1 Splitting for Logistic Regression
TR = subset(A, spl)
TS = subset(A, !spl)
3.2 Splitting for Linear Regression
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)


4. 邏輯迴歸模型

glm1 = glm(buy ~ ., TR[,c(6:9,11, 13:19)], family=binomial()) 
summary(glm1)

Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(6:9, 
    11, 13:19)])

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.745  -0.871  -0.675   1.014   1.919  

Coefficients:
              Estimate Std. Error z value           Pr(>|z|)    
(Intercept) -0.8176290  0.1396764   -5.85 0.0000000048064700 ***
rev          0.0000156  0.0000192    0.81            0.41846    
raw         -0.0003214  0.0000863   -3.72            0.00020 ***
ageB        -0.0131029  0.0873195   -0.15            0.88072    
ageC         0.0391362  0.0806248    0.49            0.62738    
ageD         0.0874853  0.0799556    1.09            0.27388    
ageE         0.0934663  0.0820278    1.14            0.25452    
ageF         0.0250656  0.0853348    0.29            0.76896    
ageG         0.0276573  0.0940198    0.29            0.76863    
ageH         0.2047475  0.1100367    1.86            0.06278 .  
ageI         0.0741508  0.1182329    0.63            0.53056    
ageJ         0.2793938  0.1053210    2.65            0.00798 ** 
ageK        -0.0640551  0.1477020   -0.43            0.66452    
areaB       -0.0443848  0.1333282   -0.33            0.73921    
areaC       -0.2032366  0.1053918   -1.93            0.05381 .  
areaD        0.0451702  0.1120573    0.40            0.68688    
areaE        0.2571047  0.0977537    2.63            0.00854 ** 
areaF        0.1678213  0.0985564    1.70            0.08861 .  
areaG       -0.0372408  0.1053729   -0.35            0.72377    
areaH       -0.1393282  0.1238355   -1.13            0.26054    
rscore9     -0.2687036  0.0706527   -3.80            0.00014 ***
rscore8     -0.1971253  0.0793359   -2.48            0.01297 *  
rscore7     -0.1391499  0.0815604   -1.71            0.08799 .  
rscore6     -0.1400645  0.0760864   -1.84            0.06564 .  
rscore5     -0.2338014  0.0757662   -3.09            0.00203 ** 
rscore4     -0.2348815  0.0787455   -2.98            0.00286 ** 
rscore3     -0.2115949  0.0798278   -2.65            0.00803 ** 
rscore2     -0.3236706  0.0823394   -3.93 0.0000846162927277 ***
rscore1     -0.4411497  0.0840432   -5.25 0.0000001528565161 ***
fscore5      0.6633223  0.0610249   10.87            < 2e-16 ***
fscore7      0.7976561  0.0731868   10.90            < 2e-16 ***
fscore8      0.8323321  0.1048399    7.94 0.0000000000000020 ***
fscore9      0.9618447  0.1344541    7.15 0.0000000000008447 ***
fscore10     1.5428202  0.1953205    7.90 0.0000000000000028 ***
mscore2     -0.1606396  0.0708616   -2.27            0.02339 *  
mscore3     -0.1726033  0.0714829   -2.41            0.01575 *  
mscore4     -0.0610448  0.0718581   -0.85            0.39559    
mscore5     -0.1585113  0.0723829   -2.19            0.02853 *  
mscore6     -0.1808735  0.0730981   -2.47            0.01335 *  
mscore7     -0.1461266  0.0740250   -1.97            0.04838 *  
mscore8     -0.2478487  0.0767948   -3.23            0.00125 ** 
mscore9     -0.1981045  0.0813516   -2.44            0.01488 *  
mscore10    -0.1847960  0.0942401   -1.96            0.04989 *  
days         0.0240729  0.0032828    7.33 0.0000000000002250 ***
dayf        -0.0493936  0.0078936   -6.26 0.0000000003913264 ***
pieces       0.0000808  0.0022644    0.04            0.97153    
items        0.0083215  0.0030602    2.72            0.00654 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23142  on 19961  degrees of freedom
AIC: 23236

Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3785  818
  TRUE   1691 2282
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
[1] 0.7074
colAUC(pred, TS$buy)                                   # 0.7556
                 [,1]
FALSE vs. TRUE 0.7576


5. 線性迴歸模型

lm1 = lm(amount ~ ., TR2[,c(6,8:10,13:19)])
summary(lm1)

Call:
lm(formula = amount ~ ., data = TR2[, c(6, 8:10, 13:19)])

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8071 -0.2243  0.0486  0.2781  1.4810 

Coefficients:
              Estimate Std. Error t value       Pr(>|t|)    
(Intercept)  1.6306325  0.0940722   17.33        < 2e-16 ***
rev          0.3531444  0.0420181    8.40        < 2e-16 ***
ageB         0.0713390  0.0249489    2.86        0.00425 ** 
ageC         0.1167029  0.0229124    5.09 0.000000358579 ***
ageD         0.1203492  0.0226065    5.32 0.000000104091 ***
ageE         0.1278418  0.0231154    5.53 0.000000032786 ***
ageF         0.1014801  0.0241152    4.21 0.000025991936 ***
ageG         0.0790777  0.0263249    3.00        0.00267 ** 
ageH         0.0703731  0.0310595    2.27        0.02349 *  
ageI         0.0699467  0.0319027    2.19        0.02837 *  
ageJ        -0.0220438  0.0280813   -0.78        0.43248    
ageK         0.1281026  0.0391771    3.27        0.00108 ** 
areaB        0.0842920  0.0432456    1.95        0.05131 .  
areaC        0.0418660  0.0351167    1.19        0.23322    
areaD       -0.0078782  0.0369139   -0.21        0.83100    
areaE        0.0112585  0.0323958    0.35        0.72820    
areaF        0.0142555  0.0326285    0.44        0.66219    
areaG        0.0266225  0.0346978    0.77        0.44294    
areaH        0.0194646  0.0386084    0.50        0.61417    
rscore9     -0.0267572  0.0154468   -1.73        0.08327 .  
rscore8     -0.0242339  0.0185396   -1.31        0.19120    
rscore7     -0.0127723  0.0192583   -0.66        0.50721    
rscore6      0.0149635  0.0185977    0.80        0.42108    
rscore5     -0.0184456  0.0196912   -0.94        0.34892    
rscore4     -0.0212455  0.0216923   -0.98        0.32741    
rscore3     -0.0123278  0.0223010   -0.55        0.58042    
rscore2     -0.0232518  0.0237829   -0.98        0.32826    
rscore1      0.0044405  0.0251430    0.18        0.85982    
fscore5     -0.1020025  0.0244602   -4.17 0.000030718485 ***
fscore7     -0.1690644  0.0274724   -6.15 0.000000000787 ***
fscore8     -0.2114310  0.0324538   -6.51 0.000000000077 ***
fscore9     -0.2498990  0.0371616   -6.72 0.000000000019 ***
fscore10    -0.2711706  0.0482230   -5.62 0.000000019285 ***
mscore2      0.0556642  0.0262046    2.12        0.03368 *  
mscore3      0.0529852  0.0313072    1.69        0.09060 .  
mscore4      0.0925559  0.0354131    2.61        0.00897 ** 
mscore5      0.1199531  0.0392777    3.05        0.00226 ** 
mscore6      0.1197952  0.0428412    2.80        0.00518 ** 
mscore7      0.1419878  0.0465258    3.05        0.00228 ** 
mscore8      0.1550719  0.0507021    3.06        0.00223 ** 
mscore9      0.1896448  0.0558285    3.40        0.00068 ***
mscore10     0.1903761  0.0649944    2.93        0.00341 ** 
days         0.0018384  0.0007315    2.51        0.01198 *  
dayf        -0.0051296  0.0019069   -2.69        0.00716 ** 
pieces       0.0000864  0.0004220    0.20        0.83775    
items        0.0021006  0.0005885    3.57        0.00036 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.418 on 9223 degrees of freedom
Multiple R-squared:  0.304, Adjusted R-squared:  0.301 
F-statistic: 89.6 on 45 and 9223 DF,  p-value: <2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3042 0.2689

6. 決策樹

Turn on Parallel Processing
library(caret)
library(doParallel)
clust = makeCluster(detectCores())
#顯示X個執行緒
registerDoParallel(clust); getDoParWorkers()
[1] 4
set.seed(2)
t0 = Sys.time()
cv1 = train(
  amount ~ ., data = TR2[,c(6:10, 12:18)], method = "rpart", 
  trControl = trainControl(method = "cv", number=10), metric="Rsquared",
  tuneGrid = expand.grid(cp = seq(0,0.01,0.0002)) 
  )
Warning in .Internal(gc(verbose, reset, full)) :
  closing unused connection 10 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
  closing unused connection 9 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
  closing unused connection 8 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
  closing unused connection 7 (<-localhost:11038)
cv1; plot(cv1)
CART 

9269 samples
  11 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 8341, 8342, 8341, 8343, 8342, 8342, ... 
Resampling results across tuning parameters:

  cp      RMSE    Rsquared  MAE   
  0.0000  0.4780  0.1833    0.3725
  0.0002  0.4702  0.1954    0.3653
  0.0004  0.4520  0.2210    0.3478
  0.0006  0.4331  0.2604    0.3324
  0.0008  0.4293  0.2682    0.3301
  0.0010  0.4271  0.2734    0.3285
  0.0012  0.4265  0.2748    0.3281
  0.0014  0.4261  0.2754    0.3279
  0.0016  0.4260  0.2753    0.3277
  0.0018  0.4258  0.2761    0.3278
  0.0020  0.4261  0.2753    0.3282
  0.0022  0.4269  0.2724    0.3289
  0.0024  0.4272  0.2717    0.3292
  0.0026  0.4271  0.2718    0.3290
  0.0028  0.4275  0.2705    0.3294
  0.0030  0.4279  0.2691    0.3297
  0.0032  0.4282  0.2680    0.3300
  0.0034  0.4284  0.2675    0.3302
  0.0036  0.4285  0.2671    0.3303
  0.0038  0.4285  0.2671    0.3303
  0.0040  0.4288  0.2661    0.3306
  0.0042  0.4292  0.2647    0.3309
  0.0044  0.4301  0.2616    0.3319
  0.0046  0.4301  0.2616    0.3319
  0.0048  0.4301  0.2616    0.3319
  0.0050  0.4307  0.2596    0.3323
  0.0052  0.4320  0.2548    0.3335
  0.0054  0.4321  0.2547    0.3337
  0.0056  0.4332  0.2505    0.3345
  0.0058  0.4344  0.2466    0.3354
  0.0060  0.4344  0.2466    0.3354
  0.0062  0.4352  0.2437    0.3359
  0.0064  0.4355  0.2429    0.3360
  0.0066  0.4361  0.2406    0.3365
  0.0068  0.4361  0.2410    0.3365
  0.0070  0.4361  0.2410    0.3365
  0.0072  0.4361  0.2410    0.3365
  0.0074  0.4361  0.2410    0.3365
  0.0076  0.4358  0.2422    0.3363
  0.0078  0.4361  0.2409    0.3368
  0.0080  0.4362  0.2407    0.3370
  0.0082  0.4363  0.2403    0.3370
  0.0084  0.4363  0.2403    0.3370
  0.0086  0.4363  0.2403    0.3370
  0.0088  0.4365  0.2396    0.3372
  0.0090  0.4365  0.2396    0.3372
  0.0092  0.4365  0.2396    0.3372
  0.0094  0.4365  0.2396    0.3372
  0.0096  0.4365  0.2396    0.3372
  0.0098  0.4365  0.2396    0.3372
  0.0100  0.4365  0.2396    0.3372

Rsquared was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0018.

Sys.time() - t0
Time difference of 12.69 secs
rpart = rpart(amount ~ ., TR2[,c(6:10, 12:18)], cp=0.0016) 
SST = sum((TS2$amount - mean(TR2$amount))^ 2) 
SSE = sum((predict(rpart, TS2) - TS2$amount)^2) 
1 - (SSE/SST) # 0.2460567
[1] 0.253
prp(rpart) 
Bad 'data' field in model 'call'.
To silence this warning:
    Call prp with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.

#rpart.plot(rpart3,cex=0.6)







---
title: <font color=gray size=7>期中資料分析競賽</font>
author: Group6，陳振嘉 陳偉文 張叡哲 魏嘉妤 楊明修 洪筱涵
date: "`r Sys.time()`"
output: html_notebook
---




***
### 1.資料準備

```{r echo=T, message=F, cache=F, warning=F}
Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
library(rpart) 
library(rpart.plot)
```

##### Loading Data
```{r}
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
```

***

### 2.從現有模型中尋找新變數

##### 2.1 把area換算成距離
```{r}
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
```
+ 本組認為距離的遠近可能會影響顧客來購買意願
+ 因共線性問題，無法同時將距離變數與area欄位放入迴歸模型中
+ 由迴歸結果發現在此模型下area比距離更夠有解釋力

##### 2.2 把pieces/items加入變數來討論星期幾對於這兩個變數是否有解釋力
```{r}
X = X %>% mutate(wday = format(date, "%w"))
wp=aggregate(X$pieces, by=list(Category=X$wday), FUN=sum)
wi=aggregate(X$items, by=list(Category=X$wday), FUN=sum)
barplot(wp$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #pieces與星期的關係
barplot(wi$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #items與星期的關係
```
+ 週末之購買件數(pieces)與購買商品種類數(items)較平日多
+ 因此最終模型選擇將pieces與items加入自變數

##### 2.3 RFM模型

<font color=#000080 size=3> RFM模型是由喬治·卡利南（George Cullinan）於1961年所提出，他發現資料庫分析中，有三項重要的指標：最近一次消費（Recency）、消費頻率（Frequency）、與消費金額（Monetary）</font>

```{r}
#新增rscore
A <- A %>% mutate(
    rscore = cut(
    A$r,  
    breaks = c(1,4,10,12,15,21,33,48,60,72,92), 
    include.lowest = T,  
    labels = c(10,9,8,7,6,5,4,3,2,1)
  )
)
```
+ 1.3.1最近一次消費（Recency）：指消費者至今再次購買與上次購買產品的時間差，舉例來說，將「購買日期分為10等分」，每一等分為資料庫的10%。
+ 最近消費的前10%，編碼為10
+ 10%~20%編碼為9，以此類推
+ 90%~100%編碼為1
+ 編碼等級越高的消費者，重複購買比率較高。

```{r}
#新增fscore
A <- A %>% mutate(
    fscore = cut(
    A$f,  
    breaks = c(0,1,2,3,4,6,60),  
    include.lowest = T,  
    labels = c(1,5,7,8,9,10) 
  )
)
```
+ 1.3.2 消費頻率（Frequency）：指消費者在一定期間內購買該產品的次數。
+ 次數最多的前10%，編碼為10
+ 10%~20%編碼為9，以此類推
+ 90%~100%編碼為1
+ 編碼等級越高的消費者，其消費頻率越高，忠誠度與顧客價值也越高。

```{r}
#新增mscore
A <- A %>% mutate(
    mscore = cut(
    A$m,   
    breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),  
    include.lowest = T,  
    labels = c(1,2,3,4,5,6,7,8,9,10) 
  )
)
```
+ 1.3.3 消費金額（Monetary）：指消費者在一定期間內購買該產品的總金額。
+ 金額最大的前10%，編碼為10
+ 10%~20%編碼為9，以此類推
+ 90%~100%編碼為1
+ 編碼等級越高的消費者，其消費金額越高，顧客價值也越高。
+ [RFM模型參考資料](https://reurl.cc/MvNl3)

##### 2.4 新增時間變數
```{r}
A$days = (A$s - A$r) #第一次購買至最後一次購買之間隔天數
A$dayf = A$days/A$f  #間隔天數/總購買次數＝平均幾天會來一次
```
+ 可看出**若`A$days`越大代表此顧客越依賴本商店**之趨勢

```{r}
cor(A$f, A$days)
ggplot(A, aes(days, f))+ geom_point(col="#6495ED")+ geom_smooth(col="red")
```

+ 若`A$dayf`值越小代表顧客越依賴本商店
+ 本組發現`A$days`與`A$dayf`有高度相關，但在最後的迴歸模型中只放一個變數解釋力反而下降

```{r}
cor(A$dayf, A$days)
ggplot(A, aes(days, dayf))+ geom_point(col="#6495ED")
```


<hr>

### 3.資料準備

```{r}
#在資料中新增pieces與items欄位
B = group_by(X, cust) %>% summarise(
  pieces = sum(pieces),
  items = sum(items)
  ) %>% data.frame    
A = merge(A, B, by="cust", all.x=T)
A$days = (A$s - A$r)
A$dayf = A$days/A$f
```


```{r}
#新增rscore
A <- A %>% mutate(
    rscore = cut(
    A$r,  
    breaks = c(1,4,10,12,15,21,33,48,60,72,92), 
    include.lowest = T,  
    labels = c(10,9,8,7,6,5,4,3,2,1)
  )
)
```


```{r}
#新增fscore
A <- A %>% mutate(
    fscore = cut(
    A$f,  
    breaks = c(0,1,2,3,4,6,60),  
    include.lowest = T,  
    labels = c(1,5,7,8,9,10) 
  )
)
```

```{r}
#新增mscore
A <- A %>% mutate(
    mscore = cut(
    A$m,   
    breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),  
    include.lowest = T,  
    labels = c(1,2,3,4,5,6,7,8,9,10) 
  )
)
```


```{r}
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
table(A$distance)
```

##### 3.1 Splitting for Logistic Regression 
```{r}
TR = subset(A, spl)
TS = subset(A, !spl)
```
##### 3.2 Splitting for Linear Regression
```{r}
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
```

<br><hr>

### 4. 邏輯迴歸模型
```{r}
glm1 = glm(buy ~ ., TR[,c(6:9,11, 13:19)], family=binomial()) 
summary(glm1)
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
colAUC(pred, TS$buy)                                   # 0.7556
```

+ ageD,ageE(35~44歲)購買的機率相較其它年齡層高，可推斷為年輕家庭
+ fscore之5~10組可看出正向關聯，且係數皆顯著，代表購買頻率影響較為明顯
+ mscore較無法看出組別間與係數之關聯
+ 因本組新增rscore fscore mscore 因此本組將rfm變數拿掉
+ 本組覺得距今第一次購買日期的天數(s)影響不大且本組有新增平均購買天數，因此本組將s拿掉


<br><hr>

### 5. 線性迴歸模型

```{r}
lm1 = lm(amount ~ ., TR2[,c(6,8:10,13:19)])
summary(lm1)
```

+ ageD與ageE之係數同樣明顯較其他年齡層高，可視為消費主力
+ mscore之係數隨組別上升，可看出過往客單價越高會正向影響未來客單價
+ fscore與rscore則較無法看出組間之差異


```{r}
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
```


***

### 6. 決策樹

##### Turn on Parallel Processing
```{r}
library(caret)
library(doParallel)

clust = makeCluster(detectCores())
#顯示X個執行緒
registerDoParallel(clust); getDoParWorkers()
```

```{r}
set.seed(2)
t0 = Sys.time()
cv1 = train(
  amount ~ ., data = TR2[,c(6:10, 12:18)], method = "rpart", 
  trControl = trainControl(method = "cv", number=10), metric="Rsquared",
  tuneGrid = expand.grid(cp = seq(0,0.01,0.0002)) 
  )
cv1; plot(cv1)
Sys.time() - t0
```

```{r}
rpart = rpart(amount ~ ., TR2[,c(6:10, 12:18)], cp=0.0016) 
SST = sum((TS2$amount - mean(TR2$amount))^ 2) 
SSE = sum((predict(rpart, TS2) - TS2$amount)^2) 
1 - (SSE/SST) # 0.2460567
```

```{r}
prp(rpart) 
#rpart.plot(rpart3,cex=0.6)
```


<br><br><br><hr><br><br><br>

<style>

.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

.qiz {
  line-height: 1.75;
  background: #f0f0f0;
  border-left: 12px solid #ccffcc;
  padding: 4px;
  padding-left: 10px;
  color: #009900;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #4682B4;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}


h3{
  color: ##00008B;
  background: #F0F8FF;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #8470FF;
  background: #F8F8FF;
  line-height: 1.5;
  font-weight: bold;
}

</style>


