---
title: "期中小組競賽, Ta-Feng"
author: "第一組-劉育銘、王淯佳、黃柏融、余曜廷、林俞伶、陳正謀"
date: "`r Sys.time()`"
output: html_notebook
---

+ **Using the first three months of data, predict whether each customer will come back and buy in the fourth month.**





```{r echo=T, message=F, cache=F, warning=F}
Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
library(lubridate)
library(corrplot)
```





### 1. Modeling with the Original Variables: glm


##### Load the Data
```{r}
load(file='data/tf2.Rdata')
```

##### Split into TR and TS
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

```
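The logical vector `spl` ships inside `tf2.Rdata`. For reference, a split like this is typically produced with `caTools::sample.split()` — a sketch under that assumption, not the original preprocessing code:

```{r eval=FALSE}
# Hypothetical reconstruction of how spl could be built (caTools is loaded above).
# sample.split() stratifies on the outcome, so TR and TS keep the same buy rate.
set.seed(2018)                               # seed value is illustrative only
spl = sample.split(A$buy, SplitRatio = 0.7)  # TRUE = training row
```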


```{r}
is.na(TR) %>% colSums() # count the NAs in each column of TR
```


##### Build the Model (glm)
```{r}
cx=c(2:9,11)
colnames(TR[,cx])
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
summary(glm1)
pred =  predict(glm1, TS, type="response")
```


##### Accuracy and AUC of glm
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts    # 0.6999767
colAUC(pred, TS$buy)   #0.7556038                                

```
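Accuracy depends on the 0.5 cutoff, while AUC does not. A quick sketch (our addition) of how accuracy moves with the cutoff:

```{r eval=FALSE}
# Accuracy at a few cutoffs; TS$buy is still logical here, so a direct
# comparison with the thresholded probabilities works.
sapply(c(0.4, 0.5, 0.6), function(k) mean((pred > k) == TS$buy))
```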






- - -

### 2. Creating New Variables
```{r}
join_df = group_by(X, cust) %>% summarise(
  T11 = sum(month(date) == 11),   # visits in November
  T12 = sum(month(date) == 12),   # visits in December
  T1  = sum(month(date) == 1)     # visits in January
  ) %>% data.frame                # 28584 customers

join_df
```
_<span id='A1'> note </span>_

T11, T12, T1: each customer's total number of store visits in November, December, and January, respectively.
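
To see how these counts come out, a minimal sketch on hypothetical toy data (not part of the dataset):

```{r eval=FALSE}
toy = data.frame(cust = c(1, 1, 2),
                 date = as.Date(c("2000-11-05", "2000-12-20", "2000-11-07")))
group_by(toy, cust) %>% summarise(
  T11 = sum(month(date) == 11),   # cust 1 -> 1, cust 2 -> 1
  T12 = sum(month(date) == 12))   # cust 1 -> 1, cust 2 -> 0
```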

- - - 


##### Merge T11, T12, T1 into A (left join):
```{r}
A = merge(A, join_df, by="cust", all.x=T)
A
```

- - -


##### Create New Variables (2)
```{r}
# customer spending in November
Nov = filter(X, month(date)==11 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_nov = sum(total),
    items_nov=sum(items),
    pieces_nov=sum(pieces),
    gross_nov=sum(gross)
  ) 
Nov
```


```{r}
# customer spending in December
Dec = filter(X, month(date)==12 ) %>%
  group_by(cust) %>% 
  summarise(
    amount_dec = sum(total),
    items_dec=sum(items),
    pieces_dec=sum(pieces),
    gross_dec=sum(gross)
  ) 
Dec
```



```{r}
# customer spending in January
Jan = filter(X, month(date)==1 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_m1 = sum(total),
    items_m1=sum(items),
    pieces_m1=sum(pieces),
    gross_m1=sum(gross),
    price_m1=sum(gross)  # note: identical to gross_m1; kept so the column positions used below (15:27, cx) still line up
  ) 
Jan
```
_<span id='A1'> note </span>_ 

+ Build each customer's November, December, and January spending features (total / items / pieces / gross); after trying combinations of them in the model, amount_m1 turned out to have the most significant effect. (A one-pass alternative is sketched right below.)
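
The three per-month chunks above could also be produced in a single grouped pass; a sketch, assuming dplyr >= 1.0 for the `.groups` argument:

```{r eval=FALSE}
# One-pass version of the Nov/Dec/Jan summaries (long format; would still
# need reshaping to the _nov/_dec/_m1 column layout used above).
X %>%
  mutate(mon = month(date)) %>%
  filter(mon %in% c(11, 12, 1)) %>%
  group_by(cust, mon) %>%
  summarise(amount = sum(total), items = sum(items),
            pieces = sum(pieces), gross = sum(gross), .groups = "drop")
```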


- - -


##### Merge the Variables into A (left join)
```{r}

A = merge(A, Nov, by="cust", all.x=T)
A = merge(A, Dec, by="cust", all.x=T)
A = merge(A, Jan, by="cust", all.x=T)
A
```

- - -

##### Impute NAs with Column Means
```{r}
for(i in 15:27){
  mean_col <- mean(A[, i], na.rm = T)  # mean of the i-th column
  na.rows <- is.na(A[, i])             # rows where the i-th column is NA
  A[na.rows, i] <- mean_col
}


```
##### <span id='A1'> note </span>
![Fig-1: NAs filled in](fig/na.png)
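
The loop above can also be written without indices; an equivalent vectorized sketch:

```{r eval=FALSE}
# Same mean-imputation over columns 15:27, vectorized with lapply().
A[15:27] = lapply(A[15:27], function(v) replace(v, is.na(v), mean(v, na.rm = TRUE)))
```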





- - -

##### Create New Variables (3)

```{r}
A$amount_total=A$amount_nov+A$amount_dec+A$amount_m1
A$gross_total=A$gross_nov+A$gross_dec+A$gross_m1
A$items_total=A$items_nov+A$items_dec+A$items_m1
A$pieces_total=A$pieces_nov+A$pieces_dec+A$pieces_m1
```


```{r}

A$f_itemtotal=A$f*A$items_total
A$f_amounttotal=A$f*A$amount_total
```

##### Split into TR and TS
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

```
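As an aside, the f × items_total and f × amount_total products could equally be declared as formula interactions instead of stored columns; a sketch (not used below, which keeps the precomputed variables):

```{r eval=FALSE}
# f:items_total / f:amount_total build the same product terms in the design matrix.
glm(buy ~ r + s + f + m + rev + raw + age + area + T1 + amount_m1 +
      amount_total + f:items_total + f:amount_total,
    TR, family = binomial())
```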

- - -

##### Build a Model with the New Variables (glm)
```{r}
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
summary(glm1)
pred =  predict(glm1, TS, type="response")
```

- - -

##### Accuracy and AUC of glm1
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.7034748
colAUC(pred, TS$buy)                                
#0.7575633
```



- - -


##### Automatic Variable Selection with step()
```{r}
glm1_step = step(glm1, direction = 'backward')
# Backward AIC selection drops age, f_amounttotal, and pieces_total
# (final AIC = 23284.4), ending at:
#   buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 +
#         amount_total + items_total + f_itemtotal
# (Some intermediate sub-models trigger glm.fit warnings: fitted
#  probabilities numerically 0 or 1.)
pred =  predict(glm1_step, TS, type="response")
```


##### Accuracy and AUC of glm1_step
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts     # 0.7023088
colAUC(pred, TS$buy)                                
#0.7577661
```





- - -


##### Check for Collinearity
```{r}
TR$amount[is.na(TR$amount)] = 0      # fill missing amounts before transforming
TR$amount_new = log(1 + TR$amount)   # log-scaled copy (not used in cor below)
cx=c(2:7,10)
colnames(TR[,cx])
cor(TR[,cx]) %>% corrplot()

```
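Beyond eyeballing the corrplot, caret (already loaded) can flag the columns whose pairwise correlation exceeds a cutoff; a sketch (the `names` argument needs a reasonably recent caret):

```{r eval=FALSE}
# Names of columns caret would suggest dropping at |r| > 0.8.
findCorrelation(cor(TR[,cx]), cutoff = 0.8, names = TRUE)
```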



- - -

### Decision Tree
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])

rpart1 = rpart(buy ~ ., TR,          # note: fit on the full TR, not just TR[,cx]
               method="class", cp=0.001) 
```


##### Plot the Decision Tree
```{r}
library(rpart.plot)
rpart.plot(rpart1, cex=0.6)   # cex controls the font size; this call draws a more detailed tree
```

##### Predict with the Decision Tree
```{r}
pred = predict(rpart1, TS, type = "class")  # predict classes; use type="prob" for probabilities
x = table(actual = TS$buy, pred); x         # with probabilities, tabulate pred > 0.5 instead
sum(diag(x))/sum(x)                         # 0.7056903
```
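As the comments note, the same confusion matrix can be reproduced from probabilities; a short sketch:

```{r eval=FALSE}
# type="prob" plus a 0.5 cutoff reproduces the type="class" prediction here.
p = predict(rpart1, TS, type = "prob")[,2]
table(actual = TS$buy, predict = p > 0.5)
```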

##### ROC & AUC
```{r}
library(ROCR)
PredictROC = predict(rpart1, TS)          # predicted probabilities: rpart returns one column per class, hence this extra step (glm returns a single vector)
head(PredictROC)
perf = prediction(PredictROC[,2], TS$buy) # the ROC only needs the probability of buy = TRUE
perf = performance(perf, "tpr", "fpr")
plot(perf)
```

```{r}
pred = predict(rpart1, TS)[,2]     # prob. of buy = TRUE
colAUC(pred, TS$buy, T)            # plotROC = T also draws the curve; AUC = 0.7144386
```

##### Enable Parallel Computing
```{r}
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
```
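One housekeeping step worth adding once the CV runs below are done (our addition):

```{r eval=FALSE}
stopCluster(clust)        # release the worker processes
registerDoSEQ()           # fall back to sequential execution
```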

### Cross-Validation



##### Split into TR and TS
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])

TR$buy = factor(ifelse(TR$buy, "yes", "no"))  # caret's classProbs needs valid level names, so recode TRUE/FALSE as "yes"/"no"
TS$buy = factor(ifelse(TS$buy, "yes", "no"))
```



##### CV Control for Classification
```{r}
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)
```
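Setting `ctrl$repeats = 2` after the fact (as done below) works, but the repeats can also be declared up front; an equivalent sketch:

```{r eval=FALSE}
ctrl = trainControl(
  method = "repeatedcv", number = 10, repeats = 2,   # 10-fold CV, repeated twice
  savePredictions = "final", classProbs = TRUE,
  summaryFunction = twoClassSummary)
```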

##### CV: `rpart()`, Classification Tree 
```{r}
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR[,cx], method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
```

```{r fig.height=3, fig.width=7}
plot(cv.rpart)
```

```{r}
cv.rpart$results 
```

##### Decision Tree, Final Model
```{r}
rpart1 = rpart(buy ~ ., TR[,cx], method="class", cp=0.0006)
predict(rpart1, TS, type="prob")[,2] %>% 
  colAUC(TS$buy) #0.7162952
```
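Rather than hard-coding cp, the CV winner can be read straight off the train object; a sketch:

```{r eval=FALSE}
cv.rpart$bestTune$cp      # cp value that maximized the CV ROC
rpart1 = rpart(buy ~ ., TR[,cx], method = "class", cp = cv.rpart$bestTune$cp)
```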


##### CV: glm
```{r}
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
  buy ~ ., data=TR[,cx], method="glm", 
  trControl=ctrl, metric="ROC")
Sys.time() - t0
```

```{r}
cv.glm$results
```

##### `glm()`, Final Model
```{r}
glm1 = glm(buy ~ ., TR[,cx], family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy) #0.7575633
```
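To close, the two final models can be compared on TS in one call, since `colAUC()` accepts a matrix with one column per model; a sketch:

```{r eval=FALSE}
# AUC of the final glm and the final tree, side by side.
colAUC(cbind(glm  = predict(glm1,  TS, type = "response"),
             tree = predict(rpart1, TS, type = "prob")[,2]),
       TS$buy)
```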






<br><br><br><br><hr><br><br><br>

<style>
.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #008800;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #ffffe0;
  line-height: 2;
  font-weight: bold;
}
h6{
  color: #006000;
  background: #00ffff;
  line-height: 2;
  font-weight: bold;

}
em{
  color: #FFEA6C;
  background: #7D7D7D;
  }
  
table, th, td {
    border: 1px solid black;
}

</style>