建立模型 製作新變數 調整變數 挑選變數

Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(lubridate)
library(rpart.plot)
library(corrplot)
library(ggplot2)

一、使用原始變數建立模型:glm

載入檔案
load(file='data/tf2.Rdata')
切割TR、TS
TR=subset(A,spl)
TS=subset(A,!spl)
is.na(TR) %>% colSums() #計算TR的NA數量
  cust      r      s      f      m    rev    raw    age   area amount 
     0      0      0      0      0      0      0      0      0  10739 
   buy 
     0 
建立模型(glm)
cx=c(2:9,11)
colnames(TR[,cx])
[1] "r"    "s"    "f"    "m"    "rev"  "raw"  "age"  "area" "buy" 
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
summary(glm1)

Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, cx])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7931  -0.8733  -0.6991   1.0384   1.8735  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
r           -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
s            9.566e-03  9.101e-04  10.511  < 2e-16 ***
f            2.905e-01  1.593e-02  18.233  < 2e-16 ***
m           -3.028e-05  2.777e-05  -1.090  0.27559    
rev          4.086e-05  1.940e-05   2.106  0.03521 *  
raw         -2.306e-04  8.561e-05  -2.693  0.00708 ** 
ageB        -4.194e-02  8.666e-02  -0.484  0.62838    
ageC         1.772e-02  7.992e-02   0.222  0.82456    
ageD         7.705e-02  7.921e-02   0.973  0.33074    
ageE         8.699e-02  8.132e-02   1.070  0.28476    
ageF         1.928e-02  8.457e-02   0.228  0.81962    
ageG         1.745e-02  9.323e-02   0.187  0.85155    
ageH         1.752e-01  1.094e-01   1.602  0.10926    
ageI         6.177e-02  1.175e-01   0.526  0.59904    
ageJ         2.652e-01  1.047e-01   2.533  0.01131 *  
ageK        -1.419e-01  1.498e-01  -0.947  0.34347    
areaB       -4.105e-02  1.321e-01  -0.311  0.75603    
areaC       -2.075e-01  1.045e-01  -1.986  0.04703 *  
areaD        3.801e-02  1.111e-01   0.342  0.73214    
areaE        2.599e-01  9.682e-02   2.684  0.00727 ** 
areaF        1.817e-01  9.753e-02   1.863  0.06243 .  
areaG       -4.677e-02  1.045e-01  -0.448  0.65435    
areaH       -1.695e-01  1.232e-01  -1.375  0.16912    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23295  on 19984  degrees of freedom
AIC: 23343

Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
glm的Accuracy及AUC
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3730  873
  TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts       
[1] 0.6999767
colAUC(pred, TS$buy)   #0.7556038                                
                    [,1]
FALSE vs. TRUE 0.7556038
檢查共線性
cx=c(2:7,11)
colnames(TR[,cx])
[1] "r"   "s"   "f"   "m"   "rev" "raw" "buy"
cor(TR[,cx]) %>% corrplot.mixed()


二、製作新變數(1)

join_df=group_by(X, cust) %>% summarise(
 
  T11=(month(date)==11) %>% sum ,
  T12=(month(date)==12) %>% sum ,
  T1=(month(date)==1) %>% sum
  ) %>% data.frame    # 28584
join_df

note

T11、T12、T1:將顧客分別在11、12、1月來店次數的總和。


合併變數T11、T12、T1到A (Left join):
A = merge(A, join_df, by="cust", all.x=T)
A
tapply(A$buy, A$T11, mean) %>% barplot

tapply(A$buy, A$T12, mean) %>% barplot

tapply(A$buy, A$T1, mean) %>% barplot


製作新變數(2)
#顧客在11月的消費
Nov = filter(X, month(date)==11 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_nov = sum(total),#消費總額
    items_nov=sum(items),#交易件數
    pieces_nov=sum(pieces),#購買商品個數
    gross_nov=sum(gross)
  ) 
Nov
#顧客在12月的消費
Dec = filter(X, month(date)==12 ) %>%
  group_by(cust) %>% 
  summarise(
    amount_dec = sum(total),
    items_dec=sum(items),
    pieces_dec=sum(pieces),
    gross_dec=sum(gross)
  ) 
Dec
#顧客在1月的消費
Jan = filter(X, month(date)==1 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_m1 = sum(total),#消費總額
    items_m1=sum(items),#交易件數
    pieces_m1=sum(pieces),#購買商品個數
    gross_m1=sum(gross)
  ) 
Jan

note

  • 分別製作出顧客在11、12、1月的消費(total/items/pieces/gross),丟進模型排列組合過後發現amount_m1的效果是最顯著的

合併變數到A(Left Join)
A = merge(A, Nov, by="cust", all.x=T)
A = merge(A, Dec, by="cust", all.x=T)
A = merge(A, Jan, by="cust", all.x=T)
A

用平均值填補NA
for(i in 15:24){
  mean_col <- mean(A[, i], na.rm = T)  # mean of col ith
  na.rows <- is.na(A[, i])   #col ith na data
  A[na.rows, i] <- mean_col
}
圖片
Figure - 填補NA

Figure - 填補NA


製作新變數(3)
A$amount_total=A$amount_nov+A$amount_dec+A$amount_m1
A$gross_total=A$gross_nov+A$gross_dec+A$gross_m1
A$items_total=A$items_nov+A$items_dec+A$items_m1
A$pieces_total=A$pieces_nov+A$pieces_dec+A$pieces_m1

調整變數

A$f_itemtotal=A$f*A$items_total
A$f_amounttotal=A$f*A$amount_total
A$f2=A$f^4*A$m^4
A$f3=A$r^4
A$f4=A$s^4
切割TR與TS
TR=subset(A,spl)
TS=subset(A,!spl)

用新變數來建立模型(glm)
cx=c(2:9,11,14,23,27,29,31,32,33,34)
colnames(TR[,cx])
 [1] "r"             "s"             "f"             "m"            
 [5] "rev"           "raw"           "age"           "area"         
 [9] "buy"           "T1"            "amount_m1"     "amount_total" 
[13] "items_total"   "f_itemtotal"   "f_amounttotal" "f2"           
[17] "f3"           
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(glm1)
pred =  predict(glm1, TS, type="response")

glm1的Accuracy及AUC
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3738  865
  TRUE   1692 2281
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   #0.7017257     
[1] 0.7018424
colAUC(pred, TS$buy)                                
                    [,1]
FALSE vs. TRUE 0.7578508
#0.7579886

用step自動挑選變數

glm1_step=step(glm1,direction = 'backward')
Start:  AIC=23259.64
buy ~ r + s + f + m + rev + raw + age + area + T1 + amount_m1 + 
    amount_total + items_total + f_itemtotal + f_amounttotal + 
    f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
                Df Deviance   AIC
- age           10    23213 23257
- r              1    23196 23258
- amount_m1      1    23197 23259
<none>                23196 23260
- f_amounttotal  1    23199 23261
- f_itemtotal    1    23203 23265
- m              1    23205 23267
- f2             1    23208 23270
- raw            1    23210 23272
- f3             1    23214 23276
- amount_total   1    23214 23276
- s              1    23214 23276
- items_total    1    23217 23279
- T1             1    23223 23285
- rev            1    23233 23295
- area           7    23296 23346
- f              1    23355 23417
glm.fit: fitted probabilities numerically 0 or 1 occurred

Step:  AIC=23256.62
buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total + 
    items_total + f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
                Df Deviance   AIC
- r              1    23213 23255
- amount_m1      1    23214 23256
<none>                23213 23257
- f_amounttotal  1    23216 23258
- f_itemtotal    1    23220 23262
- m              1    23222 23264
- f2             1    23225 23267
- raw            1    23227 23269
- s              1    23231 23273
- f3             1    23232 23274
- amount_total   1    23232 23274
- items_total    1    23234 23276
- T1             1    23241 23283
- rev            1    23251 23293
- area           7    23315 23345
- f              1    23372 23414
glm.fit: fitted probabilities numerically 0 or 1 occurred

Step:  AIC=23254.77
buy ~ s + f + m + rev + raw + area + T1 + amount_m1 + amount_total + 
    items_total + f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
                Df Deviance   AIC
- amount_m1      1    23214 23254
<none>                23213 23255
- f_amounttotal  1    23216 23256
- f_itemtotal    1    23220 23260
- m              1    23223 23263
- f2             1    23226 23266
- raw            1    23228 23268
- items_total    1    23234 23274
- amount_total   1    23235 23275
- s              1    23238 23278
- f3             1    23244 23284
- T1             1    23251 23291
- rev            1    23260 23300
- area           7    23315 23343
- f              1    23377 23417
glm.fit: fitted probabilities numerically 0 or 1 occurred

Step:  AIC=23253.94
buy ~ s + f + m + rev + raw + area + T1 + amount_total + items_total + 
    f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
                Df Deviance   AIC
<none>                23214 23254
- f_amounttotal  1    23217 23255
- f_itemtotal    1    23221 23259
- m              1    23224 23262
- f2             1    23227 23265
- raw            1    23228 23266
- amount_total   1    23235 23273
- items_total    1    23235 23273
- s              1    23241 23279
- f3             1    23245 23283
- T1             1    23258 23296
- rev            1    23260 23298
- area           7    23316 23342
- f              1    23377 23415
pred =  predict(glm1_step, TS, type="response")
glm1_step的Accuracy及AUC
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3741  862
  TRUE   1694 2279
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts     #0.7028918     
[1] 0.701959
colAUC(pred, TS$buy)                                
                    [,1]
FALSE vs. TRUE 0.7578848
#0.7581489
CV: glm
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
  buy ~ ., data=TR[,cx], method="glm", 
  trControl=ctrl, metric="ROC")
Sys.time() - t0
cv.glm$results
##### glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)









---
title: "期中小組競賽, Ta-Feng"
author: "第一組-劉育銘、王淯佳、黃柏融、余曜廷、林俞伶、陳正謀"
date: "`r Sys.time()`"
output: html_notebook
---

+ **使用前三個月的資料，預測顧客在第四個月會不會來買**


<div id="mySidenav" class="sidenav">
  <a href="#glm" id="about">建立模型</a>
  <a href="#Add_x" id="blog">製作新變數</a>
  <a href="#Adjust_x" id="projects">調整變數</a>
  <a href="#select_x" id="contact">挑選變數</a>
</div>


```{r echo=T, message=F, cache=F, warning=F}
Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
library(lubridate)
library(rpart.plot)
library(corrplot)
library(ggplot2)
```





### <span id="glm">一、使用原始變數建立模型:glm</span>


##### 載入檔案
```{r}
load(file='data/tf2.Rdata')
```

##### 切割TR、TS
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

```


```{r}
is.na(TR) %>% colSums() #計算TR的NA數量
```


##### 建立模型(glm)
```{r}
cx=c(2:9,11)
colnames(TR[,cx])
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
summary(glm1)
pred =  predict(glm1, TS, type="response")
```


##### glm的Accuracy及AUC
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts       
colAUC(pred, TS$buy)   #0.7556038                                

```

##### 檢查共線性
```{r}


cx=c(2:7,11)
colnames(TR[,cx])
cor(TR[,cx]) %>% corrplot.mixed()

```


<!-- ##### 2. 使用原始變數建立模型:決策樹 -->
<!-- ```{r} -->
<!-- library(rpart.plot) -->
<!-- cx=c(2,3,5:9,11,14,18) -->
<!-- colnames(TR[,cx]) -->
<!-- cart1 = rpart(buy~., TR[,cx], method='class') -->
<!-- prp(cart1, cex=0.75) -->

<!-- ``` -->

<!-- ##### 2.1 決策樹的Accuracy及AUC -->
<!-- ```{r} -->
<!-- p.cart = pred = predict(cart1, TS)[,2] -->
<!-- table(TS$buy, pred > 0.5) -->
<!-- colAUC( p.cart, TS$buy ) -->
<!-- ``` -->



- - -

### <span id='Add_x'>二、製作新變數(1)</span>
```{r}
join_df=group_by(X, cust) %>% summarise(
 
  T11=(month(date)==11) %>% sum ,
  T12=(month(date)==12) %>% sum ,
  T1=(month(date)==1) %>% sum
  ) %>% data.frame    # 28584

join_df
```
_<span id='A1'> note </span>_

T11、T12、T1:將顧客分別在11、12、1月來店次數的總和。

- - - 


##### 合併變數T11、T12、T1到A (Left join):
```{r}
A = merge(A, join_df, by="cust", all.x=T)
A
```


```{r}
tapply(A$buy, A$T11, mean) %>% barplot
```

```{r}
tapply(A$buy, A$T12, mean) %>% barplot
```

```{r}
tapply(A$buy, A$T1, mean) %>% barplot
```

- - -


##### 製作新變數(2)
```{r}
#顧客在11月的消費
Nov = filter(X, month(date)==11 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_nov = sum(total),#消費總額
    items_nov=sum(items),#交易件數
    pieces_nov=sum(pieces),#購買商品個數
    gross_nov=sum(gross)
  ) 
Nov
```


```{r}
#顧客在12月的消費
Dec = filter(X, month(date)==12 ) %>%
  group_by(cust) %>% 
  summarise(
    amount_dec = sum(total),
    items_dec=sum(items),
    pieces_dec=sum(pieces),
    gross_dec=sum(gross)
  ) 
Dec
```



```{r}
#顧客在1月的消費
Jan = filter(X, month(date)==1 ) %>% 
  group_by(cust) %>% 
  summarise(
    amount_m1 = sum(total),#消費總額
    items_m1=sum(items),#交易件數
    pieces_m1=sum(pieces),#購買商品個數
    gross_m1=sum(gross)
  ) 
Jan
```
_<span id='A1'> note </span>_ 

+ 分別製作出顧客在11、12、1月的消費(total/items/pieces/gross)，丟進模型排列組合過後發現amount_m1的效果是最顯著的


- - -


##### 合併變數到A(Left Join)
```{r}

A = merge(A, Nov, by="cust", all.x=T)
A = merge(A, Dec, by="cust", all.x=T)
A = merge(A, Jan, by="cust", all.x=T)
A
```

- - -

##### 用平均值填補NA
```{r}
for(i in 15:24){
  mean_col <- mean(A[, i], na.rm = T)  # mean of col ith
  na.rows <- is.na(A[, i])   #col ith na data
  A[na.rows, i] <- mean_col
}


```
##### <span id='A1'> 圖片 </span> 
![Figure  - 填補NA](fig/complete_na.png)




- - -

##### 製作新變數(3)

```{r}
A$amount_total=A$amount_nov+A$amount_dec+A$amount_m1
A$gross_total=A$gross_nov+A$gross_dec+A$gross_m1
A$items_total=A$items_nov+A$items_dec+A$items_m1
A$pieces_total=A$pieces_nov+A$pieces_dec+A$pieces_m1
```



### <span id='Adjust_x'>調整變數</span>
+ P(y=1)=1/1+e^ -(B0+B1X1+B2X2+B3X3+...+BkXk)
![Figure  - AUC](fig/AUC.png)




```{r}


A$f_itemtotal=A$f*A$items_total
A$f_amounttotal=A$f*A$amount_total
A$f2=A$f^4*A$m^4
A$f3=A$r^4
A$f4=A$s^4
```

##### 切割TR與TS
```{r}
TR=subset(A,spl)
TS=subset(A,!spl)

```

- - -

##### 用新變數來建立模型(glm)
```{r}
cx=c(2:9,11,14,23,27,29,31,32,33,34)
colnames(TR[,cx])
glm1 = glm(buy ~ ., TR[,cx], family=binomial()) 
#summary(glm1)
pred =  predict(glm1, TS, type="response")
```

- - -

##### glm1的Accuracy及AUC
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   #0.7017257     
colAUC(pred, TS$buy)                                
#0.7579886
```



- - -


### <span id='select_x'>用step自動挑選變數</span>
```{r}
glm1_step=step(glm1,direction = 'backward')
pred =  predict(glm1_step, TS, type="response")
```


##### glm1_step的Accuracy及AUC
```{r}

cm = table(actual = TS$buy, predict = pred > 0.5); cm
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts     #0.7028918     
colAUC(pred, TS$buy)                                
#0.7581489
```





##### CV: glm

```{r}
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
  buy ~ ., data=TR[,cx], method="glm", 
  trControl=ctrl, metric="ROC")
Sys.time() - t0
cv.glm$results
##### glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)

```







- - -










<br><br><br><br><hr><br><br><br>

<style>
.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #008800;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #ffffe0;
  line-height: 2;
  font-weight: bold;
}
h6{
  color: #006000;
  background: #00ffff;
  line-height: 2;
  font-weight: bold;

}
em{
  color: #FFEA6C;
  background: #7D7D7D;
  }
  
table, th, td {
    border: 1px solid black;
}


#mySidenav a {
    position: absolute;
    left: -150px;
    transition: 0.3s;
    padding: 15px;
    width: 150px;
    text-decoration: none;
    font-size: 20px;
    color: white;
    border-radius: 0 5px 5px 0;
}
#mySidenav{
    top:-10px;
    position: fixed;
}
#mySidenav a:hover {
    left: -20px;
}

#about {
    top: 20px;
    background-color: #4CAF50;
}

#blog {
    top: 80px;
    background-color: #2196F3;
}

#projects {
    top: 140px;
    background-color: #f44336;
}

#contact {
    top: 200px;
    background-color: #555
}

</style>