Preparing The Predictors (X)

rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("~/Desktop/2019R語言商業實務/unit15/data/tf0.rdata")
###製作副檔
A <- A0
X <- X0
Z <- Z0

###增加變數

Z = subset(Z, date < as.Date("2001-02-01"))    # 618212

Z$grossmargin = (Z$price-Z$cost)/Z$price ###新增毛利率欄位

max(X0$date)
## [1] "2001-02-28"
sum(X0$gross)
## [1] 15623911
X = group_by(Z, tid) %>% summarise(
 date = date[1],  # 交易日期
 cust = cust[1],  # 顧客 ID
 age = age[1],    # 顧客 年齡級別
 area = area[1],  # 顧客 居住區別
 items = n(),  # 交易項目(總)數
 pieces = sum(qty),          # 產品(總)件數
 total = sum(price),         # 交易(總)金額
 gross = sum(price - cost),   # 毛利
 month = months(date),        # 交易月份
 spec = sum(grossmargin*qty)/sum(qty), ###該筆訂單平均毛利率
 n_margin= sum(as.numeric(grossmargin<0)*qty)/sum(qty), ###該筆訂單負毛利率商品平均數量
 weekday = factor(weekdays(date),levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday","Saturday","Sunday")), ###星期數
 ncategory = n_distinct(cat),  # 幾種不同品類
 nproduct = n_distinct(prod)   # 幾樣不同商品
 ) %>% data.frame  ###88387


###刪除離群值
sapply(X[,c(6:9)], quantile, prob=c(.99, .999, .9995))
##        items pieces     total    gross
## 99%       33     47  5177.280  985.000
## 99.9%     56     84  9378.684 1883.228
## 99.95%    64     98 11261.751 2317.087
X = subset(X, items<=60 & pieces<=85 & total<= 9500)


nrow(X) ### 88387 -> 88228
## [1] 88228
summary(X)
##       tid             date                cust          
##  Min.   :    1   Min.   :2000-11-01   Length:88228      
##  1st Qu.:22080   1st Qu.:2000-11-23   Class :character  
##  Median :44176   Median :2000-12-12   Mode  :character  
##  Mean   :44180   Mean   :2000-12-15                     
##  3rd Qu.:66262   3rd Qu.:2001-01-12                     
##  Max.   :88387   Max.   :2001-01-31                     
##                                                         
##      age                area               items            pieces      
##  Length:88228       Length:88228       Min.   : 1.000   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 2.000   1st Qu.: 3.000  
##  Mode  :character   Mode  :character   Median : 5.000   Median : 6.000  
##                                        Mean   : 6.923   Mean   : 9.328  
##                                        3rd Qu.: 9.000   3rd Qu.:12.000  
##                                        Max.   :60.000   Max.   :85.000  
##                                                                         
##      total            gross            month                spec         
##  Min.   :   5.0   Min.   :-1645.0   Length:88228       Min.   :-1.93732  
##  1st Qu.: 229.8   1st Qu.:   23.0   Class :character   1st Qu.: 0.09302  
##  Median : 521.0   Median :   71.0   Mode  :character   Median : 0.16255  
##  Mean   : 872.0   Mean   :  135.2                      Mean   : 0.12339  
##  3rd Qu.:1115.0   3rd Qu.:  173.0                      3rd Qu.: 0.20932  
##  Max.   :9457.0   Max.   : 2570.0                      Max.   : 1.00000  
##                                                                          
##     n_margin           weekday        ncategory         nproduct     
##  Min.   :0.0000   Monday   :12606   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:0.0000   Tuesday  :11280   1st Qu.: 2.000   1st Qu.: 2.000  
##  Median :0.0000   Wednesday: 9893   Median : 4.000   Median : 5.000  
##  Mean   :0.1263   Thursday :11233   Mean   : 5.925   Mean   : 6.923  
##  3rd Qu.:0.1176   Friday   :10646   3rd Qu.: 8.000   3rd Qu.: 9.000  
##  Max.   :1.0000   Saturday :14582   Max.   :52.000   Max.   :60.000  
##                   Sunday   :17988
par(cex=0.8)
hist(X$date, "months", las=2, freq=T, xlab="", main="No. Transaction by Month")

n_distinct(X$cust)           # 顧客數 28569
## [1] 28569
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
  r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
  s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
  f = n(),            # frquency
  m = mean(total),    # monetary
  month1f = sum(month == "November"), 
  month2f = sum(month == "December"),
  month3f = sum(month == "January"),
  weeksum = sum(weekday!="Saturday" & weekday!="Sunday"),
  weekendsum = sum(weekday=="Saturday")+sum(weekday=="Sunday"),
  frise = isTRUE(month2f - month1f > 0 && month3f - month2f > 0),
  loyal = isTRUE(month1f > 0  &&  month2f > 0  &&  month3f>0),
  month1ft = ifelse(month1f>0,1,0),
  month2ft = ifelse(month2f>0,1,0),
  month3ft = ifelse(month3f>0,1,0),
  regular = isTRUE(month1ft+month2ft+month3ft >= 2),
  realspec = sum(spec*pieces)/sum(pieces),
  realn_margin = sum(n_margin*pieces)/sum(pieces),
  rev = sum(total),   # total revenue contribution
  raw = sum(gross),   # total gross profit contribution
  age = age[1],   # age group
  area =area[1], # area code
  mean_cat = mean(ncategory),
  mean_prod= mean(nproduct)
) %>% data.frame    ##28569 

nrow(A)
## [1] 28569
顧客資料摘要
summary(A)
##      cust                 r               s               f         
##  Length:28569       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.13   Mean   :61.27   Mean   : 3.088  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##        m             month1f          month2f           month3f      
##  Min.   :   8.0   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 359.2   1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median : 709.0   Median : 1.000   Median : 1.0000   Median : 1.000  
##  Mean   :1006.0   Mean   : 1.112   Mean   : 0.9338   Mean   : 1.042  
##  3rd Qu.:1313.0   3rd Qu.: 1.000   3rd Qu.: 1.0000   3rd Qu.: 1.000  
##  Max.   :9381.0   Max.   :26.000   Max.   :21.0000   Max.   :25.000  
##     weeksum         weekendsum      frise           loyal        
##  Min.   : 0.000   Min.   : 0.00   Mode :logical   Mode :logical  
##  1st Qu.: 1.000   1st Qu.: 0.00   FALSE:27710     FALSE:22594    
##  Median : 1.000   Median : 1.00   TRUE :859       TRUE :5975     
##  Mean   : 1.948   Mean   : 1.14                                  
##  3rd Qu.: 2.000   3rd Qu.: 1.00                                  
##  Max.   :44.000   Max.   :18.00                                  
##     month1ft         month2ft         month3ft       regular       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Mode :logical  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   FALSE:14429    
##  Median :1.0000   Median :1.0000   Median :1.0000   TRUE :14140    
##  Mean   :0.5859   Mean   :0.5395   Mean   :0.5787                  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000                  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000                  
##     realspec         realn_margin          rev             raw         
##  Min.   :-1.93732   Min.   :0.00000   Min.   :    8   Min.   : -742.0  
##  1st Qu.: 0.09705   1st Qu.:0.00000   1st Qu.:  638   1st Qu.:   70.0  
##  Median : 0.16058   Median :0.03333   Median : 1564   Median :  218.0  
##  Mean   : 0.12557   Mean   :0.12831   Mean   : 2693   Mean   :  417.4  
##  3rd Qu.: 0.20158   3rd Qu.:0.15152   3rd Qu.: 3415   3rd Qu.:  533.0  
##  Max.   : 0.47778   Max.   :1.00000   Max.   :99597   Max.   :15565.0  
##      age                area              mean_cat        mean_prod     
##  Length:28569       Length:28569       Min.   : 1.000   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 3.000   1st Qu.: 3.000  
##  Mode  :character   Mode  :character   Median : 5.000   Median : 6.000  
##                                        Mean   : 6.592   Mean   : 7.731  
##                                        3rd Qu.: 8.667   3rd Qu.:10.000  
##                                        Max.   :49.000   Max.   :60.000
A$regular = ifelse(A$regular == "TRUE",1,0) 
set.seed(111)
A$grp = kmeans(scale(A[,c(2,5,16,17)]),4)$cluster
table(A$grp)  # 族群大小
## 
##     1     2     3     4 
##  1321  2767 11488 12993
aaa=scale(A[,c(4,5,16,17)])%>%data.frame %>% setNames(c("recency","money","regular","grossmargin"))
split(aaa,A$grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(4),main="groupbehavior")
legend('bottomright',legend=colnames(aaa),fill=rainbow(4))

A %>% ggplot(aes(x=m, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

A %>% ggplot(aes(x=regular, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

A %>% ggplot(aes(x=age, y=m, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

A %>% ggplot(aes(x=f, y=m, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

#group_by(A, grp) %>% summarise(
 # r=mean(r), 
#  f=mean(f), 
#  m=mean(m), 
#  size=n() ) %>% 
#  mutate( revenue = size*m/1000 )  %>% 
#  filter(size > 1) %>% 
#  ggplot(aes(x=f, y=m)) +
#  geom_point(aes(size=revenue, col=r),alpha=0.5) +
#  scale_size(range=c(4,30)) +
#  scale_color_gradient(low="green",high="red") +
#  scale_x_log10() + scale_y_log10(limits=c(30,3000)) + 
#  geom_text(aes(label = size ),size=3) +
#  theme_bw() + guides(size=F) +
#  labs(title="Customer Segements",
#       subtitle="(bubble_size:revenue_contribution; text:group_size)",
#       color="Recency") +
#  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
##製作目標變數 amount 購買金額
feb01 = as.Date("2001-02-01")
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
### 製作目標變數 buy 是否有購買 
A$buy = !is.na(A$amount) ### amount 不為0 
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28569       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.13   Mean   :61.27   Mean   : 3.088  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m             month1f          month2f           month3f      
##  Min.   :   8.0   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 359.2   1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median : 709.0   Median : 1.000   Median : 1.0000   Median : 1.000  
##  Mean   :1006.0   Mean   : 1.112   Mean   : 0.9338   Mean   : 1.042  
##  3rd Qu.:1313.0   3rd Qu.: 1.000   3rd Qu.: 1.0000   3rd Qu.: 1.000  
##  Max.   :9381.0   Max.   :26.000   Max.   :21.0000   Max.   :25.000  
##                                                                      
##     weeksum         weekendsum      frise           loyal        
##  Min.   : 0.000   Min.   : 0.00   Mode :logical   Mode :logical  
##  1st Qu.: 1.000   1st Qu.: 0.00   FALSE:27710     FALSE:22594    
##  Median : 1.000   Median : 1.00   TRUE :859       TRUE :5975     
##  Mean   : 1.948   Mean   : 1.14                                  
##  3rd Qu.: 2.000   3rd Qu.: 1.00                                  
##  Max.   :44.000   Max.   :18.00                                  
##                                                                  
##     month1ft         month2ft         month3ft         regular      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.5859   Mean   :0.5395   Mean   :0.5787   Mean   :0.4949  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##     realspec         realn_margin          rev             raw         
##  Min.   :-1.93732   Min.   :0.00000   Min.   :    8   Min.   : -742.0  
##  1st Qu.: 0.09705   1st Qu.:0.00000   1st Qu.:  638   1st Qu.:   70.0  
##  Median : 0.16058   Median :0.03333   Median : 1564   Median :  218.0  
##  Mean   : 0.12557   Mean   :0.12831   Mean   : 2693   Mean   :  417.4  
##  3rd Qu.: 0.20158   3rd Qu.:0.15152   3rd Qu.: 3415   3rd Qu.:  533.0  
##  Max.   : 0.47778   Max.   :1.00000   Max.   :99597   Max.   :15565.0  
##                                                                        
##      age                area              mean_cat        mean_prod     
##  Length:28569       Length:28569       Min.   : 1.000   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 3.000   1st Qu.: 3.000  
##  Mode  :character   Mode  :character   Median : 5.000   Median : 6.000  
##                                        Mean   : 6.592   Mean   : 7.731  
##                                        3rd Qu.: 8.667   3rd Qu.:10.000  
##                                        Max.   :49.000   Max.   :60.000  
##                                                                         
##       grp            amount         buy         
##  Min.   :1.000   Min.   :    8   Mode :logical  
##  1st Qu.:3.000   1st Qu.:  454   FALSE:15330    
##  Median :3.000   Median :  993   TRUE :13239    
##  Mean   :3.265   Mean   : 1498                  
##  3rd Qu.:4.000   3rd Qu.: 1955                  
##  Max.   :4.000   Max.   :28089                  
##                  NA's   :15330
Contest Dataset
###類別模型 預測會不會購買
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28569 19998  8571
        ## Training  ##Testing
cbind(A, spl) %>% filter(buy) %>% ###將剛剛SPL分完的資料框併回去,然後挑選有購買的(buy=T)
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

###畫圖觀察,分完後的資料中,購買的比例是不是差不多,因為這樣模型會比較準
##數量模型,預測買多少錢,從A資料中找出有購買的人
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10) ##將A中有購買的分割出來,後面的向量裡變數取log
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13239  9267  3972
            ##Training ##Testing
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

###畫圖觀察,分完後的資料中,購買的比例是不是差不多,因為這樣模型會比較準
Spliting for Classification
str(A$grp)
##  int [1:28569] 4 4 3 4 2 2 4 3 3 3 ...
A$grp <- as.factor(A$grp)
TR = subset(A, spl)
TS = subset(A, !spl)

Classification Model

###邏輯式 預測會不會來買
glm1 = glm(buy ~ ., TR[,c(2:4,6:10,11,12,16:18,20:23,24,25,27)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:4, 
##     6:10, 11, 12, 16:18, 20:23, 24, 25, 27)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6319  -0.8690  -0.6903   1.0164   1.9024  
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.927e+00  1.701e-01 -11.330  < 2e-16 ***
## r            -4.444e-03  1.621e-03  -2.742 0.006116 ** 
## s             5.346e-03  1.564e-03   3.417 0.000632 ***
## f             4.354e-01  3.311e-02  13.148  < 2e-16 ***
## month1f      -1.812e-01  3.753e-02  -4.827 1.38e-06 ***
## month2f      -1.185e-01  3.269e-02  -3.626 0.000288 ***
## month3f              NA         NA      NA       NA    
## weeksum      -2.805e-02  1.904e-02  -1.473 0.140756    
## weekendsum           NA         NA      NA       NA    
## friseTRUE     1.281e-01  1.150e-01   1.114 0.265099    
## loyalTRUE     1.449e-01  6.240e-02   2.322 0.020227 *  
## regular       4.762e-01  1.095e-01   4.350 1.36e-05 ***
## realspec     -6.979e-01  2.286e-01  -3.053 0.002264 ** 
## realn_margin  1.570e-01  1.267e-01   1.239 0.215168    
## raw          -1.351e-04  5.328e-05  -2.536 0.011200 *  
## agea25        5.448e-02  8.749e-02   0.623 0.533505    
## agea30        1.205e-01  8.035e-02   1.500 0.133658    
## agea35        1.588e-01  7.985e-02   1.989 0.046749 *  
## agea40        1.519e-01  8.223e-02   1.847 0.064725 .  
## agea45        7.359e-02  8.529e-02   0.863 0.388254    
## agea50        5.920e-02  9.398e-02   0.630 0.528698    
## agea55        2.095e-01  1.107e-01   1.893 0.058396 .  
## agea60        1.705e-01  1.181e-01   1.444 0.148707    
## agea65        3.244e-01  1.049e-01   3.092 0.001988 ** 
## agena        -1.167e-01  1.476e-01  -0.791 0.429223    
## areaz106      1.086e-01  1.317e-01   0.825 0.409565    
## areaz110     -4.689e-02  1.045e-01  -0.449 0.653747    
## areaz114      1.323e-01  1.120e-01   1.181 0.237640    
## areaz115      3.942e-01  9.707e-02   4.061 4.88e-05 ***
## areaz221      2.859e-01  9.786e-02   2.921 0.003486 ** 
## areazOthers   3.943e-02  1.048e-01   0.376 0.706779    
## areazUnknown  3.557e-02  1.232e-01   0.289 0.772844    
## mean_cat     -1.359e-02  1.516e-02  -0.897 0.369767    
## mean_prod     2.251e-02  1.237e-02   1.820 0.068711 .  
## grp2          1.044e-01  1.323e-01   0.789 0.430301    
## grp3          2.682e-01  1.209e-01   2.218 0.026562 *  
## grp4          9.351e-02  1.339e-01   0.698 0.485109    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27616  on 19997  degrees of freedom
## Residual deviance: 23140  on 19963  degrees of freedom
## AIC: 23210
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
cm = table(actual = TS$buy, predict = pred > 0.5); cm  # 混淆舉陣
##        predict
## actual  FALSE TRUE
##   FALSE  3641  958
##   TRUE   1617 2355
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998 -> 0.699
## [1] 0.6995683
colAUC(pred, TS$buy)                                   # 0.7556 -> 0.7570
##                     [,1]
## FALSE vs. TRUE 0.7566233

Regression Model

###迴歸式 預測數量(金額) 
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
### 先選出實際有購買的 這些才有金額
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ .- weeksum - weekendsum - area - frise -month3f, TR2[,c(2:12,16:26)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ . - weeksum - weekendsum - area - frise - 
##     month3f, data = TR2[, c(2:12, 16:26)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9607 -0.2279  0.0487  0.2835  1.4442 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.507e+00  5.777e-02  26.088  < 2e-16 ***
## r             8.369e-04  4.295e-04   1.948 0.051395 .  
## s            -9.097e-05  4.331e-04  -0.210 0.833652    
## f             2.899e-02  4.040e-03   7.176 7.72e-13 ***
## m             3.438e-01  4.417e-02   7.783 7.85e-15 ***
## month1f      -1.599e-02  5.613e-03  -2.848 0.004410 ** 
## month2f      -1.475e-02  5.939e-03  -2.484 0.013003 *  
## loyalTRUE     4.955e-02  1.477e-02   3.356 0.000795 ***
## regular      -1.731e-02  3.235e-02  -0.535 0.592697    
## realspec      1.296e-01  6.961e-02   1.862 0.062649 .  
## realn_margin  9.525e-03  3.994e-02   0.238 0.811510    
## rev           2.906e-02  4.012e-02   0.724 0.468917    
## raw           7.533e-05  9.963e-06   7.561 4.40e-14 ***
## agea25        3.549e-02  2.535e-02   1.400 0.161532    
## agea30        7.240e-02  2.336e-02   3.099 0.001948 ** 
## agea35        9.753e-02  2.304e-02   4.233 2.33e-05 ***
## agea40        7.649e-02  2.361e-02   3.239 0.001202 ** 
## agea45        7.277e-02  2.442e-02   2.980 0.002889 ** 
## agea50        5.808e-02  2.670e-02   2.175 0.029663 *  
## agea55        4.993e-02  3.165e-02   1.578 0.114704    
## agea60        3.013e-02  3.303e-02   0.912 0.361652    
## agea65       -7.018e-02  2.847e-02  -2.465 0.013727 *  
## agena         5.134e-02  3.925e-02   1.308 0.190906    
## mean_cat      1.302e-02  5.082e-03   2.562 0.010431 *  
## mean_prod    -1.938e-03  4.114e-03  -0.471 0.637641    
## grp2          4.985e-02  4.166e-02   1.196 0.231544    
## grp3          8.734e-02  3.797e-02   2.300 0.021458 *  
## grp4          8.872e-02  3.865e-02   2.295 0.021745 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4249 on 9239 degrees of freedom
## Multiple R-squared:  0.2897, Adjusted R-squared:  0.2876 
## F-statistic: 139.6 on 27 and 9239 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq  ####取出R-square
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)  ###0.2897053 0.3089972
## [1] 0.2897053 0.3089972

Prediction

####用前三個月資料來預測未來一個月
Z1 = subset(Z0, date >= as.Date("2000-12-01")) 

Z1$grossmargin = (Z1$price-Z1$cost)/Z1$price ###毛利率

X1 = group_by(Z1, tid) %>% summarise(
 date = date[1],  # 交易日期
 cust = cust[1],  # 顧客 ID
 age = age[1],    # 顧客 年齡級別
 area = area[1],  # 顧客 居住區別
 items = n(),  # 交易項目(總)數
 pieces = sum(qty),          # 產品(總)件數
 total = sum(price),         # 交易(總)金額
 gross = sum(price - cost),   # 毛利
 month = months(date), ##交易月份
 spec = sum(grossmargin*qty)/sum(qty), ###該筆訂單平均毛利率
 n_margin= sum(as.numeric(grossmargin<0)*qty)/sum(qty), ###該筆訂單負毛利率商品平均數量
 weekday = factor(weekdays(date),levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday","Saturday","Sunday")), ###星期數
 ncategory = n_distinct(cat),
 nproduct = n_distinct(prod)
 ) %>% data.frame  ###87595


sapply(X1[,c(6:9)], quantile, prob=c(.99, .999, .9995))
##        items pieces    total    gross
## 99%       33 46.000  4927.00  983.060
## 99.9%     54 83.000  8783.12 1873.406
## 99.95%    63 96.203 10323.81 2211.609
X1 = subset(X1, items<=60 & pieces<=85 & total<= 9000)

nrow(X1) ### 87595 -> 87452
## [1] 87452
d0 = max(X1$date)
A1 = group_by(X1, cust) %>% summarise(
  r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
  s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
  f = n(),            # frquency
  m = mean(total),    # monetary
  month1f = sum(month == "December"),
  month2f = sum(month == "January"),
  month3f = sum(month == "February"),
  weeksum = sum(weekday!="Saturday" & weekday!="Sunday"),
  weekendsum = sum(weekday=="Saturday")+sum(weekday=="Sunday"),
  frise = isTRUE(month2f - month1f > 0 && month3f - month2f > 0),
  loyal = isTRUE(month1f > 0  &&  month2f > 0  &&  month3f>0),
  month1ft = ifelse(month1f>0,1,0),
  month2ft = ifelse(month2f>0,1,0),
  month3ft = ifelse(month3f>0,1,0),
  regular = isTRUE(month1ft+month2ft+month3ft >= 2),
  realspec = sum(spec*pieces)/sum(pieces),
  realn_margin = sum(n_margin*pieces)/sum(pieces),
  rev = sum(total),   # total revenue contribution
  raw = sum(gross),   # total gross profit contribution
  age = age[1],   # age group
  area =area[1], # area code
  mean_cat = mean(ncategory),
  mean_prod= mean(nproduct)
) %>% data.frame    ##28516 

nrow(A1)
## [1] 28516
set.seed(111)
A1$grp = kmeans(scale(A1[,c(2,5,16,17)]),4)$cluster
table(A1$grp)  # 族群大小
## 
##     1     2     3     4 
##  5211  1632  7994 13679
aaa=scale(A[,c(4,5,16,17)])%>%data.frame %>% setNames(c("recency","money","regular","grossmargin"))
split(aaa,A$grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(4),main="groupbehavior")
legend('bottomright',legend=colnames(aaa),fill=rainbow(4))

A1 %>% ggplot(aes(x=m, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

A1 %>% ggplot(aes(x=regular, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

A1$grp <- as.factor(A1$grp)
A1$regular <- ifelse(A1$regular == "TRUE",1,0)
A1$Buy = predict(glm1, A1[,c(2:4,6:10,11,12,16:18,20:23,24,25)], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
A12 = A1 %>% mutate_at(c("m","rev"), log10)
A1$Rev = 10 ^ predict(lm1, A12[,c(2:12,16:26)])
par(mfrow=c(1,2), cex=0.8)
hist(A1$Buy)
hist(log(A1$Rev,10))

###觀察各群
subset(A1, grp == "1") -> grp1
subset(A1, grp == "2") -> grp2
subset(A1, grp == "3") -> grp3
subset(A1, grp == "4") -> grp4

A1 %>% ggplot(aes(x=age)) + geom_histogram(stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

p0 = par(cex=0.8, mfrow=c(2,2))

mean(grp1$m)
## [1] 905.6921
mean(grp2$m)
## [1] 219.2428
mean(grp3$m)
## [1] 1264.983
mean(grp4$m)
## [1] 930.3219
mean(grp1$mean_cat)
## [1] 6.279644
mean(grp2$mean_cat)
## [1] 1.916302
mean(grp3$mean_cat)
## [1] 7.751703
mean(grp4$mean_cat)
## [1] 6.270096
mean(grp1$mean_prod)
## [1] 7.346831
mean(grp2$mean_prod)
## [1] 2.057235
mean(grp3$mean_prod)
## [1] 9.200075
mean(grp4$mean_prod)
## [1] 7.340771
A1 %>% ggplot(aes(x=as.factor(grp), y=mean_cat)) + geom_boxplot()