rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("~/Desktop/2019R語言商業實務/unit15/data/tf0.rdata")###製作副檔
A <- A0
X <- X0
Z <- Z0
###增加變數
Z = subset(Z, date < as.Date("2001-02-01")) # 618212
Z$grossmargin = (Z$price-Z$cost)/Z$price ###新增毛利率欄位
max(X0$date)## [1] "2001-02-28"
sum(X0$gross)## [1] 15623911
spec = 單筆交易總毛利率 / 該筆交易總商品數量
n_margin = 單筆交易商品毛利率 <0 的總數 / 該筆交易總商品數量
X = group_by(Z, tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost), # 毛利
month = months(date), # 交易月份
spec = sum(grossmargin*qty)/sum(qty), ###該筆訂單平均毛利率
n_margin= sum(as.numeric(grossmargin<0)*qty)/sum(qty), ###該筆訂單負毛利率商品平均數量
weekday = factor(weekdays(date),levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday","Saturday","Sunday")), ###星期數
ncategory = n_distinct(cat), # 幾種不同品類
nproduct = n_distinct(prod) # 幾樣不同商品
) %>% data.frame ###88387
###刪除離群值
sapply(X[,c(6:9)], quantile, prob=c(.99, .999, .9995))## items pieces total gross
## 99% 33 47 5177.280 985.000
## 99.9% 56 84 9378.684 1883.228
## 99.95% 64 98 11261.751 2317.087
X = subset(X, items<=60 & pieces<=85 & total<= 9500)
nrow(X) ### 88387 -> 88228## [1] 88228
summary(X)## tid date cust
## Min. : 1 Min. :2000-11-01 Length:88228
## 1st Qu.:22080 1st Qu.:2000-11-23 Class :character
## Median :44176 Median :2000-12-12 Mode :character
## Mean :44180 Mean :2000-12-15
## 3rd Qu.:66262 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
##
## age area items pieces
## Length:88228 Length:88228 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.923 Mean : 9.328
## 3rd Qu.: 9.000 3rd Qu.:12.000
## Max. :60.000 Max. :85.000
##
## total gross month spec
## Min. : 5.0 Min. :-1645.0 Length:88228 Min. :-1.93732
## 1st Qu.: 229.8 1st Qu.: 23.0 Class :character 1st Qu.: 0.09302
## Median : 521.0 Median : 71.0 Mode :character Median : 0.16255
## Mean : 872.0 Mean : 135.2 Mean : 0.12339
## 3rd Qu.:1115.0 3rd Qu.: 173.0 3rd Qu.: 0.20932
## Max. :9457.0 Max. : 2570.0 Max. : 1.00000
##
## n_margin weekday ncategory nproduct
## Min. :0.0000 Monday :12606 Min. : 1.000 Min. : 1.000
## 1st Qu.:0.0000 Tuesday :11280 1st Qu.: 2.000 1st Qu.: 2.000
## Median :0.0000 Wednesday: 9893 Median : 4.000 Median : 5.000
## Mean :0.1263 Thursday :11233 Mean : 5.925 Mean : 6.923
## 3rd Qu.:0.1176 Friday :10646 3rd Qu.: 8.000 3rd Qu.: 9.000
## Max. :1.0000 Saturday :14582 Max. :52.000 Max. :60.000
## Sunday :17988
par(cex=0.8)
hist(X$date, "months", las=2, freq=T, xlab="", main="No. Transaction by Month")n_distinct(X$cust) # 顧客數 28569## [1] 28569
month1f , month2f , month3f : 為十一月至一月,該顧客各月份的交易次數。
frise = ( 十一月至十二月交易次數上升 ) && ( 十二月至一月交易次數上升 )
weeksum : 單一顧客於週間來店的次數總和。
weekendsum : 單一顧客於週末來店的次數總和。
realspec = 單筆交易 spec * 單筆交易商品總數量 / 交易之總商品數量
realpoorbitch = 單筆交易 poorbitch * 單筆交易商品總數量 / 交易之總商品數量
mean_prod : 平均每次購買幾樣不重複商品
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
month1f = sum(month == "November"),
month2f = sum(month == "December"),
month3f = sum(month == "January"),
weeksum = sum(weekday!="Saturday" & weekday!="Sunday"),
weekendsum = sum(weekday=="Saturday")+sum(weekday=="Sunday"),
frise = isTRUE(month2f - month1f > 0 && month3f - month2f > 0),
loyal = isTRUE(month1f > 0 && month2f > 0 && month3f>0),
month1ft = ifelse(month1f>0,1,0),
month2ft = ifelse(month2f>0,1,0),
month3ft = ifelse(month3f>0,1,0),
regular = isTRUE(month1ft+month2ft+month3ft >= 2),
realspec = sum(spec*pieces)/sum(pieces),
realn_margin = sum(n_margin*pieces)/sum(pieces),
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area =area[1], # area code
mean_cat = mean(ncategory),
mean_prod= mean(nproduct)
) %>% data.frame ##28569
nrow(A)## [1] 28569
summary(A)## cust r s f
## Length:28569 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.13 Mean :61.27 Mean : 3.088
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
## m month1f month2f month3f
## Min. : 8.0 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 359.2 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 709.0 Median : 1.000 Median : 1.0000 Median : 1.000
## Mean :1006.0 Mean : 1.112 Mean : 0.9338 Mean : 1.042
## 3rd Qu.:1313.0 3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 1.000
## Max. :9381.0 Max. :26.000 Max. :21.0000 Max. :25.000
## weeksum weekendsum frise loyal
## Min. : 0.000 Min. : 0.00 Mode :logical Mode :logical
## 1st Qu.: 1.000 1st Qu.: 0.00 FALSE:27710 FALSE:22594
## Median : 1.000 Median : 1.00 TRUE :859 TRUE :5975
## Mean : 1.948 Mean : 1.14
## 3rd Qu.: 2.000 3rd Qu.: 1.00
## Max. :44.000 Max. :18.00
## month1ft month2ft month3ft regular
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Mode :logical
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 FALSE:14429
## Median :1.0000 Median :1.0000 Median :1.0000 TRUE :14140
## Mean :0.5859 Mean :0.5395 Mean :0.5787
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## realspec realn_margin rev raw
## Min. :-1.93732 Min. :0.00000 Min. : 8 Min. : -742.0
## 1st Qu.: 0.09705 1st Qu.:0.00000 1st Qu.: 638 1st Qu.: 70.0
## Median : 0.16058 Median :0.03333 Median : 1564 Median : 218.0
## Mean : 0.12557 Mean :0.12831 Mean : 2693 Mean : 417.4
## 3rd Qu.: 0.20158 3rd Qu.:0.15152 3rd Qu.: 3415 3rd Qu.: 533.0
## Max. : 0.47778 Max. :1.00000 Max. :99597 Max. :15565.0
## age area mean_cat mean_prod
## Length:28569 Length:28569 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 3.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.592 Mean : 7.731
## 3rd Qu.: 8.667 3rd Qu.:10.000
## Max. :49.000 Max. :60.000
A$regular = ifelse(A$regular == "TRUE",1,0) set.seed(111)
A$grp = kmeans(scale(A[,c(2,5,16,17)]),4)$cluster
table(A$grp) # 族群大小##
## 1 2 3 4
## 1321 2767 11488 12993
aaa=scale(A[,c(4,5,16,17)])%>%data.frame %>% setNames(c("recency","money","regular","grossmargin"))
split(aaa,A$grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(4),main="groupbehavior")
legend('bottomright',legend=colnames(aaa),fill=rainbow(4))A %>% ggplot(aes(x=m, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)A %>% ggplot(aes(x=regular, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)A %>% ggplot(aes(x=age, y=m, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)A %>% ggplot(aes(x=f, y=m, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)#group_by(A, grp) %>% summarise(
# r=mean(r),
# f=mean(f),
# m=mean(m),
# size=n() ) %>%
# mutate( revenue = size*m/1000 ) %>%
# filter(size > 1) %>%
# ggplot(aes(x=f, y=m)) +
# geom_point(aes(size=revenue, col=r),alpha=0.5) +
# scale_size(range=c(4,30)) +
# scale_color_gradient(low="green",high="red") +
# scale_x_log10() + scale_y_log10(limits=c(30,3000)) +
# geom_text(aes(label = size ),size=3) +
# theme_bw() + guides(size=F) +
# labs(title="Customer Segements",
# subtitle="(bubble_size:revenue_contribution; text:group_size)",
# color="Recency") +
# xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")##製作目標變數 amount 購買金額
feb01 = as.Date("2001-02-01")
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16900A$amountSimply a Left Joint
A = merge(A, feb, by="cust", all.x=T)A$buy### 製作目標變數 buy 是否有購買
A$buy = !is.na(A$amount) ### amount 不為0 summary(A)## cust r s f
## Length:28569 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.13 Mean :61.27 Mean : 3.088
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m month1f month2f month3f
## Min. : 8.0 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 359.2 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 709.0 Median : 1.000 Median : 1.0000 Median : 1.000
## Mean :1006.0 Mean : 1.112 Mean : 0.9338 Mean : 1.042
## 3rd Qu.:1313.0 3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 1.000
## Max. :9381.0 Max. :26.000 Max. :21.0000 Max. :25.000
##
## weeksum weekendsum frise loyal
## Min. : 0.000 Min. : 0.00 Mode :logical Mode :logical
## 1st Qu.: 1.000 1st Qu.: 0.00 FALSE:27710 FALSE:22594
## Median : 1.000 Median : 1.00 TRUE :859 TRUE :5975
## Mean : 1.948 Mean : 1.14
## 3rd Qu.: 2.000 3rd Qu.: 1.00
## Max. :44.000 Max. :18.00
##
## month1ft month2ft month3ft regular
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5859 Mean :0.5395 Mean :0.5787 Mean :0.4949
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## realspec realn_margin rev raw
## Min. :-1.93732 Min. :0.00000 Min. : 8 Min. : -742.0
## 1st Qu.: 0.09705 1st Qu.:0.00000 1st Qu.: 638 1st Qu.: 70.0
## Median : 0.16058 Median :0.03333 Median : 1564 Median : 218.0
## Mean : 0.12557 Mean :0.12831 Mean : 2693 Mean : 417.4
## 3rd Qu.: 0.20158 3rd Qu.:0.15152 3rd Qu.: 3415 3rd Qu.: 533.0
## Max. : 0.47778 Max. :1.00000 Max. :99597 Max. :15565.0
##
## age area mean_cat mean_prod
## Length:28569 Length:28569 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 3.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.592 Mean : 7.731
## 3rd Qu.: 8.667 3rd Qu.:10.000
## Max. :49.000 Max. :60.000
##
## grp amount buy
## Min. :1.000 Min. : 8 Mode :logical
## 1st Qu.:3.000 1st Qu.: 454 FALSE:15330
## Median :3.000 Median : 993 TRUE :13239
## Mean :3.265 Mean : 1498
## 3rd Qu.:4.000 3rd Qu.: 1955
## Max. :4.000 Max. :28089
## NA's :15330
###類別模型 預測會不會購買
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))## [1] 28569 19998 8571
## Training ##Testingcbind(A, spl) %>% filter(buy) %>% ###將剛剛SPL分完的資料框併回去,然後挑選有購買的(buy=T)
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)###畫圖觀察,分完後的資料中,購買的比例是不是差不多,因為這樣模型會比較準##數量模型,預測買多少錢,從A資料中找出有購買的人
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10) ##將A中有購買的分割出來,後面的向量裡變數取log
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))## [1] 13239 9267 3972
##Training ##Testingcbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)###畫圖觀察,分完後的資料中,購買的比例是不是差不多,因為這樣模型會比較準str(A$grp)## int [1:28569] 4 4 3 4 2 2 4 3 3 3 ...
A$grp <- as.factor(A$grp)
TR = subset(A, spl)
TS = subset(A, !spl)###邏輯式 預測會不會來買
glm1 = glm(buy ~ ., TR[,c(2:4,6:10,11,12,16:18,20:23,24,25,27)], family=binomial())
summary(glm1)##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:4,
## 6:10, 11, 12, 16:18, 20:23, 24, 25, 27)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6319 -0.8690 -0.6903 1.0164 1.9024
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.927e+00 1.701e-01 -11.330 < 2e-16 ***
## r -4.444e-03 1.621e-03 -2.742 0.006116 **
## s 5.346e-03 1.564e-03 3.417 0.000632 ***
## f 4.354e-01 3.311e-02 13.148 < 2e-16 ***
## month1f -1.812e-01 3.753e-02 -4.827 1.38e-06 ***
## month2f -1.185e-01 3.269e-02 -3.626 0.000288 ***
## month3f NA NA NA NA
## weeksum -2.805e-02 1.904e-02 -1.473 0.140756
## weekendsum NA NA NA NA
## friseTRUE 1.281e-01 1.150e-01 1.114 0.265099
## loyalTRUE 1.449e-01 6.240e-02 2.322 0.020227 *
## regular 4.762e-01 1.095e-01 4.350 1.36e-05 ***
## realspec -6.979e-01 2.286e-01 -3.053 0.002264 **
## realn_margin 1.570e-01 1.267e-01 1.239 0.215168
## raw -1.351e-04 5.328e-05 -2.536 0.011200 *
## agea25 5.448e-02 8.749e-02 0.623 0.533505
## agea30 1.205e-01 8.035e-02 1.500 0.133658
## agea35 1.588e-01 7.985e-02 1.989 0.046749 *
## agea40 1.519e-01 8.223e-02 1.847 0.064725 .
## agea45 7.359e-02 8.529e-02 0.863 0.388254
## agea50 5.920e-02 9.398e-02 0.630 0.528698
## agea55 2.095e-01 1.107e-01 1.893 0.058396 .
## agea60 1.705e-01 1.181e-01 1.444 0.148707
## agea65 3.244e-01 1.049e-01 3.092 0.001988 **
## agena -1.167e-01 1.476e-01 -0.791 0.429223
## areaz106 1.086e-01 1.317e-01 0.825 0.409565
## areaz110 -4.689e-02 1.045e-01 -0.449 0.653747
## areaz114 1.323e-01 1.120e-01 1.181 0.237640
## areaz115 3.942e-01 9.707e-02 4.061 4.88e-05 ***
## areaz221 2.859e-01 9.786e-02 2.921 0.003486 **
## areazOthers 3.943e-02 1.048e-01 0.376 0.706779
## areazUnknown 3.557e-02 1.232e-01 0.289 0.772844
## mean_cat -1.359e-02 1.516e-02 -0.897 0.369767
## mean_prod 2.251e-02 1.237e-02 1.820 0.068711 .
## grp2 1.044e-01 1.323e-01 0.789 0.430301
## grp3 2.682e-01 1.209e-01 2.218 0.026562 *
## grp4 9.351e-02 1.339e-01 0.698 0.485109
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27616 on 19997 degrees of freedom
## Residual deviance: 23140 on 19963 degrees of freedom
## AIC: 23210
##
## Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
cm = table(actual = TS$buy, predict = pred > 0.5); cm # 混淆舉陣## predict
## actual FALSE TRUE
## FALSE 3641 958
## TRUE 1617 2355
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998 -> 0.699## [1] 0.6995683
colAUC(pred, TS$buy) # 0.7556 -> 0.7570## [,1]
## FALSE vs. TRUE 0.7566233
###迴歸式 預測數量(金額)
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
### 先選出實際有購買的 這些才有金額
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)lm1 = lm(amount ~ .- weeksum - weekendsum - area - frise -month3f, TR2[,c(2:12,16:26)])
summary(lm1)##
## Call:
## lm(formula = amount ~ . - weeksum - weekendsum - area - frise -
## month3f, data = TR2[, c(2:12, 16:26)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9607 -0.2279 0.0487 0.2835 1.4442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.507e+00 5.777e-02 26.088 < 2e-16 ***
## r 8.369e-04 4.295e-04 1.948 0.051395 .
## s -9.097e-05 4.331e-04 -0.210 0.833652
## f 2.899e-02 4.040e-03 7.176 7.72e-13 ***
## m 3.438e-01 4.417e-02 7.783 7.85e-15 ***
## month1f -1.599e-02 5.613e-03 -2.848 0.004410 **
## month2f -1.475e-02 5.939e-03 -2.484 0.013003 *
## loyalTRUE 4.955e-02 1.477e-02 3.356 0.000795 ***
## regular -1.731e-02 3.235e-02 -0.535 0.592697
## realspec 1.296e-01 6.961e-02 1.862 0.062649 .
## realn_margin 9.525e-03 3.994e-02 0.238 0.811510
## rev 2.906e-02 4.012e-02 0.724 0.468917
## raw 7.533e-05 9.963e-06 7.561 4.40e-14 ***
## agea25 3.549e-02 2.535e-02 1.400 0.161532
## agea30 7.240e-02 2.336e-02 3.099 0.001948 **
## agea35 9.753e-02 2.304e-02 4.233 2.33e-05 ***
## agea40 7.649e-02 2.361e-02 3.239 0.001202 **
## agea45 7.277e-02 2.442e-02 2.980 0.002889 **
## agea50 5.808e-02 2.670e-02 2.175 0.029663 *
## agea55 4.993e-02 3.165e-02 1.578 0.114704
## agea60 3.013e-02 3.303e-02 0.912 0.361652
## agea65 -7.018e-02 2.847e-02 -2.465 0.013727 *
## agena 5.134e-02 3.925e-02 1.308 0.190906
## mean_cat 1.302e-02 5.082e-03 2.562 0.010431 *
## mean_prod -1.938e-03 4.114e-03 -0.471 0.637641
## grp2 4.985e-02 4.166e-02 1.196 0.231544
## grp3 8.734e-02 3.797e-02 2.300 0.021458 *
## grp4 8.872e-02 3.865e-02 2.295 0.021745 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4249 on 9239 degrees of freedom
## Multiple R-squared: 0.2897, Adjusted R-squared: 0.2876
## F-statistic: 139.6 on 27 and 9239 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq ####取出R-square
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts) ###0.2897053 0.3089972## [1] 0.2897053 0.3089972
####用前三個月資料來預測未來一個月
Z1 = subset(Z0, date >= as.Date("2000-12-01"))
Z1$grossmargin = (Z1$price-Z1$cost)/Z1$price ###毛利率
X1 = group_by(Z1, tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost), # 毛利
month = months(date), ##交易月份
spec = sum(grossmargin*qty)/sum(qty), ###該筆訂單平均毛利率
n_margin= sum(as.numeric(grossmargin<0)*qty)/sum(qty), ###該筆訂單負毛利率商品平均數量
weekday = factor(weekdays(date),levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday","Saturday","Sunday")), ###星期數
ncategory = n_distinct(cat),
nproduct = n_distinct(prod)
) %>% data.frame ###87595
sapply(X1[,c(6:9)], quantile, prob=c(.99, .999, .9995))## items pieces total gross
## 99% 33 46.000 4927.00 983.060
## 99.9% 54 83.000 8783.12 1873.406
## 99.95% 63 96.203 10323.81 2211.609
X1 = subset(X1, items<=60 & pieces<=85 & total<= 9000)
nrow(X1) ### 87595 -> 87452## [1] 87452
d0 = max(X1$date)
A1 = group_by(X1, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
month1f = sum(month == "December"),
month2f = sum(month == "January"),
month3f = sum(month == "February"),
weeksum = sum(weekday!="Saturday" & weekday!="Sunday"),
weekendsum = sum(weekday=="Saturday")+sum(weekday=="Sunday"),
frise = isTRUE(month2f - month1f > 0 && month3f - month2f > 0),
loyal = isTRUE(month1f > 0 && month2f > 0 && month3f>0),
month1ft = ifelse(month1f>0,1,0),
month2ft = ifelse(month2f>0,1,0),
month3ft = ifelse(month3f>0,1,0),
regular = isTRUE(month1ft+month2ft+month3ft >= 2),
realspec = sum(spec*pieces)/sum(pieces),
realn_margin = sum(n_margin*pieces)/sum(pieces),
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area =area[1], # area code
mean_cat = mean(ncategory),
mean_prod= mean(nproduct)
) %>% data.frame ##28516
nrow(A1)## [1] 28516
set.seed(111)
A1$grp = kmeans(scale(A1[,c(2,5,16,17)]),4)$cluster
table(A1$grp) # 族群大小##
## 1 2 3 4
## 5211 1632 7994 13679
aaa=scale(A[,c(4,5,16,17)])%>%data.frame %>% setNames(c("recency","money","regular","grossmargin"))
split(aaa,A$grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(4),main="groupbehavior")
legend('bottomright',legend=colnames(aaa),fill=rainbow(4))A1 %>% ggplot(aes(x=m, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)A1 %>% ggplot(aes(x=regular, y=realspec, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)A1$grp <- as.factor(A1$grp)
A1$regular <- ifelse(A1$regular == "TRUE",1,0)
A1$Buy = predict(glm1, A1[,c(2:4,6:10,11,12,16:18,20:23,24,25)], type="response")## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
A12 = A1 %>% mutate_at(c("m","rev"), log10)
A1$Rev = 10 ^ predict(lm1, A12[,c(2:12,16:26)])par(mfrow=c(1,2), cex=0.8)
hist(A1$Buy)
hist(log(A1$Rev,10))###觀察各群
subset(A1, grp == "1") -> grp1
subset(A1, grp == "2") -> grp2
subset(A1, grp == "3") -> grp3
subset(A1, grp == "4") -> grp4
A1 %>% ggplot(aes(x=age)) + geom_histogram(stat = "count")## Warning: Ignoring unknown parameters: binwidth, bins, pad
p0 = par(cex=0.8, mfrow=c(2,2))
mean(grp1$m)## [1] 905.6921
mean(grp2$m)## [1] 219.2428
mean(grp3$m)## [1] 1264.983
mean(grp4$m)## [1] 930.3219
mean(grp1$mean_cat)## [1] 6.279644
mean(grp2$mean_cat)## [1] 1.916302
mean(grp3$mean_cat)## [1] 7.751703
mean(grp4$mean_cat)## [1] 6.270096
mean(grp1$mean_prod)## [1] 7.346831
mean(grp2$mean_prod)## [1] 2.057235
mean(grp3$mean_prod)## [1] 9.200075
mean(grp4$mean_prod)## [1] 7.340771
A1 %>% ggplot(aes(x=as.factor(grp), y=mean_cat)) + geom_boxplot()