Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
X = X %>% mutate(wday = format(date, "%w"))
wp=aggregate(X$pieces, by=list(Category=X$wday), FUN=sum)
wi=aggregate(X$items, by=list(Category=X$wday), FUN=sum)
barplot(wp$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #pieces與星期的關係
barplot(wi$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #items與星期的關係
RFM模型是由喬治·卡利南(George Cullinan)於1961年所提出,他發現資料庫分析中,有三項重要的指標:最近一次消費(Recency)、消費頻率(Frequency)、與消費金額(Monetary)
#新增rscore
A <- A %>% mutate(
rscore = cut(
A$r,
breaks = c(1,4,10,12,15,21,33,48,60,72,92),
include.lowest = T,
labels = c(10,9,8,7,6,5,4,3,2,1)
)
)
#新增fscore
A <- A %>% mutate(
fscore = cut(
A$f,
breaks = c(0,1,2,3,4,6,60),
include.lowest = T,
labels = c(1,5,7,8,9,10)
)
)
#新增mscore
A <- A %>% mutate(
mscore = cut(
A$m,
breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),
include.lowest = T,
labels = c(1,2,3,4,5,6,7,8,9,10)
)
)
A$days = (A$s - A$r) #第一次購買至最後一次購買之間隔天數
A$dayf = A$days/A$f #間隔天數/總購買次數=平均幾天會來一次
A$days越大代表此顧客越依賴本商店”之趨勢cor(A$f, A$days)
[1] 0.6410492
ggplot(A, aes(days, f))+ geom_point(col="#6495ED")+ geom_smooth(col="red")
A$dayf值越小代表顧客越依賴本商店A$days與A$dayf有高度相關,但在最後的迴歸模型中只放一個變數解釋力反而下降cor(A$dayf, A$days)
[1] 0.7566692
ggplot(A, aes(days, dayf))+ geom_point(col="#6495ED")
#在資料中新增pieces與items欄位
B = group_by(X, cust) %>% summarise(
pieces = sum(pieces),
items = sum(items)
) %>% data.frame
A = merge(A, B, by="cust", all.x=T)
A$days = (A$s - A$r)
A$dayf = A$days/A$f
#新增rscore
A <- A %>% mutate(
rscore = cut(
A$r,
breaks = c(1,4,10,12,15,21,33,48,60,72,92),
include.lowest = T,
labels = c(10,9,8,7,6,5,4,3,2,1)
)
)
#新增fscore
A <- A %>% mutate(
fscore = cut(
A$f,
breaks = c(0,1,2,3,4,6,60),
include.lowest = T,
labels = c(1,5,7,8,9,10)
)
)
#新增mscore
A <- A %>% mutate(
mscore = cut(
A$m,
breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),
include.lowest = T,
labels = c(1,2,3,4,5,6,7,8,9,10)
)
)
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
table(A$distance)
1 2 3 4 5 6 7
11202 7798 1778 802 783 3169 3052
TR = subset(A, spl)
TS = subset(A, !spl)
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
glm1 = glm(buy ~ ., TR[,c(6:9,11, 13:19)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(6:9,
11, 13:19)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7451 -0.8714 -0.6749 1.0136 1.9194
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.176e-01 1.397e-01 -5.854 4.81e-09 ***
rev 1.556e-05 1.923e-05 0.809 0.418457
raw -3.214e-04 8.629e-05 -3.725 0.000196 ***
ageB -1.310e-02 8.732e-02 -0.150 0.880719
ageC 3.914e-02 8.062e-02 0.485 0.627385
ageD 8.749e-02 7.996e-02 1.094 0.273879
ageE 9.347e-02 8.203e-02 1.139 0.254517
ageF 2.507e-02 8.533e-02 0.294 0.768962
ageG 2.766e-02 9.402e-02 0.294 0.768633
ageH 2.047e-01 1.100e-01 1.861 0.062784 .
ageI 7.415e-02 1.182e-01 0.627 0.530555
ageJ 2.794e-01 1.053e-01 2.653 0.007983 **
ageK -6.406e-02 1.477e-01 -0.434 0.664522
areaB -4.438e-02 1.333e-01 -0.333 0.739210
areaC -2.032e-01 1.054e-01 -1.928 0.053807 .
areaD 4.517e-02 1.121e-01 0.403 0.686875
areaE 2.571e-01 9.775e-02 2.630 0.008535 **
areaF 1.678e-01 9.856e-02 1.703 0.088607 .
areaG -3.724e-02 1.054e-01 -0.353 0.723774
areaH -1.393e-01 1.238e-01 -1.125 0.260544
rscore9 -2.687e-01 7.065e-02 -3.803 0.000143 ***
rscore8 -1.971e-01 7.934e-02 -2.485 0.012966 *
rscore7 -1.391e-01 8.156e-02 -1.706 0.087990 .
rscore6 -1.401e-01 7.609e-02 -1.841 0.065642 .
rscore5 -2.338e-01 7.577e-02 -3.086 0.002030 **
rscore4 -2.349e-01 7.875e-02 -2.983 0.002856 **
rscore3 -2.116e-01 7.983e-02 -2.651 0.008034 **
rscore2 -3.237e-01 8.234e-02 -3.931 8.46e-05 ***
rscore1 -4.411e-01 8.404e-02 -5.249 1.53e-07 ***
fscore5 6.633e-01 6.102e-02 10.870 < 2e-16 ***
fscore7 7.977e-01 7.319e-02 10.899 < 2e-16 ***
fscore8 8.323e-01 1.048e-01 7.939 2.04e-15 ***
fscore9 9.618e-01 1.345e-01 7.154 8.45e-13 ***
fscore10 1.543e+00 1.953e-01 7.899 2.81e-15 ***
mscore2 -1.606e-01 7.086e-02 -2.267 0.023393 *
mscore3 -1.726e-01 7.148e-02 -2.415 0.015752 *
mscore4 -6.104e-02 7.186e-02 -0.850 0.395593
mscore5 -1.585e-01 7.238e-02 -2.190 0.028531 *
mscore6 -1.809e-01 7.310e-02 -2.474 0.013346 *
mscore7 -1.461e-01 7.402e-02 -1.974 0.048380 *
mscore8 -2.478e-01 7.679e-02 -3.227 0.001249 **
mscore9 -1.981e-01 8.135e-02 -2.435 0.014885 *
mscore10 -1.848e-01 9.424e-02 -1.961 0.049890 *
days 2.407e-02 3.283e-03 7.333 2.25e-13 ***
dayf -4.939e-02 7.894e-03 -6.257 3.91e-10 ***
pieces 8.081e-05 2.264e-03 0.036 0.971531
items 8.322e-03 3.060e-03 2.719 0.006543 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23142 on 19961 degrees of freedom
AIC: 23236
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3785 818
TRUE 1691 2282
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
[1] 0.7074394
colAUC(pred, TS$buy) # 0.7556
[,1]
FALSE vs. TRUE 0.757608
lm1 = lm(amount ~ ., TR2[,c(6,8:10,13:19)])
summary(lm1)
Call:
lm(formula = amount ~ ., data = TR2[, c(6, 8:10, 13:19)])
Residuals:
Min 1Q Median 3Q Max
-1.80714 -0.22426 0.04862 0.27814 1.48097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.631e+00 9.407e-02 17.334 < 2e-16 ***
rev 3.531e-01 4.202e-02 8.405 < 2e-16 ***
ageB 7.134e-02 2.495e-02 2.859 0.004254 **
ageC 1.167e-01 2.291e-02 5.093 3.59e-07 ***
ageD 1.203e-01 2.261e-02 5.324 1.04e-07 ***
ageE 1.278e-01 2.312e-02 5.531 3.28e-08 ***
ageF 1.015e-01 2.412e-02 4.208 2.60e-05 ***
ageG 7.908e-02 2.632e-02 3.004 0.002672 **
ageH 7.037e-02 3.106e-02 2.266 0.023490 *
ageI 6.995e-02 3.190e-02 2.193 0.028368 *
ageJ -2.204e-02 2.808e-02 -0.785 0.432475
ageK 1.281e-01 3.918e-02 3.270 0.001080 **
areaB 8.429e-02 4.325e-02 1.949 0.051308 .
areaC 4.187e-02 3.512e-02 1.192 0.233216
areaD -7.878e-03 3.691e-02 -0.213 0.831004
areaE 1.126e-02 3.240e-02 0.348 0.728202
areaF 1.426e-02 3.263e-02 0.437 0.662191
areaG 2.662e-02 3.470e-02 0.767 0.442941
areaH 1.946e-02 3.861e-02 0.504 0.614165
rscore9 -2.676e-02 1.545e-02 -1.732 0.083269 .
rscore8 -2.423e-02 1.854e-02 -1.307 0.191197
rscore7 -1.277e-02 1.926e-02 -0.663 0.507213
rscore6 1.496e-02 1.860e-02 0.805 0.421076
rscore5 -1.845e-02 1.969e-02 -0.937 0.348916
rscore4 -2.125e-02 2.169e-02 -0.979 0.327408
rscore3 -1.233e-02 2.230e-02 -0.553 0.580421
rscore2 -2.325e-02 2.378e-02 -0.978 0.328263
rscore1 4.440e-03 2.514e-02 0.177 0.859820
fscore5 -1.020e-01 2.446e-02 -4.170 3.07e-05 ***
fscore7 -1.691e-01 2.747e-02 -6.154 7.87e-10 ***
fscore8 -2.114e-01 3.245e-02 -6.515 7.66e-11 ***
fscore9 -2.499e-01 3.716e-02 -6.725 1.86e-11 ***
fscore10 -2.712e-01 4.822e-02 -5.623 1.93e-08 ***
mscore2 5.566e-02 2.620e-02 2.124 0.033679 *
mscore3 5.299e-02 3.131e-02 1.692 0.090598 .
mscore4 9.256e-02 3.541e-02 2.614 0.008974 **
mscore5 1.200e-01 3.928e-02 3.054 0.002265 **
mscore6 1.198e-01 4.284e-02 2.796 0.005181 **
mscore7 1.420e-01 4.653e-02 3.052 0.002281 **
mscore8 1.551e-01 5.070e-02 3.058 0.002231 **
mscore9 1.896e-01 5.583e-02 3.397 0.000684 ***
mscore10 1.904e-01 6.499e-02 2.929 0.003408 **
days 1.838e-03 7.315e-04 2.513 0.011976 *
dayf -5.130e-03 1.907e-03 -2.690 0.007159 **
pieces 8.643e-05 4.220e-04 0.205 0.837746
items 2.101e-03 5.885e-04 3.570 0.000359 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4182 on 9223 degrees of freedom
Multiple R-squared: 0.3042, Adjusted R-squared: 0.3008
F-statistic: 89.62 on 45 and 9223 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3042416 0.2689460
library(caret)
library(doParallel)
clust = makeCluster(detectCores())
#顯示X個執行緒
registerDoParallel(clust); getDoParWorkers()
[1] 4
set.seed(2)
t0 = Sys.time()
cv1 = train(
amount ~ ., data = TR2[,c(6:10, 12:18)], method = "rpart",
trControl = trainControl(method = "cv", number=10), metric="Rsquared",
tuneGrid = expand.grid(cp = seq(0,0.01,0.0002))
)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 6 (<-cap.cyberlink.com:11497)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 5 (<-cap.cyberlink.com:11497)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 4 (<-cap.cyberlink.com:11497)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 3 (<-cap.cyberlink.com:11497)
cv1; plot(cv1)
CART
9269 samples
11 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 8341, 8342, 8341, 8343, 8342, 8342, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.0000 0.4779860 0.1832677 0.3724856
0.0002 0.4702405 0.1953601 0.3652944
0.0004 0.4519525 0.2210319 0.3478196
0.0006 0.4330705 0.2603622 0.3323523
0.0008 0.4292766 0.2682341 0.3300687
0.0010 0.4271255 0.2734102 0.3285059
0.0012 0.4264518 0.2748297 0.3281096
0.0014 0.4261427 0.2754180 0.3279408
0.0016 0.4260281 0.2753425 0.3277046
0.0018 0.4257974 0.2761478 0.3278138
0.0020 0.4260675 0.2752529 0.3282423
0.0022 0.4269210 0.2723708 0.3289436
0.0024 0.4271594 0.2717227 0.3292150
0.0026 0.4270964 0.2717670 0.3289797
0.0028 0.4275150 0.2704514 0.3294101
0.0030 0.4279066 0.2690965 0.3296741
0.0032 0.4282254 0.2679974 0.3300353
0.0034 0.4283739 0.2675199 0.3302126
0.0036 0.4285030 0.2670627 0.3303280
0.0038 0.4285030 0.2670627 0.3303280
0.0040 0.4288005 0.2660790 0.3305796
0.0042 0.4292198 0.2647117 0.3309161
0.0044 0.4301198 0.2615783 0.3319123
0.0046 0.4301198 0.2615783 0.3319123
0.0048 0.4301198 0.2615783 0.3319123
0.0050 0.4306987 0.2595670 0.3322874
0.0052 0.4320488 0.2548389 0.3335425
0.0054 0.4321097 0.2546678 0.3336871
0.0056 0.4332431 0.2505460 0.3345423
0.0058 0.4343981 0.2466171 0.3354148
0.0060 0.4343981 0.2466171 0.3354148
0.0062 0.4352261 0.2437258 0.3358724
0.0064 0.4354776 0.2429160 0.3360484
0.0066 0.4361454 0.2406045 0.3364778
0.0068 0.4360543 0.2409510 0.3364708
0.0070 0.4360543 0.2409510 0.3364708
0.0072 0.4360543 0.2409510 0.3364708
0.0074 0.4360543 0.2409510 0.3364708
0.0076 0.4357632 0.2421806 0.3363040
0.0078 0.4361258 0.2408624 0.3367685
0.0080 0.4361744 0.2406669 0.3369795
0.0082 0.4362522 0.2403186 0.3369814
0.0084 0.4362522 0.2403186 0.3369814
0.0086 0.4362522 0.2403186 0.3369814
0.0088 0.4364604 0.2395806 0.3372111
0.0090 0.4364604 0.2395806 0.3372111
0.0092 0.4364604 0.2395806 0.3372111
0.0094 0.4364604 0.2395806 0.3372111
0.0096 0.4364604 0.2395806 0.3372111
0.0098 0.4364604 0.2395806 0.3372111
0.0100 0.4364604 0.2395806 0.3372111
Rsquared was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0018.
Sys.time() - t0
Time difference of 16.57244 secs
rpart = rpart(amount ~ ., TR2[,c(6:10, 12:18)], cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart, TS2) - TS2$amount)^2)
1 - (SSE/SST) # 0.2460567
[1] 0.2529805
prp(rpart)
# rpart.plot(rpart3,cex=0.6)