Sys.setlocale("LC_ALL","C")
[1] "C/C/C/C/C/zh_TW.UTF-8"
library(dplyr)
library(ggplot2)
library(caTools)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
X = X %>% mutate(wday = format(date, "%w"))
wp=aggregate(X$pieces, by=list(Category=X$wday), FUN=sum)
wi=aggregate(X$items, by=list(Category=X$wday), FUN=sum)
barplot(wp$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #pieces與星期的關係
barplot(wi$x,names.arg = c("SUN","MON","TUE","WED","THU","FRI","SAT")) #items與星期的關係
RFM模型是由喬治·卡利南(George Cullinan)於1961年所提出,他發現資料庫分析中,有三項重要的指標:最近一次消費(Recency)、消費頻率(Frequency)、與消費金額(Monetary)
#新增rscore
A <- A %>% mutate(
rscore = cut(
A$r,
breaks = c(1,4,10,12,15,21,33,48,60,72,92),
include.lowest = T,
labels = c(10,9,8,7,6,5,4,3,2,1)
)
)
#新增fscore
A <- A %>% mutate(
fscore = cut(
A$f,
breaks = c(0,1,2,3,4,6,60),
include.lowest = T,
labels = c(1,5,7,8,9,10)
)
)
#新增mscore
A <- A %>% mutate(
mscore = cut(
A$m,
breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),
include.lowest = T,
labels = c(1,2,3,4,5,6,7,8,9,10)
)
)
A$days = (A$s - A$r) #第一次購買至最後一次購買之間隔天數
A$dayf = A$days/A$f #間隔天數/總購買次數=平均幾天會來一次
A$days越大代表此顧客越依賴本商店之趨勢cor(A$f, A$days)
[1] 0.641
ggplot(A, aes(days, f))+ geom_point(col="#6495ED")+ geom_smooth(col="red")
A$dayf值越小代表顧客越依賴本商店A$days與A$dayf有高度相關,但在最後的迴歸模型中只放一個變數解釋力反而下降cor(A$dayf, A$days)
[1] 0.7567
ggplot(A, aes(days, dayf))+ geom_point(col="#6495ED")
#在資料中新增pieces與items欄位
B = group_by(X, cust) %>% summarise(
pieces = sum(pieces),
items = sum(items)
) %>% data.frame
A = merge(A, B, by="cust", all.x=T)
A$days = (A$s - A$r)
A$dayf = A$days/A$f
#新增rscore
A <- A %>% mutate(
rscore = cut(
A$r,
breaks = c(1,4,10,12,15,21,33,48,60,72,92),
include.lowest = T,
labels = c(10,9,8,7,6,5,4,3,2,1)
)
)
#新增fscore
A <- A %>% mutate(
fscore = cut(
A$f,
breaks = c(0,1,2,3,4,6,60),
include.lowest = T,
labels = c(1,5,7,8,9,10)
)
)
#新增mscore
A <- A %>% mutate(
mscore = cut(
A$m,
breaks = c(0,179.1625,301,423,557.5333,709.5,902,1148,1526.769,2218,10634),
include.lowest = T,
labels = c(1,2,3,4,5,6,7,8,9,10)
)
)
A$distance[A$area == "E"] = 1
A$distance[A$area == "F"] = 2
A$distance[A$area == "D"] = 3
A$distance[A$area == "A"] = 4
A$distance[A$area == "B"] = 5
A$distance[A$area == "C"] = 6
A$distance[A$area == "G"] = 7
A$distance[A$area == "H"] = 1
table(A$distance)
1 2 3 4 5 6 7
11202 7798 1778 802 783 3169 3052
TR = subset(A, spl)
TS = subset(A, !spl)
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
glm1 = glm(buy ~ ., TR[,c(6:9,11, 13:19)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(6:9,
11, 13:19)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.745 -0.871 -0.675 1.014 1.919
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.8176290 0.1396764 -5.85 0.0000000048064700 ***
rev 0.0000156 0.0000192 0.81 0.41846
raw -0.0003214 0.0000863 -3.72 0.00020 ***
ageB -0.0131029 0.0873195 -0.15 0.88072
ageC 0.0391362 0.0806248 0.49 0.62738
ageD 0.0874853 0.0799556 1.09 0.27388
ageE 0.0934663 0.0820278 1.14 0.25452
ageF 0.0250656 0.0853348 0.29 0.76896
ageG 0.0276573 0.0940198 0.29 0.76863
ageH 0.2047475 0.1100367 1.86 0.06278 .
ageI 0.0741508 0.1182329 0.63 0.53056
ageJ 0.2793938 0.1053210 2.65 0.00798 **
ageK -0.0640551 0.1477020 -0.43 0.66452
areaB -0.0443848 0.1333282 -0.33 0.73921
areaC -0.2032366 0.1053918 -1.93 0.05381 .
areaD 0.0451702 0.1120573 0.40 0.68688
areaE 0.2571047 0.0977537 2.63 0.00854 **
areaF 0.1678213 0.0985564 1.70 0.08861 .
areaG -0.0372408 0.1053729 -0.35 0.72377
areaH -0.1393282 0.1238355 -1.13 0.26054
rscore9 -0.2687036 0.0706527 -3.80 0.00014 ***
rscore8 -0.1971253 0.0793359 -2.48 0.01297 *
rscore7 -0.1391499 0.0815604 -1.71 0.08799 .
rscore6 -0.1400645 0.0760864 -1.84 0.06564 .
rscore5 -0.2338014 0.0757662 -3.09 0.00203 **
rscore4 -0.2348815 0.0787455 -2.98 0.00286 **
rscore3 -0.2115949 0.0798278 -2.65 0.00803 **
rscore2 -0.3236706 0.0823394 -3.93 0.0000846162927277 ***
rscore1 -0.4411497 0.0840432 -5.25 0.0000001528565161 ***
fscore5 0.6633223 0.0610249 10.87 < 2e-16 ***
fscore7 0.7976561 0.0731868 10.90 < 2e-16 ***
fscore8 0.8323321 0.1048399 7.94 0.0000000000000020 ***
fscore9 0.9618447 0.1344541 7.15 0.0000000000008447 ***
fscore10 1.5428202 0.1953205 7.90 0.0000000000000028 ***
mscore2 -0.1606396 0.0708616 -2.27 0.02339 *
mscore3 -0.1726033 0.0714829 -2.41 0.01575 *
mscore4 -0.0610448 0.0718581 -0.85 0.39559
mscore5 -0.1585113 0.0723829 -2.19 0.02853 *
mscore6 -0.1808735 0.0730981 -2.47 0.01335 *
mscore7 -0.1461266 0.0740250 -1.97 0.04838 *
mscore8 -0.2478487 0.0767948 -3.23 0.00125 **
mscore9 -0.1981045 0.0813516 -2.44 0.01488 *
mscore10 -0.1847960 0.0942401 -1.96 0.04989 *
days 0.0240729 0.0032828 7.33 0.0000000000002250 ***
dayf -0.0493936 0.0078936 -6.26 0.0000000003913264 ***
pieces 0.0000808 0.0022644 0.04 0.97153
items 0.0083215 0.0030602 2.72 0.00654 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23142 on 19961 degrees of freedom
AIC: 23236
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3785 818
TRUE 1691 2282
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
[1] 0.7074
colAUC(pred, TS$buy) # 0.7556
[,1]
FALSE vs. TRUE 0.7576
lm1 = lm(amount ~ ., TR2[,c(6,8:10,13:19)])
summary(lm1)
Call:
lm(formula = amount ~ ., data = TR2[, c(6, 8:10, 13:19)])
Residuals:
Min 1Q Median 3Q Max
-1.8071 -0.2243 0.0486 0.2781 1.4810
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.6306325 0.0940722 17.33 < 2e-16 ***
rev 0.3531444 0.0420181 8.40 < 2e-16 ***
ageB 0.0713390 0.0249489 2.86 0.00425 **
ageC 0.1167029 0.0229124 5.09 0.000000358579 ***
ageD 0.1203492 0.0226065 5.32 0.000000104091 ***
ageE 0.1278418 0.0231154 5.53 0.000000032786 ***
ageF 0.1014801 0.0241152 4.21 0.000025991936 ***
ageG 0.0790777 0.0263249 3.00 0.00267 **
ageH 0.0703731 0.0310595 2.27 0.02349 *
ageI 0.0699467 0.0319027 2.19 0.02837 *
ageJ -0.0220438 0.0280813 -0.78 0.43248
ageK 0.1281026 0.0391771 3.27 0.00108 **
areaB 0.0842920 0.0432456 1.95 0.05131 .
areaC 0.0418660 0.0351167 1.19 0.23322
areaD -0.0078782 0.0369139 -0.21 0.83100
areaE 0.0112585 0.0323958 0.35 0.72820
areaF 0.0142555 0.0326285 0.44 0.66219
areaG 0.0266225 0.0346978 0.77 0.44294
areaH 0.0194646 0.0386084 0.50 0.61417
rscore9 -0.0267572 0.0154468 -1.73 0.08327 .
rscore8 -0.0242339 0.0185396 -1.31 0.19120
rscore7 -0.0127723 0.0192583 -0.66 0.50721
rscore6 0.0149635 0.0185977 0.80 0.42108
rscore5 -0.0184456 0.0196912 -0.94 0.34892
rscore4 -0.0212455 0.0216923 -0.98 0.32741
rscore3 -0.0123278 0.0223010 -0.55 0.58042
rscore2 -0.0232518 0.0237829 -0.98 0.32826
rscore1 0.0044405 0.0251430 0.18 0.85982
fscore5 -0.1020025 0.0244602 -4.17 0.000030718485 ***
fscore7 -0.1690644 0.0274724 -6.15 0.000000000787 ***
fscore8 -0.2114310 0.0324538 -6.51 0.000000000077 ***
fscore9 -0.2498990 0.0371616 -6.72 0.000000000019 ***
fscore10 -0.2711706 0.0482230 -5.62 0.000000019285 ***
mscore2 0.0556642 0.0262046 2.12 0.03368 *
mscore3 0.0529852 0.0313072 1.69 0.09060 .
mscore4 0.0925559 0.0354131 2.61 0.00897 **
mscore5 0.1199531 0.0392777 3.05 0.00226 **
mscore6 0.1197952 0.0428412 2.80 0.00518 **
mscore7 0.1419878 0.0465258 3.05 0.00228 **
mscore8 0.1550719 0.0507021 3.06 0.00223 **
mscore9 0.1896448 0.0558285 3.40 0.00068 ***
mscore10 0.1903761 0.0649944 2.93 0.00341 **
days 0.0018384 0.0007315 2.51 0.01198 *
dayf -0.0051296 0.0019069 -2.69 0.00716 **
pieces 0.0000864 0.0004220 0.20 0.83775
items 0.0021006 0.0005885 3.57 0.00036 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.418 on 9223 degrees of freedom
Multiple R-squared: 0.304, Adjusted R-squared: 0.301
F-statistic: 89.6 on 45 and 9223 DF, p-value: <2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3042 0.2689
library(caret)
library(doParallel)
clust = makeCluster(detectCores())
#顯示X個執行緒
registerDoParallel(clust); getDoParWorkers()
[1] 4
set.seed(2)
t0 = Sys.time()
cv1 = train(
amount ~ ., data = TR2[,c(6:10, 12:18)], method = "rpart",
trControl = trainControl(method = "cv", number=10), metric="Rsquared",
tuneGrid = expand.grid(cp = seq(0,0.01,0.0002))
)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 10 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 9 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 8 (<-localhost:11038)
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 7 (<-localhost:11038)
cv1; plot(cv1)
CART
9269 samples
11 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 8341, 8342, 8341, 8343, 8342, 8342, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.0000 0.4780 0.1833 0.3725
0.0002 0.4702 0.1954 0.3653
0.0004 0.4520 0.2210 0.3478
0.0006 0.4331 0.2604 0.3324
0.0008 0.4293 0.2682 0.3301
0.0010 0.4271 0.2734 0.3285
0.0012 0.4265 0.2748 0.3281
0.0014 0.4261 0.2754 0.3279
0.0016 0.4260 0.2753 0.3277
0.0018 0.4258 0.2761 0.3278
0.0020 0.4261 0.2753 0.3282
0.0022 0.4269 0.2724 0.3289
0.0024 0.4272 0.2717 0.3292
0.0026 0.4271 0.2718 0.3290
0.0028 0.4275 0.2705 0.3294
0.0030 0.4279 0.2691 0.3297
0.0032 0.4282 0.2680 0.3300
0.0034 0.4284 0.2675 0.3302
0.0036 0.4285 0.2671 0.3303
0.0038 0.4285 0.2671 0.3303
0.0040 0.4288 0.2661 0.3306
0.0042 0.4292 0.2647 0.3309
0.0044 0.4301 0.2616 0.3319
0.0046 0.4301 0.2616 0.3319
0.0048 0.4301 0.2616 0.3319
0.0050 0.4307 0.2596 0.3323
0.0052 0.4320 0.2548 0.3335
0.0054 0.4321 0.2547 0.3337
0.0056 0.4332 0.2505 0.3345
0.0058 0.4344 0.2466 0.3354
0.0060 0.4344 0.2466 0.3354
0.0062 0.4352 0.2437 0.3359
0.0064 0.4355 0.2429 0.3360
0.0066 0.4361 0.2406 0.3365
0.0068 0.4361 0.2410 0.3365
0.0070 0.4361 0.2410 0.3365
0.0072 0.4361 0.2410 0.3365
0.0074 0.4361 0.2410 0.3365
0.0076 0.4358 0.2422 0.3363
0.0078 0.4361 0.2409 0.3368
0.0080 0.4362 0.2407 0.3370
0.0082 0.4363 0.2403 0.3370
0.0084 0.4363 0.2403 0.3370
0.0086 0.4363 0.2403 0.3370
0.0088 0.4365 0.2396 0.3372
0.0090 0.4365 0.2396 0.3372
0.0092 0.4365 0.2396 0.3372
0.0094 0.4365 0.2396 0.3372
0.0096 0.4365 0.2396 0.3372
0.0098 0.4365 0.2396 0.3372
0.0100 0.4365 0.2396 0.3372
Rsquared was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0018.
Sys.time() - t0
Time difference of 12.69 secs
rpart = rpart(amount ~ ., TR2[,c(6:10, 12:18)], cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart, TS2) - TS2$amount)^2)
1 - (SSE/SST) # 0.2460567
[1] 0.253
prp(rpart)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
#rpart.plot(rpart3,cex=0.6)