資料前置處理
Fig-1: Feature Engineering
Sys.setlocale("LC_ALL","C")
[1] "C/C/C/C/C/en_US.UTF-8"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(lattice)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008 9269
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
0 1 2 3 4 5 6
18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584 7
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 1 1 0 0 0 0 0
1113 2 1 0 0 0 0 1
1359 0 1 0 0 0 0 0
1823 0 1 0 1 1 0 0
2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000 0.2500000
1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000 0.0000000
2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
head(A)
A$raf = as.numeric(A$raw/A$f) # 淨利 / 購買次數(平均顧客淨利)
A$ram = as.numeric(A$raw/A$m) # 淨利 / 單次購買金額(平均訂單淨利)
A$cy = as.numeric((A$s-A$r)/A$f) # (第一次購買日期 - 最近一次購買日期) / 購買次數(顧客購買週期)
A$ref = as.numeric(A$rev/A$f) # 收益 / 購買次數(平均顧客收益)
A$fre = as.numeric(A$f*A$rev) # 購買次數 * 收益(共變項)
A$mre = as.numeric(A$m*A$rev) # 單次購買金額 * 收益(共變項)
A$rem = as.numeric(A$m/A$rev) # 單次購買金額 / 收益(顧客單次購買金額佔收益比)
A$mra = as.numeric(A$m/A$raw) # 單次購買金額 / 淨利(顧客單次購買金額佔淨利比)
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~.+area:f, TR[,c(9:11,16,18,20:25)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ . + area:f, family = binomial(), data = TR[,
c(9:11, 16, 18, 20:25)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.9779 -0.8666 -0.6733 0.9997 1.8599
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.921e-01 2.254e-01 -1.296 0.19506
r -2.142e-02 2.190e-03 -9.781 < 2e-16 ***
s 1.825e-02 2.210e-03 8.259 < 2e-16 ***
f 2.456e-01 8.791e-02 2.794 0.00521 **
areaB -2.053e-01 2.505e-01 -0.820 0.41230
areaC -1.241e-01 1.955e-01 -0.635 0.52574
areaD 2.852e-01 2.031e-01 1.404 0.16025
areaE 3.763e-01 1.824e-01 2.063 0.03908 *
areaF 3.450e-01 1.834e-01 1.881 0.05995 .
areaG -1.038e-01 1.965e-01 -0.528 0.59742
areaH 1.390e-02 2.080e-01 0.067 0.94673
ram -3.069e-01 6.299e-02 -4.872 1.11e-06 ***
cy -3.491e-02 5.124e-03 -6.814 9.52e-12 ***
ref -6.387e-06 2.973e-05 -0.215 0.82988
fre -1.259e-07 1.066e-06 -0.118 0.90598
mre -1.696e-10 3.656e-09 -0.046 0.96301
rem -9.344e-01 1.160e-01 -8.057 7.80e-16 ***
f:areaB 7.038e-02 1.114e-01 0.632 0.52760
f:areaC -5.082e-02 8.934e-02 -0.569 0.56944
f:areaD -1.395e-01 9.160e-02 -1.523 0.12768
f:areaE -7.378e-02 8.378e-02 -0.881 0.37851
f:areaF -9.145e-02 8.401e-02 -1.089 0.27633
f:areaG 1.617e-02 8.971e-02 0.180 0.85696
f:areaH -9.734e-02 8.735e-02 -1.114 0.26514
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23182 on 19984 degrees of freedom
AIC: 23230
Number of Fisher Scoring iterations: 7
predglm = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = predglm > 0.5); cm
predict
actual FALSE TRUE
FALSE 3797 806
TRUE 1709 2264
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.6989
[1] 0.7067397
colAUC(predglm, TS$buy) # AUC = 0.7551
[,1]
FALSE vs. TRUE 0.7579333
# 嘗試歷程:
# Accuracy AUC
# 0.7009 0.7508 (area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev)
# 0.7013 0.7523 (.+area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev)
# 0.7007 0.7533 (.+ area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev + rf + rm)
# 0.7024 0.7548 (.+area:f + area:m)
# 0.6989 0.7551 (.有week)
# 0.7018 0.7553 (.+area:f + area:m + rm)
# 0.7000 0.7556 (.無week)
# 0.7022 0.7557 (.+area:f + area:m + rf + rm)
# 0.7008 0.7561 (. + rf + rm)
# 0.7016 0.7565 (. + rf + rm 無week)
# 0.7017 0.7568 (. + rf + rm + cy 無week)
# 0.7038 0.7575 (c(9:11,16,18:23))
# 0.7039 0.7575 (9:11,16,18,20:24)
# 0.7067 0.7579 (9:11,16,18,20:25)
rpart1 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class")
pred1 = predict(rpart1, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred1 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7066
[1] 0.7066231
colAUC(pred1, TS$buy) # AUC = 0.6984
[,1]
FALSE vs. TRUE 0.6983998
rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart2 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class",cp=0.001)
pred2 = predict(rpart2, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred2 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3869 734
TRUE 1813 2160
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7030
[1] 0.7030084
colAUC(pred2, TS$buy) # AUC = 0.7152
[,1]
FALSE vs. TRUE 0.715176
rpart.plot(rpart2,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
set.seed(2018)
TR$buy = as.factor(TR$buy)
cv1 = train(
buy ~ ., data = TR[,c(9:11,16,18,20:25)], method = "rpart",
trControl = trainControl(method = "cv", number=10),
tuneGrid = expand.grid(cp = seq(0,0.002,0.00005))
)
cv1 # cp = 0.00075
CART
20008 samples
10 predictor
2 classes: 'FALSE', 'TRUE'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 18007, 18007, 18008, 18007, 18007, 18007, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000 0.6348466 0.2627580
0.00005 0.6400444 0.2728605
0.00010 0.6539884 0.2992452
0.00015 0.6666335 0.3243584
0.00020 0.6776788 0.3460111
0.00025 0.6828266 0.3554102
0.00030 0.6871251 0.3627591
0.00035 0.6915733 0.3706101
0.00040 0.6934224 0.3738285
0.00045 0.6934224 0.3729749
0.00050 0.6952215 0.3757621
0.00055 0.6969708 0.3793355
0.00060 0.6976706 0.3809437
0.00065 0.6977706 0.3816090
0.00070 0.6977706 0.3816090
0.00075 0.6980204 0.3819688
0.00080 0.6978704 0.3818306
0.00085 0.6978204 0.3818243
0.00090 0.6978204 0.3818243
0.00095 0.6978204 0.3818243
0.00100 0.6980204 0.3823016
0.00105 0.6980204 0.3824037
0.00110 0.6974706 0.3814249
0.00115 0.6974706 0.3815262
0.00120 0.6974206 0.3815922
0.00125 0.6976705 0.3821794
0.00130 0.6968709 0.3811019
0.00135 0.6968709 0.3811019
0.00140 0.6960712 0.3798735
0.00145 0.6960712 0.3798735
0.00150 0.6960712 0.3798735
0.00155 0.6960712 0.3798735
0.00160 0.6970708 0.3822985
0.00165 0.6970708 0.3825282
0.00170 0.6966710 0.3815035
0.00175 0.6968709 0.3820585
0.00180 0.6968709 0.3820585
0.00185 0.6968709 0.3820585
0.00190 0.6968709 0.3820585
0.00195 0.6968709 0.3820585
0.00200 0.6971208 0.3829141
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.00075.
plot(cv1, main = sprintf("optimal cp at %f", cv1$bestTune$cp))
rpart3 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class",cp=0.00075)
pred3 = predict(rpart3, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred3 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3841 762
TRUE 1772 2201
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7045
[1] 0.7045243
colAUC(pred3, TS$buy) # AUC = 0.7106
[,1]
FALSE vs. TRUE 0.710618
prp(rpart3)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart3,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(rpart)
library(rpart.plot)
rpart4 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class")
pred4 = predict(rpart4, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred4 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7066
[1] 0.7066231
colAUC(pred4, TS$buy) # AUC = 0.6984
[,1]
FALSE vs. TRUE 0.6983998
rpart.plot(rpart4,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart5 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.001)
pred5 = predict(rpart5, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred5 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3822 781
TRUE 1748 2225
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7051
[1] 0.7051073
colAUC(pred5, TS$buy) # AUC = 0.7169
[,1]
FALSE vs. TRUE 0.7169118
rpart.plot(rpart5,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart6 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.00055)
pred6 = predict(rpart6, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred6 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3768 835
TRUE 1739 2234
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.6999
[1] 0.6998601
colAUC(pred6, TS$buy) # AUC = 0.7361
[,1]
FALSE vs. TRUE 0.7361045
prp(rpart6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart6,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(caret)
library(e1071)
library(lattice)
set.seed(2018)
TR$buy = as.factor(TR$buy)
cv2 = train(
buy ~ ., data = TR[,c(9:16,18,21)], method = "rpart",
trControl = trainControl(method = "cv", number=10),
tuneGrid = expand.grid(cp = seq(0,0.002,0.00005))
)
cv2 # cp=0.0008
CART
20008 samples
9 predictor
2 classes: 'FALSE', 'TRUE'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 18007, 18007, 18008, 18007, 18007, 18007, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000 0.6317479 0.2568032
0.00005 0.6365459 0.2657862
0.00010 0.6514893 0.2945885
0.00015 0.6674825 0.3249963
0.00020 0.6762792 0.3423637
0.00025 0.6789781 0.3476074
0.00030 0.6813274 0.3520513
0.00035 0.6834265 0.3561803
0.00040 0.6900237 0.3679008
0.00045 0.6916729 0.3704263
0.00050 0.6910232 0.3693334
0.00055 0.6938718 0.3745478
0.00060 0.6949715 0.3768500
0.00065 0.6952214 0.3770998
0.00070 0.6954213 0.3773214
0.00075 0.6961210 0.3782532
0.00080 0.6975704 0.3808554
0.00085 0.6973705 0.3805663
0.00090 0.6974205 0.3808109
0.00095 0.6974205 0.3809004
0.00100 0.6972206 0.3808203
0.00105 0.6972206 0.3808936
0.00110 0.6970707 0.3806296
0.00115 0.6970707 0.3806296
0.00120 0.6970207 0.3806541
0.00125 0.6970707 0.3812038
0.00130 0.6967710 0.3811371
0.00135 0.6967710 0.3812940
0.00140 0.6963711 0.3806516
0.00145 0.6961212 0.3800573
0.00150 0.6961212 0.3800573
0.00155 0.6961212 0.3800573
0.00160 0.6967709 0.3817813
0.00165 0.6967709 0.3817813
0.00170 0.6967709 0.3817813
0.00175 0.6968709 0.3820585
0.00180 0.6968709 0.3820585
0.00185 0.6968709 0.3820585
0.00190 0.6968709 0.3820585
0.00195 0.6968709 0.3820585
0.00200 0.6971208 0.3829141
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 8e-04.
plot(cv2, main = sprintf("optimal cp at %f", cv2$bestTune$cp))
rpart7 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.0008)
pred7 = predict(rpart7, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred7 > 0.5); cm
predict
actual FALSE TRUE
FALSE 3879 724
TRUE 1801 2172
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7056
[1] 0.7055737
colAUC(pred7, TS$buy) # AUC = 0.7156
[,1]
FALSE vs. TRUE 0.7156103
rpart.plot(rpart7,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(randomForest)
TR$buy = as.numeric(TR$buy)
TS$buy = as.numeric(TS$buy)
a_rf = randomForest(buy ~ ., data = TR[,c(9:16,18,21)] , nodesize = 25, ntree =200)
The response has five or fewer unique values. Are you sure you want to do regression?
PredictForest = predict(a_rf , newdata = TS)
table(TS$buy , PredictForest > 0.5) %>% {sum(diag(.))/sum(.)} # Accuracy = 0.5367
[1] 0.5367304
colAUC(PredictForest,TS$buy) # AUC = 0.7487
[,1]
0 vs. 1 0.7481006
A_km = subset(A[,c(9:14,18:25)])
library(caret)
preproc = preProcess(A_km)
A_kmN = predict(preproc, A_km)
summary(A_kmN)
r s f m
Min. :-1.1941 Min. :-2.3336 Min. :-0.5528 Min. :-1.0120
1st Qu.:-0.8104 1st Qu.:-0.5524 1st Qu.:-0.5528 1st Qu.:-0.6580
Median :-0.4268 Median : 0.2607 Median :-0.2882 Median :-0.3052
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.8010 3rd Qu.: 0.8416 3rd Qu.: 0.2411 3rd Qu.: 0.3049
Max. : 2.2974 Max. : 1.1901 Max. :15.0608 Max. : 9.6946
rev raw buy raf
Min. :-0.7470 Min. :-1.8636 Mode :logical Min. :-4.6329
1st Qu.:-0.5729 1st Qu.:-0.5622 FALSE:15342 1st Qu.:-0.6068
Median :-0.3164 Median :-0.3250 TRUE :13242 Median :-0.2955
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.1977 3rd Qu.: 0.1830 3rd Qu.: 0.2653
Max. :26.7788 Max. :24.2711 Max. :13.6580
ram cy ref fre
Min. :-4.3599 Min. :-0.8657 Min. :-1.0120 Min. :-0.20597
1st Qu.:-0.4487 1st Qu.:-0.8657 1st Qu.:-0.6580 1st Qu.:-0.19658
Median :-0.2702 Median :-0.2795 Median :-0.3052 Median :-0.17357
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.1690 3rd Qu.: 0.6324 3rd Qu.: 0.3049 3rd Qu.:-0.07918
Max. :15.6588 Max. : 3.9827 Max. : 9.6946 Max. :54.00544
mre rem
Min. :-0.458600 Min. :-1.6642
1st Qu.:-0.430521 1st Qu.:-1.0037
Median :-0.330955 Median :-0.2959
Mean : 0.000000 Mean : 0.0000
3rd Qu.:-0.007498 3rd Qu.: 1.1195
Max. :22.465392 Max. : 1.1195
set.seed(2018)
kmc = kmeans(A_kmN,6, iter.max = 1000)
kg = kmc$cluster
table(kg)
kg
1 2 3 4 5 6
1022 10783 7732 5299 3225 523
par(cex=0.8)
kmc$centers %>% round(2) %>% t %>% barplot(beside=T,col=rainbow(14))
legend('right',inset = 0.002,legend=colnames(A_kmN),fill=rainbow(14))
rpart8 = rpart(buy ~ r+s+f+m+rev+raw+cy, A_kmN, method="class",cp=0.002)
pred8 = predict(rpart8, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred8 > 0.5); cm
predict
actual TRUE
0 4603
1 3973
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts
[1] 0.5367304
colAUC(pred8, TS$buy) # Accuracy = 0.5367
[,1]
0 vs. 1 0.5
# AUC = 0.5
rpart.plot(rpart8,cex=0.6)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
A3 = A2
A3$rev_m = A3$rev/A3$m #新增「單一顧客總收益除以顧客單日消費平均收益」
A3$f_m = A3$f*A3$m # 新增「總次數乘以顧客單日消費平均收益」
summary(A3)
cust W1 W2 W3
Min. : 1069 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 959997 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 1622622 Median :0.0625 Median :0.0000 Median :0.0000
Mean : 1416974 Mean :0.2184 Mean :0.1407 Mean :0.1228
3rd Qu.: 1906352 3rd Qu.:0.3333 3rd Qu.:0.2222 3rd Qu.:0.2000
Max. :20002000 Max. :1.0000 Max. :1.0000 Max. :1.0000
W4 W5 W6 W7 r
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 1.00
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 9.00
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000 Median :15.00
Mean :0.1069 Mean :0.1228 Mean :0.1170 Mean :0.1714 Mean :24.82
3rd Qu.:0.1667 3rd Qu.:0.2000 3rd Qu.:0.1818 3rd Qu.:0.2500 3rd Qu.:36.00
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :92.00
s f m rev raw
Min. : 1.00 Min. : 1.000 Min. :0.9031 Min. :0.9031 Min. : -742.0
1st Qu.:57.00 1st Qu.: 2.000 1st Qu.:2.5670 1st Qu.:2.9791 1st Qu.: 104.0
Median :75.00 Median : 3.000 Median :2.8341 Median :3.3571 Median : 316.0
Mean :67.65 Mean : 4.517 Mean :2.8059 Mean :3.2945 Mean : 560.8
3rd Qu.:86.00 3rd Qu.: 5.000 3rd Qu.:3.0791 3rd Qu.:3.6762 3rd Qu.: 727.8
Max. :92.00 Max. :60.000 Max. :4.0267 Max. :4.9982 Max. :15565.0
age area amount buy raf
D :2714 E :5394 Min. :0.9031 Mode:logical Min. :-742.00
C :2307 F :3880 1st Qu.:2.6571 TRUE:13242 1st Qu.: 41.67
E :2129 G :1133 Median :2.9969 Median : 95.76
F :1556 C :1043 Mean :2.9425 Mean : 144.11
B :1217 D : 670 3rd Qu.:3.2911 3rd Qu.: 189.57
G : 906 H : 554 Max. :4.4485 Max. :2555.00
(Other):2413 (Other): 568
ram cy ref fre
Min. :-2.2899 Min. : 0.000 Min. : 8.0 Min. : 8
1st Qu.: 0.1856 1st Qu.: 2.000 1st Qu.: 369.0 1st Qu.: 1664
Median : 0.4085 Median : 9.333 Median : 682.5 Median : 6709
Mean : 0.6227 Mean :10.008 Mean : 938.3 Mean : 31504
3rd Qu.: 0.8257 3rd Qu.:15.000 3rd Qu.: 1199.7 3rd Qu.: 22980
Max. :10.1729 Max. :44.000 Max. :10634.0 Max. :4581462
mre rem mra rev_m
Min. : 64 Min. :0.01667 Min. :-2516.000 Min. :1.000
1st Qu.: 404572 1st Qu.:0.20000 1st Qu.: 0.948 1st Qu.:1.090
Median : 1637361 Median :0.33333 Median : 2.027 Median :1.170
Mean : 5056656 Mean :0.44482 Mean : Inf Mean :1.178
3rd Qu.: 5289058 3rd Qu.:0.50000 3rd Qu.: 4.285 3rd Qu.:1.268
Max. :215642661 Max. :1.00000 Max. : Inf Max. :1.843
f_m
Min. : 0.9031
1st Qu.: 4.5190
Median : 8.6330
Mean : 12.5693
3rd Qu.: 15.5584
Max. :169.2416
summary(A3)
cust W1 W2 W3
Min. : 1069 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 959997 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 1622622 Median :0.0625 Median :0.0000 Median :0.0000
Mean : 1416974 Mean :0.2184 Mean :0.1407 Mean :0.1228
3rd Qu.: 1906352 3rd Qu.:0.3333 3rd Qu.:0.2222 3rd Qu.:0.2000
Max. :20002000 Max. :1.0000 Max. :1.0000 Max. :1.0000
W4 W5 W6 W7 r
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 1.00
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 9.00
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000 Median :15.00
Mean :0.1069 Mean :0.1228 Mean :0.1170 Mean :0.1714 Mean :24.82
3rd Qu.:0.1667 3rd Qu.:0.2000 3rd Qu.:0.1818 3rd Qu.:0.2500 3rd Qu.:36.00
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :92.00
s f m rev raw
Min. : 1.00 Min. : 1.000 Min. :0.9031 Min. :0.9031 Min. : -742.0
1st Qu.:57.00 1st Qu.: 2.000 1st Qu.:2.5670 1st Qu.:2.9791 1st Qu.: 104.0
Median :75.00 Median : 3.000 Median :2.8341 Median :3.3571 Median : 316.0
Mean :67.65 Mean : 4.517 Mean :2.8059 Mean :3.2945 Mean : 560.8
3rd Qu.:86.00 3rd Qu.: 5.000 3rd Qu.:3.0791 3rd Qu.:3.6762 3rd Qu.: 727.8
Max. :92.00 Max. :60.000 Max. :4.0267 Max. :4.9982 Max. :15565.0
age area amount buy raf
D :2714 E :5394 Min. :0.9031 Mode:logical Min. :-742.00
C :2307 F :3880 1st Qu.:2.6571 TRUE:13242 1st Qu.: 41.67
E :2129 G :1133 Median :2.9969 Median : 95.76
F :1556 C :1043 Mean :2.9425 Mean : 144.11
B :1217 D : 670 3rd Qu.:3.2911 3rd Qu.: 189.57
G : 906 H : 554 Max. :4.4485 Max. :2555.00
(Other):2413 (Other): 568
ram cy ref fre
Min. :-2.2899 Min. : 0.000 Min. : 8.0 Min. : 8
1st Qu.: 0.1856 1st Qu.: 2.000 1st Qu.: 369.0 1st Qu.: 1664
Median : 0.4085 Median : 9.333 Median : 682.5 Median : 6709
Mean : 0.6227 Mean :10.008 Mean : 938.3 Mean : 31504
3rd Qu.: 0.8257 3rd Qu.:15.000 3rd Qu.: 1199.7 3rd Qu.: 22980
Max. :10.1729 Max. :44.000 Max. :10634.0 Max. :4581462
mre rem mra rev_m
Min. : 64 Min. :0.01667 Min. :-2516.000 Min. :1.000
1st Qu.: 404572 1st Qu.:0.20000 1st Qu.: 0.948 1st Qu.:1.090
Median : 1637361 Median :0.33333 Median : 2.027 Median :1.170
Mean : 5056656 Mean :0.44482 Mean : Inf Mean :1.178
3rd Qu.: 5289058 3rd Qu.:0.50000 3rd Qu.: 4.285 3rd Qu.:1.268
Max. :215642661 Max. :1.00000 Max. : Inf Max. :1.843
f_m
Min. : 0.9031
1st Qu.: 4.5190
Median : 8.6330
Mean : 12.5693
3rd Qu.: 15.5584
Max. :169.2416
TR3 = subset(A3, spl2) # Training data:9269筆
TS3 = subset(A3, !spl2) # Testing data:3973筆
summary(TR3)
cust W1 W2 W3 W4
Min. : 3667 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 958752 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :1620030 Median :0.0625 Median :0.0000 Median :0.0000 Median :0.0000
Mean :1415972 Mean :0.2186 Mean :0.1415 Mean :0.1195 Mean :0.1081
3rd Qu.:1909555 3rd Qu.:0.3333 3rd Qu.:0.2222 3rd Qu.:0.1875 3rd Qu.:0.1667
Max. :2173832 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
W5 W6 W7 r s
Min. :0.000 Min. :0.0000 Min. :0.0000 Min. : 1.00 Min. : 1.00
1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:56.00
Median :0.000 Median :0.0000 Median :0.0000 Median :15.00 Median :75.00
Mean :0.123 Mean :0.1176 Mean :0.1718 Mean :24.98 Mean :67.54
3rd Qu.:0.200 3rd Qu.:0.1818 3rd Qu.:0.2500 3rd Qu.:37.00 3rd Qu.:86.00
Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :92.00 Max. :92.00
f m rev raw age
Min. : 1.000 Min. :1.079 Min. :1.079 Min. : -728.0 D :1899
1st Qu.: 2.000 1st Qu.:2.561 1st Qu.:2.973 1st Qu.: 101.0 C :1643
Median : 3.000 Median :2.830 Median :3.351 Median : 310.0 E :1514
Mean : 4.517 Mean :2.804 Mean :3.291 Mean : 559.2 F :1055
3rd Qu.: 5.000 3rd Qu.:3.077 3rd Qu.:3.675 3rd Qu.: 722.0 B : 837
Max. :60.000 Max. :4.027 Max. :4.948 Max. :14701.0 G : 632
(Other):1689
area amount buy raf ram
E :3784 Min. :1.000 Mode:logical Min. :-354.00 Min. :-2.2899
F :2696 1st Qu.:2.659 TRUE:9269 1st Qu.: 41.00 1st Qu.: 0.1839
G : 820 Median :2.999 Median : 94.45 Median : 0.4034
C : 715 Mean :2.944 Mean : 143.72 Mean : 0.6220
D : 470 3rd Qu.:3.292 3rd Qu.: 186.25 3rd Qu.: 0.8142
H : 407 Max. :4.421 Max. :2555.00 Max. :10.1729
(Other): 377
cy ref fre mre
Min. : 0.000 Min. : 12.0 Min. : 12 Min. : 144
1st Qu.: 1.655 1st Qu.: 364.0 1st Qu.: 1646 1st Qu.: 392502
Median : 9.167 Median : 675.4 Median : 6576 Median : 1602756
Mean : 9.917 Mean : 935.6 Mean : 31589 Mean : 5030342
3rd Qu.:15.000 3rd Qu.: 1195.0 3rd Qu.: 22913 3rd Qu.: 5257849
Max. :44.000 Max. :10634.0 Max. :3634363 Max. :191648328
rem mra rev_m f_m
Min. :0.01667 Min. :-2516.0000 Min. :1.000 Min. : 1.079
1st Qu.:0.20000 1st Qu.: 0.9364 1st Qu.:1.089 1st Qu.: 4.408
Median :0.33333 Median : 2.0528 Median :1.169 Median : 8.550
Mean :0.44790 Mean : Inf Mean :1.177 Mean : 12.562
3rd Qu.:0.50000 3rd Qu.: 4.3581 3rd Qu.:1.268 3rd Qu.: 15.547
Max. :1.00000 Max. : Inf Max. :1.722 Max. :169.223
lm2 = lm(amount ~ ., TR3[,c(9:14,17,19:25,27:28)])
summary(lm2) # Training R^2 = 0.2994
Call:
lm(formula = amount ~ ., data = TR3[, c(9:14, 17, 19:25, 27:28)])
Residuals:
Min 1Q Median 3Q Max
-1.87200 -0.22629 0.05143 0.27906 1.34637
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.354e+00 5.011e-01 10.684 < 2e-16 ***
r -8.485e-04 7.219e-04 -1.175 0.239881
s 1.015e-03 7.247e-04 1.401 0.161156
f 8.476e-02 2.337e-02 3.627 0.000289 ***
m -1.269e+00 3.177e-01 -3.996 6.50e-05 ***
rev 1.634e+00 3.116e-01 5.245 1.60e-07 ***
raw 1.822e-05 2.530e-05 0.720 0.471379
raf 9.056e-05 7.615e-05 1.189 0.234378
ram 3.361e-02 1.650e-02 2.037 0.041660 *
cy -2.503e-03 1.900e-03 -1.317 0.187789
ref 2.845e-05 2.145e-05 1.326 0.184807
fre 6.888e-08 1.323e-07 0.521 0.602678
mre 2.874e-10 1.308e-09 0.220 0.826073
rem 1.793e-01 1.253e-01 1.431 0.152432
rev_m -3.775e+00 4.971e-01 -7.593 3.42e-14 ***
f_m -2.754e-02 9.873e-03 -2.790 0.005282 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4189 on 9253 degrees of freedom
Multiple R-squared: 0.2994, Adjusted R-squared: 0.2982
F-statistic: 263.6 on 15 and 9253 DF, p-value: < 2.2e-16
predlm = predict(lm2, TS3, type="response")
SSE = sum((TS3$amount - predlm)^2)
SST = sum((TS3$amount - mean(TS3$amount))^2)
RSquare = 1 - SSE/SST
RSquare # Testing R^2 = 0.2708
[1] 0.2707844
rpart9 = rpart(amount ~ ., TR3[,c(9:14,17,19:25,27:28)], cp=0.002)
SST = sum((TS3$amount - mean(TR3$amount))^ 2)
SSE = sum((predict(rpart6, TS3) - TS3$amount)^2)
RSquare = 1 - (SSE/SST)
RSquare # Testing R^2 = 0.2446
px = cbind(glm=predglm, cart1=pred1, cart2=pred2, cart3=pred3, cart4 = pred4, cart5 = pred5, cart6 = pred6, cart7 = pred7, rf = PredictForest)
apply(px, 2, function(x) {
table(TS$buy, x > 0.5) %>% {sum(diag(.))/sum(.)}
}) %>% sort
rf cart6 cart2 cart3 cart5 cart7 cart1 cart4
0.5367304 0.6998601 0.7030084 0.7045243 0.7051073 0.7055737 0.7066231 0.7066231
glm
0.7067397
par(cex=1.25)
colAUC(px, TS$buy, T)
glm cart1 cart2 cart3 cart4 cart5 cart6 cart7
0 vs. 1 0.7579333 0.6983998 0.715176 0.710618 0.6983998 0.7169118 0.7361045 0.7156103
rf
0 vs. 1 0.7481006
cor(px)
glm cart1 cart2 cart3 cart4 cart5 cart6 cart7
glm 1.0000000 0.8744852 0.9288347 0.9214609 0.8744852 0.9300036 0.9304863 0.9253247
cart1 0.8744852 1.0000000 0.9147003 0.9092120 1.0000000 0.9185831 0.8445941 0.9143659
cart2 0.9288347 0.9147003 1.0000000 0.9919507 0.9147003 0.9860611 0.9121775 0.9813477
cart3 0.9214609 0.9092120 0.9919507 1.0000000 0.9092120 0.9800762 0.9076641 0.9758323
cart4 0.8744852 1.0000000 0.9147003 0.9092120 1.0000000 0.9185831 0.8445941 0.9143659
cart5 0.9300036 0.9185831 0.9860611 0.9800762 0.9185831 1.0000000 0.9240782 0.9942473
cart6 0.9304863 0.8445941 0.9121775 0.9076641 0.8445941 0.9240782 1.0000000 0.9293828
cart7 0.9253247 0.9143659 0.9813477 0.9758323 0.9143659 0.9942473 0.9293828 1.0000000
rf 0.9307312 0.8050556 0.8663154 0.8617552 0.8050556 0.8703514 0.8885728 0.8679884
rf
glm 0.9307312
cart1 0.8050556
cart2 0.8663154
cart3 0.8617552
cart4 0.8050556
cart5 0.8703514
cart6 0.8885728
cart7 0.8679884
rf 1.0000000
glm_cart = (px[,"glm"] + px[,"cart6"])/2
glm_rf = (px[,"glm"] + px[,"rf"])/2
px2 = cbind(px, glm_cart, glm_rf)
rbind(apply(px2, 2, function(x) {
table(TS$buy, x > 0.5) %>% {sum(diag(.))/sum(.)} }),
colAUC(px2, TS$buy)) %>% t %>%
data.frame %>% setNames(c("Accuracy", "AUC"))