資料前置處理
Fig-1: Feature Engineering
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(lattice)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008 9269
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
0 1 2 3 4 5 6
18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584 7
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 1 1 0 0 0 0 0
1113 2 1 0 0 0 0 1
1359 0 1 0 0 0 0 0
1823 0 1 0 1 1 0 0
2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 0.5000 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
1113 0.5000 0.2500 0.0000 0.0000 0.0000 0.0000 0.2500
1359 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000
1823 0.0000 0.3333 0.0000 0.3333 0.3333 0.0000 0.0000
2189 0.0000 0.0000 0.0000 0.5000 0.0000 0.0000 0.5000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
head(A)
cust W1 W2 W3 W4 W5 W6 W7 r s f m rev raw age
1 1069 0.5 0.5000 0.0 0.0000 0.0000 0.0 0.00 11 80 2 579.0 1158 129 K
2 1113 0.5 0.2500 0.0 0.0000 0.0000 0.0 0.25 26 81 4 557.5 2230 241 K
3 1359 0.0 1.0000 0.0 0.0000 0.0000 0.0 0.00 59 59 1 364.0 364 104 K
4 1823 0.0 0.3333 0.0 0.3333 0.3333 0.0 0.00 8 91 3 869.0 2607 498 K
5 2189 0.0 0.0000 0.0 0.5000 0.0000 0.0 0.50 29 61 2 7028.0 14056 3299 K
6 3667 0.0 0.0000 0.5 0.0000 0.0000 0.5 0.00 37 55 2 2379.5 4759 351 K
area amount buy
1 E 786 TRUE
2 F NA FALSE
3 G NA FALSE
4 D NA FALSE
5 B NA FALSE
6 G 1570 TRUE
A$raf = as.numeric(A$raw/A$f) # 淨利 / 購買次數(平均顧客淨利)
A$ram = as.numeric(A$raw/A$m) # 淨利 / 單次購買金額(平均訂單淨利)
A$cy = as.numeric((A$s-A$r)/A$f) # (第一次購買日期 - 最近一次購買日期) / 購買次數(顧客購買週期)
A$ref = as.numeric(A$rev/A$f) # 收益 / 購買次數(平均顧客收益)
A$fre = as.numeric(A$f*A$rev) # 購買次數 * 收益(共變項)
A$mre = as.numeric(A$m*A$rev) # 單次購買金額 * 收益(共變項)
A$rem = as.numeric(A$m/A$rev) # 單次購買金額 / 收益(顧客單次購買金額佔收益比)
A$mra = as.numeric(A$m/A$raw) # 單次購買金額 / 淨利(顧客單次購買金額佔淨利比)
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~.+area:f, TR[,c(9:11,16,18,20:25)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ . + area:f, family = binomial(), data = TR[,
c(9:11, 16, 18, 20:25)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.978 -0.867 -0.673 1.000 1.860
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.29205376481 0.22539368934 -1.30 0.1951
r -0.02142255255 0.00219016790 -9.78 < 2e-16 ***
s 0.01825034814 0.00220985649 8.26 < 2e-16 ***
f 0.24557995216 0.08790505476 2.79 0.0052 **
areaB -0.20533618463 0.25045435919 -0.82 0.4123
areaC -0.12405069153 0.19550444066 -0.63 0.5257
areaD 0.28516145271 0.20307364355 1.40 0.1603
areaE 0.37627087146 0.18235819074 2.06 0.0391 *
areaF 0.34495043028 0.18336909017 1.88 0.0599 .
areaG -0.10375954527 0.19647213135 -0.53 0.5974
areaH 0.01390019001 0.20804869921 0.07 0.9467
ram -0.30689590476 0.06299300669 -4.87 0.00000110527666415 ***
cy -0.03491463482 0.00512429840 -6.81 0.00000000000952229 ***
ref -0.00000638685 0.00002972687 -0.21 0.8299
fre -0.00000012591 0.00000106600 -0.12 0.9060
mre -0.00000000017 0.00000000366 -0.05 0.9630
rem -0.93440182279 0.11596968287 -8.06 0.00000000000000078 ***
f:areaB 0.07038326473 0.11142312634 0.63 0.5276
f:areaC -0.05082310025 0.08933945034 -0.57 0.5694
f:areaD -0.13953156762 0.09159848188 -1.52 0.1277
f:areaE -0.07377814639 0.08377766661 -0.88 0.3785
f:areaF -0.09144941054 0.08400650382 -1.09 0.2763
f:areaG 0.01617064270 0.08971419359 0.18 0.8570
f:areaH -0.09733576085 0.08734965748 -1.11 0.2651
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23182 on 19984 degrees of freedom
AIC: 23230
Number of Fisher Scoring iterations: 7
predglm = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = predglm > 0.5); cm
predict
actual FALSE TRUE
FALSE 3797 806
TRUE 1709 2264
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.6989
[1] 0.7067
colAUC(predglm, TS$buy) # AUC = 0.7551
[,1]
FALSE vs. TRUE 0.7579
# 嘗試歷程:
# Accuracy AUC
# 0.7009 0.7508 (area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev)
# 0.7013 0.7523 (.+area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev)
# 0.7007 0.7533 (.+ area:f + area:m + area:rev + area:raw + age:f + age:m + age:rev + age:raw + age:area + f:m + f:rev + m:rev + rf + rm)
# 0.7024 0.7548 (.+area:f + area:m)
# 0.6989 0.7551 (.有week)
# 0.7018 0.7553 (.+area:f + area:m + rm)
# 0.7000 0.7556 (.無week)
# 0.7022 0.7557 (.+area:f + area:m + rf + rm)
# 0.7008 0.7561 (. + rf + rm)
# 0.7016 0.7565 (. + rf + rm 無week)
# 0.7017 0.7568 (. + rf + rm + cy 無week)
# 0.7038 0.7575 (c(9:11,16,18:23))
# 0.7039 0.7575 (9:11,16,18,20:24)
# 0.7067 0.7579 (9:11,16,18,20:25)
rpart1 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class")
pred1 = predict(rpart1, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred1 > 0.5); cm
predict
actual FALSE TRUE
0 3730 873
1 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7066
[1] 0.7066
colAUC(pred1, TS$buy) # AUC = 0.6984
[,1]
0 vs. 1 0.6984
rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart2 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class",cp=0.001)
pred2 = predict(rpart2, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred2 > 0.5); cm
predict
actual FALSE TRUE
0 3869 734
1 1813 2160
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7030
[1] 0.703
colAUC(pred2, TS$buy) # AUC = 0.7152
[,1]
0 vs. 1 0.7152
rpart.plot(rpart2,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
set.seed(2018)
TR$buy = as.factor(TR$buy)
cv1 = train(
buy ~ ., data = TR[,c(9:11,16,18,20:25)], method = "rpart",
trControl = trainControl(method = "cv", number=10),
tuneGrid = expand.grid(cp = seq(0,0.002,0.00005))
)
cv1 # cp = 0.00075
CART
20008 samples
10 predictor
2 classes: '1', '2'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 18007, 18007, 18008, 18007, 18007, 18007, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000 0.6348 0.2628
0.00005 0.6400 0.2729
0.00010 0.6540 0.2992
0.00015 0.6666 0.3244
0.00020 0.6777 0.3460
0.00025 0.6828 0.3554
0.00030 0.6871 0.3628
0.00035 0.6916 0.3706
0.00040 0.6934 0.3738
0.00045 0.6934 0.3730
0.00050 0.6952 0.3758
0.00055 0.6970 0.3793
0.00060 0.6977 0.3809
0.00065 0.6978 0.3816
0.00070 0.6978 0.3816
0.00075 0.6980 0.3820
0.00080 0.6979 0.3818
0.00085 0.6978 0.3818
0.00090 0.6978 0.3818
0.00095 0.6978 0.3818
0.00100 0.6980 0.3823
0.00105 0.6980 0.3824
0.00110 0.6975 0.3814
0.00115 0.6975 0.3815
0.00120 0.6974 0.3816
0.00125 0.6977 0.3822
0.00130 0.6969 0.3811
0.00135 0.6969 0.3811
0.00140 0.6961 0.3799
0.00145 0.6961 0.3799
0.00150 0.6961 0.3799
0.00155 0.6961 0.3799
0.00160 0.6971 0.3823
0.00165 0.6971 0.3825
0.00170 0.6967 0.3815
0.00175 0.6969 0.3821
0.00180 0.6969 0.3821
0.00185 0.6969 0.3821
0.00190 0.6969 0.3821
0.00195 0.6969 0.3821
0.00200 0.6971 0.3829
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.00075.
plot(cv1, main = sprintf("optimal cp at %f", cv1$bestTune$cp))
rpart3 = rpart(buy ~ ., TR[,c(9:11,16,18,20:25)], method="class",cp=0.00075)
pred3 = predict(rpart3, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred3 > 0.5); cm
predict
actual FALSE TRUE
0 3841 762
1 1772 2201
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7045
[1] 0.7045
colAUC(pred3, TS$buy) # AUC = 0.7106
[,1]
0 vs. 1 0.7106
prp(rpart3)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart3,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(rpart)
library(rpart.plot)
rpart4 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class")
pred4 = predict(rpart4, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred4 > 0.5); cm
predict
actual FALSE TRUE
0 3730 873
1 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7066
[1] 0.7066
colAUC(pred4, TS$buy) # AUC = 0.6984
[,1]
0 vs. 1 0.6984
rpart.plot(rpart4,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart5 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.001)
pred5 = predict(rpart5, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred5 > 0.5); cm
predict
actual FALSE TRUE
0 3822 781
1 1748 2225
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7051
[1] 0.7051
colAUC(pred5, TS$buy) # AUC = 0.7169
[,1]
0 vs. 1 0.7169
rpart.plot(rpart5,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart6 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.00055)
pred6 = predict(rpart6, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred6 > 0.5); cm
predict
actual FALSE TRUE
0 3768 835
1 1739 2234
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.6999
[1] 0.6999
colAUC(pred6, TS$buy) # AUC = 0.7361
[,1]
0 vs. 1 0.7361
prp(rpart6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart6,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(caret)
library(e1071)
library(lattice)
set.seed(2018)
TR$buy = as.factor(TR$buy)
cv2 = train(
buy ~ ., data = TR[,c(9:16,18,21)], method = "rpart",
trControl = trainControl(method = "cv", number=10),
tuneGrid = expand.grid(cp = seq(0,0.002,0.00005))
)
cv2 # cp=0.0008
CART
20008 samples
9 predictor
2 classes: '1', '2'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 18007, 18007, 18008, 18007, 18007, 18007, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000 0.6317 0.2568
0.00005 0.6365 0.2658
0.00010 0.6515 0.2946
0.00015 0.6675 0.3250
0.00020 0.6763 0.3424
0.00025 0.6790 0.3476
0.00030 0.6813 0.3521
0.00035 0.6834 0.3562
0.00040 0.6900 0.3679
0.00045 0.6917 0.3704
0.00050 0.6910 0.3693
0.00055 0.6939 0.3745
0.00060 0.6950 0.3768
0.00065 0.6952 0.3771
0.00070 0.6954 0.3773
0.00075 0.6961 0.3783
0.00080 0.6976 0.3809
0.00085 0.6974 0.3806
0.00090 0.6974 0.3808
0.00095 0.6974 0.3809
0.00100 0.6972 0.3808
0.00105 0.6972 0.3809
0.00110 0.6971 0.3806
0.00115 0.6971 0.3806
0.00120 0.6970 0.3807
0.00125 0.6971 0.3812
0.00130 0.6968 0.3811
0.00135 0.6968 0.3813
0.00140 0.6964 0.3807
0.00145 0.6961 0.3801
0.00150 0.6961 0.3801
0.00155 0.6961 0.3801
0.00160 0.6968 0.3818
0.00165 0.6968 0.3818
0.00170 0.6968 0.3818
0.00175 0.6969 0.3821
0.00180 0.6969 0.3821
0.00185 0.6969 0.3821
0.00190 0.6969 0.3821
0.00195 0.6969 0.3821
0.00200 0.6971 0.3829
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0008.
plot(cv2, main = sprintf("optimal cp at %f", cv2$bestTune$cp))
rpart7 = rpart(buy ~ ., TR[,c(9:16,18,21)], method="class",cp=0.0008)
pred7 = predict(rpart7, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred7 > 0.5); cm
predict
actual FALSE TRUE
0 3879 724
1 1801 2172
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # Accuracy = 0.7056
[1] 0.7056
colAUC(pred7, TS$buy) # AUC = 0.7156
[,1]
0 vs. 1 0.7156
rpart.plot(rpart7,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
library(randomForest)
package 'randomForest' was built under R version 3.5.1randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:ggplot2':
margin
The following object is masked from 'package:dplyr':
combine
TR$buy = as.numeric(TR$buy)
TS$buy = as.numeric(TS$buy)
a_rf = randomForest(buy ~ ., data = TR[,c(9:16,18,21)] , nodesize = 25, ntree =200)
The response has five or fewer unique values. Are you sure you want to do regression?
PredictForest = predict(a_rf , newdata = TS)
table(TS$buy , PredictForest > 0.5) %>% {sum(diag(.))/sum(.)} # Accuracy = 0.5367
[1] 0.5367
colAUC(PredictForest,TS$buy) # AUC = 0.7487
[,1]
0 vs. 1 0.7481
A_km = subset(A[,c(9:14,18:25)])
library(caret)
preproc = preProcess(A_km)
A_kmN = predict(preproc, A_km)
summary(A_kmN)
r s f m
Min. :-1.194 Min. :-2.334 Min. :-0.553 Min. :-1.012
1st Qu.:-0.810 1st Qu.:-0.552 1st Qu.:-0.553 1st Qu.:-0.658
Median :-0.427 Median : 0.261 Median :-0.288 Median :-0.305
Mean : 0.000 Mean : 0.000 Mean : 0.000 Mean : 0.000
3rd Qu.: 0.801 3rd Qu.: 0.842 3rd Qu.: 0.241 3rd Qu.: 0.305
Max. : 2.297 Max. : 1.190 Max. :15.061 Max. : 9.695
rev raw buy raf
Min. :-0.747 Min. :-1.864 Mode :logical Min. :-4.633
1st Qu.:-0.573 1st Qu.:-0.562 FALSE:15342 1st Qu.:-0.607
Median :-0.316 Median :-0.325 TRUE :13242 Median :-0.296
Mean : 0.000 Mean : 0.000 Mean : 0.000
3rd Qu.: 0.198 3rd Qu.: 0.183 3rd Qu.: 0.265
Max. :26.779 Max. :24.271 Max. :13.658
ram cy ref fre
Min. :-4.360 Min. :-0.866 Min. :-1.012 Min. :-0.21
1st Qu.:-0.449 1st Qu.:-0.866 1st Qu.:-0.658 1st Qu.:-0.20
Median :-0.270 Median :-0.280 Median :-0.305 Median :-0.17
Mean : 0.000 Mean : 0.000 Mean : 0.000 Mean : 0.00
3rd Qu.: 0.169 3rd Qu.: 0.632 3rd Qu.: 0.305 3rd Qu.:-0.08
Max. :15.659 Max. : 3.983 Max. : 9.695 Max. :54.01
mre rem
Min. :-0.459 Min. :-1.664
1st Qu.:-0.431 1st Qu.:-1.004
Median :-0.331 Median :-0.296
Mean : 0.000 Mean : 0.000
3rd Qu.:-0.007 3rd Qu.: 1.119
Max. :22.465 Max. : 1.119
set.seed(2018)
kmc = kmeans(A_kmN,6, iter.max = 1000)
kg = kmc$cluster
table(kg)
kg
1 2 3 4 5 6
1022 10783 7732 5299 3225 523
par(cex=0.7,mar=c(3,3,1,1))
kmc$centers %>% round(2) %>% t %>% barplot(beside=T,col=rainbow(14))
legend('topleft',horiz=T,cex=0.7,legend=colnames(A_kmN),fill=rainbow(14))
rpart8 = rpart(buy ~ r+s+f+m+rev+raw+cy, A_kmN, method="class",cp=0.002)
pred8 = predict(rpart8, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred8 > 0.5); cm
predict
actual TRUE
0 4603
1 3973
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts
[1] 0.5367
colAUC(pred8, TS$buy) # Accuracy = 0.5367
[,1]
0 vs. 1 0.5
# AUC = 0.5
rpart.plot(rpart8,cex=0.6)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
A3 = A2
A3$rev_m = A3$rev/A3$m #新增「單一顧客總收益除以顧客單日消費平均收益」
A3$f_m = A3$f*A3$m # 新增「總次數乘以顧客單日消費平均收益」
summary(A3)
cust W1 W2 W3
Min. : 1069 Min. :0.0000 Min. :0.000 Min. :0.000
1st Qu.: 959997 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000
Median : 1622622 Median :0.0625 Median :0.000 Median :0.000
Mean : 1416974 Mean :0.2184 Mean :0.141 Mean :0.123
3rd Qu.: 1906352 3rd Qu.:0.3333 3rd Qu.:0.222 3rd Qu.:0.200
Max. :20002000 Max. :1.0000 Max. :1.000 Max. :1.000
W4 W5 W6 W7
Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
Median :0.000 Median :0.000 Median :0.000 Median :0.000
Mean :0.107 Mean :0.123 Mean :0.117 Mean :0.171
3rd Qu.:0.167 3rd Qu.:0.200 3rd Qu.:0.182 3rd Qu.:0.250
Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000
r s f m
Min. : 1.0 Min. : 1.0 Min. : 1.00 Min. :0.903
1st Qu.: 9.0 1st Qu.:57.0 1st Qu.: 2.00 1st Qu.:2.567
Median :15.0 Median :75.0 Median : 3.00 Median :2.834
Mean :24.8 Mean :67.7 Mean : 4.52 Mean :2.806
3rd Qu.:36.0 3rd Qu.:86.0 3rd Qu.: 5.00 3rd Qu.:3.079
Max. :92.0 Max. :92.0 Max. :60.00 Max. :4.027
rev raw age area
Min. :0.903 Min. : -742 D :2714 E :5394
1st Qu.:2.979 1st Qu.: 104 C :2307 F :3880
Median :3.357 Median : 316 E :2129 G :1133
Mean :3.294 Mean : 561 F :1556 C :1043
3rd Qu.:3.676 3rd Qu.: 728 B :1217 D : 670
Max. :4.998 Max. :15565 G : 906 H : 554
(Other):2413 (Other): 568
amount buy raf ram
Min. :0.903 Mode:logical Min. :-742.0 Min. :-2.290
1st Qu.:2.657 TRUE:13242 1st Qu.: 41.7 1st Qu.: 0.186
Median :2.997 Median : 95.8 Median : 0.408
Mean :2.943 Mean : 144.1 Mean : 0.623
3rd Qu.:3.291 3rd Qu.: 189.6 3rd Qu.: 0.826
Max. :4.449 Max. :2555.0 Max. :10.173
cy ref fre mre
Min. : 0.00 Min. : 8 Min. : 8 Min. : 64
1st Qu.: 2.00 1st Qu.: 369 1st Qu.: 1664 1st Qu.: 404572
Median : 9.33 Median : 682 Median : 6709 Median : 1637361
Mean :10.01 Mean : 938 Mean : 31504 Mean : 5056656
3rd Qu.:15.00 3rd Qu.: 1200 3rd Qu.: 22980 3rd Qu.: 5289058
Max. :44.00 Max. :10634 Max. :4581462 Max. :215642661
rem mra rev_m f_m
Min. :0.0167 Min. :-2516.0 Min. :1.00 Min. : 0.90
1st Qu.:0.2000 1st Qu.: 0.9 1st Qu.:1.09 1st Qu.: 4.52
Median :0.3333 Median : 2.0 Median :1.17 Median : 8.63
Mean :0.4448 Mean : Inf Mean :1.18 Mean : 12.57
3rd Qu.:0.5000 3rd Qu.: 4.3 3rd Qu.:1.27 3rd Qu.: 15.56
Max. :1.0000 Max. : Inf Max. :1.84 Max. :169.24
TR3 = subset(A3, spl2) # Training data:9269筆
TS3 = subset(A3, !spl2) # Testing data:3973筆
summary(TR3)
cust W1 W2 W3
Min. : 3667 Min. :0.0000 Min. :0.000 Min. :0.000
1st Qu.: 958752 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000
Median :1620030 Median :0.0625 Median :0.000 Median :0.000
Mean :1415972 Mean :0.2186 Mean :0.141 Mean :0.119
3rd Qu.:1909555 3rd Qu.:0.3333 3rd Qu.:0.222 3rd Qu.:0.188
Max. :2173832 Max. :1.0000 Max. :1.000 Max. :1.000
W4 W5 W6 W7
Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
Median :0.000 Median :0.000 Median :0.000 Median :0.000
Mean :0.108 Mean :0.123 Mean :0.118 Mean :0.172
3rd Qu.:0.167 3rd Qu.:0.200 3rd Qu.:0.182 3rd Qu.:0.250
Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000
r s f m rev
Min. : 1 Min. : 1.0 Min. : 1.00 Min. :1.08 Min. :1.08
1st Qu.: 9 1st Qu.:56.0 1st Qu.: 2.00 1st Qu.:2.56 1st Qu.:2.97
Median :15 Median :75.0 Median : 3.00 Median :2.83 Median :3.35
Mean :25 Mean :67.5 Mean : 4.52 Mean :2.80 Mean :3.29
3rd Qu.:37 3rd Qu.:86.0 3rd Qu.: 5.00 3rd Qu.:3.08 3rd Qu.:3.67
Max. :92 Max. :92.0 Max. :60.00 Max. :4.03 Max. :4.95
raw age area amount
Min. : -728 D :1899 E :3784 Min. :1.00
1st Qu.: 101 C :1643 F :2696 1st Qu.:2.66
Median : 310 E :1514 G : 820 Median :3.00
Mean : 559 F :1055 C : 715 Mean :2.94
3rd Qu.: 722 B : 837 D : 470 3rd Qu.:3.29
Max. :14701 G : 632 H : 407 Max. :4.42
(Other):1689 (Other): 377
buy raf ram cy
Mode:logical Min. :-354.0 Min. :-2.290 Min. : 0.00
TRUE:9269 1st Qu.: 41.0 1st Qu.: 0.184 1st Qu.: 1.65
Median : 94.5 Median : 0.403 Median : 9.17
Mean : 143.7 Mean : 0.622 Mean : 9.92
3rd Qu.: 186.2 3rd Qu.: 0.814 3rd Qu.:15.00
Max. :2555.0 Max. :10.173 Max. :44.00
ref fre mre rem
Min. : 12 Min. : 12 Min. : 144 Min. :0.0167
1st Qu.: 364 1st Qu.: 1646 1st Qu.: 392502 1st Qu.:0.2000
Median : 675 Median : 6576 Median : 1602756 Median :0.3333
Mean : 936 Mean : 31589 Mean : 5030342 Mean :0.4479
3rd Qu.: 1195 3rd Qu.: 22913 3rd Qu.: 5257849 3rd Qu.:0.5000
Max. :10634 Max. :3634363 Max. :191648328 Max. :1.0000
mra rev_m f_m
Min. :-2516.0 Min. :1.00 Min. : 1.08
1st Qu.: 0.9 1st Qu.:1.09 1st Qu.: 4.41
Median : 2.1 Median :1.17 Median : 8.55
Mean : Inf Mean :1.18 Mean : 12.56
3rd Qu.: 4.4 3rd Qu.:1.27 3rd Qu.: 15.55
Max. : Inf Max. :1.72 Max. :169.22
lm2 = lm(amount ~ ., TR3[,c(9:14,17,19:25,27:28)])
summary(lm2) # Training R^2 = 0.2994
Call:
lm(formula = amount ~ ., data = TR3[, c(9:14, 17, 19:25, 27:28)])
Residuals:
Min 1Q Median 3Q Max
-1.8720 -0.2263 0.0514 0.2791 1.3464
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.354156232961 0.501144838958 10.68 < 2e-16 ***
r -0.000848477070 0.000721887362 -1.18 0.23988
s 0.001015467684 0.000724657768 1.40 0.16116
f 0.084756891634 0.023370908134 3.63 0.00029 ***
m -1.269416863162 0.317698048946 -4.00 0.000065011440684 ***
rev 1.634445231930 0.311630178829 5.24 0.000000159890632 ***
raw 0.000018224585 0.000025302538 0.72 0.47138
raf 0.000090563991 0.000076153247 1.19 0.23438
ram 0.033613598712 0.016500043605 2.04 0.04166 *
cy -0.002503062234 0.001900224925 -1.32 0.18779
ref 0.000028446775 0.000021449853 1.33 0.18481
fre 0.000000068875 0.000000132307 0.52 0.60268
mre 0.000000000287 0.000000001308 0.22 0.82607
rem 0.179298396859 0.125286072921 1.43 0.15243
rev_m -3.774773404485 0.497117791204 -7.59 0.000000000000034 ***
f_m -0.027544949727 0.009872730976 -2.79 0.00528 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.419 on 9253 degrees of freedom
Multiple R-squared: 0.299, Adjusted R-squared: 0.298
F-statistic: 264 on 15 and 9253 DF, p-value: <2e-16
predlm = predict(lm2, TS3, type="response")
SSE = sum((TS3$amount - predlm)^2)
SST = sum((TS3$amount - mean(TS3$amount))^2)
RSquarelm = 1 - SSE/SST
RSquarelm # Testing R^2 = 0.2708
[1] 0.2708
rpart9 = rpart(amount ~ ., TR3[,c(9:14,17,19:25,27:28)], cp=0.002)
SST = sum((TS3$amount - mean(TR3$amount))^ 2)
SSE = sum((predict(rpart9, TS3) - TS3$amount)^2)
RSquarerp = 1 - (SSE/SST)
RSquarerp # Testing R^2 = 0.2446
[1] 0.2446
px = cbind(glm=predglm, cart1=pred1, cart2=pred2, cart3=pred3, cart4 = pred4, cart5 = pred5, cart6 = pred6, cart7 = pred7, rf = PredictForest)
apply(px, 2, function(x) {
table(TS$buy, x > 0.5) %>% {sum(diag(.))/sum(.)}
}) %>% sort
rf cart6 cart2 cart3 cart5 cart7 cart1 cart4 glm
0.5367 0.6999 0.7030 0.7045 0.7051 0.7056 0.7066 0.7066 0.7067
par(cex=0.9,mar=c(6,6,5,3))
colAUC(px, TS$buy, T)
glm cart1 cart2 cart3 cart4 cart5 cart6 cart7 rf
0 vs. 1 0.7579 0.6984 0.7152 0.7106 0.6984 0.7169 0.7361 0.7156 0.7481
cor(px)
glm cart1 cart2 cart3 cart4 cart5 cart6 cart7 rf
glm 1.0000 0.8745 0.9288 0.9215 0.8745 0.9300 0.9305 0.9253 0.9307
cart1 0.8745 1.0000 0.9147 0.9092 1.0000 0.9186 0.8446 0.9144 0.8051
cart2 0.9288 0.9147 1.0000 0.9920 0.9147 0.9861 0.9122 0.9813 0.8663
cart3 0.9215 0.9092 0.9920 1.0000 0.9092 0.9801 0.9077 0.9758 0.8618
cart4 0.8745 1.0000 0.9147 0.9092 1.0000 0.9186 0.8446 0.9144 0.8051
cart5 0.9300 0.9186 0.9861 0.9801 0.9186 1.0000 0.9241 0.9942 0.8704
cart6 0.9305 0.8446 0.9122 0.9077 0.8446 0.9241 1.0000 0.9294 0.8886
cart7 0.9253 0.9144 0.9813 0.9758 0.9144 0.9942 0.9294 1.0000 0.8680
rf 0.9307 0.8051 0.8663 0.8618 0.8051 0.8704 0.8886 0.8680 1.0000
glm_cart = (px[,"glm"] + px[,"cart6"])/2
glm_rf = (px[,"glm"] + px[,"rf"])/2
px2 = cbind(px, glm_cart, glm_rf)
rbind(apply(px2, 2, function(x) {
table(TS$buy, x > 0.5) %>% {sum(diag(.))/sum(.)} }),
colAUC(px2, TS$buy)) %>% t %>%
data.frame %>% setNames(c("Accuracy", "AUC"))
Accuracy AUC
glm 0.7067 0.7579
cart1 0.7066 0.6984
cart2 0.7030 0.7152
cart3 0.7045 0.7106
cart4 0.7066 0.6984
cart5 0.7051 0.7169
cart6 0.6999 0.7361
cart7 0.7056 0.7156
rf 0.5367 0.7481
glm_cart 0.7007 0.7561
glm_rf 0.5367 0.7569
px3 = cbind(lm=RSquarelm, cart9=RSquarerp)
rbind(px3) %>% data.frame %>% setNames(c("lm","cart"))
lm cart
1 0.2708 0.2446