Sys.setlocale("LC_ALL","C")
[1] "C"
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
X$CHR=as.numeric(X$date)
X$CHR1=X$CHR>=11309 & X$CHR<=11315
X$CHR2=as.numeric(X$CHR1)
# table(X$cust) %>% sort %>% tail
maxWeekday = function(x){
y = table(x) %>% t %>% as.matrix
result = which.max(y) %>%
colnames(y)[.] %>%
as.Date %>%
weekdays
return(result)
}
d0 = max(X$date)
NewVar = group_by(X, cust) %>% summarise(
prodPrice = mean(total / pieces), # 商品平均單價
weekday = maxWeekday(date), # 顧客最常禮拜幾來買
CHR = sum(CHR2)>0, # chistmas
k = (1 + as.integer(difftime(max(date), min(date), units="days"))) / n(), # 個人平均購買週期
areaEFH = ((first(area)=="E") | (first(area)=="F") | (first(area)=="H")), # 是否住在EFH區域
areaACDEF = ((first(area)=="A") | (first(area)=="C") | (first(area)=="D") | (first(area)=="E") | (first(area)=="F")),# 是否住在ACDEF區域
mon1 = sum(date<="2000-11-30"), # 顧客第一個月來購買的次數
mon2 = sum(date<="2000-12-31" & date>"2000-11-30"), #第二個月
mon3 = sum(date<="2001-01-31" & date>"2000-12-31") #及第三個月來購買的次數
) %>% data.frame # 28584
A = merge(A, NewVar, all.x = T)
A$weekday = factor(A$weekday
, ordered=TRUE
, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
A$ageGroup = sapply(seq(1,nrow(A),1), function(x){
if((A[x, "age"] == "A") | (A[x, "age"] == "B") | (A[x, "age"] == "C"))
A$ageGroup = 1 #35歲以下者
else if((A[x, "age"] == "D") | (A[x, "age"] == "E") | (A[x, "age"] == "F"))
A$ageGroup = 2 #35~50歲
else
A$ageGroup = 3 #50歲以上者
})
A$ageGroup = as.factor(A$ageGroup)
X = X %>% mutate(wday = format(date, "%w"))
# table(X$wday)
# 製作顧客星期矩陣
mx = xtabs(~ cust + wday, X)
# dim(mx)
# mx[1:5,]
mx = mx / rowSums(mx)
# mx[1:5,]
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
顧客偏向週末或平日消費?
A$weekend = sapply(seq(1,nrow(A),1), function(x){
w15 = A[x,c("W1","W2","W3","W4","W5")] %>% sum
w67 = A[x,c("W6","W7")] %>% sum
if(w67>w15)
A$weekend = 1 #偏好週末消費
else if(w67==w15)
A$weekend = 2 #無特別偏好
else
A$weekend = 3 #偏好平日者
})
A$weekend = as.factor(A$weekend)
library(Matrix)
library(slam)
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
cpm = xtabs(~ cust + prod, Z, sparse=T)
刪去購買次數小於10的產品
table(colSums(cpm) < 10) # 12862
FALSE TRUE
9652 12862
cpm = cpm[, colSums(cpm) > 10]
cpm = cpm[, order(-colSums(cpm))] # order product by frequency
dim(cpm) # 28584 9149
[1] 28584 9149
是否有購買前5大、10大、20大熱銷產品
hot5 = cpm[,1:5] %>% rowSums
A$hot5 = hot5>0
hot10 = cpm[,1:10] %>% rowSums
A$hot10 = hot10>0
hot20 = cpm[,1:20] %>% rowSums
A$hot20 = hot20>0
確認cpm和A的顧客順序及大小是否有相同
cpm = cpm %>% as.matrix
(A$cust == rownames(cpm)) %>% sum
[1] 28584
合併兩個資料並將欄位名稱變更
A = cbind(A,cpm[,1:20])
colnames(A)[33:52] = sapply(seq(33,52,1), function(x){
names = paste0("p",colnames(A)[x])
})
summary(A)
cust W1 W2
Min. : 1069 Min. :0.0000 Min. :0.0000
1st Qu.: 1060898 1st Qu.:0.0000 1st Qu.:0.0000
Median : 1654100 Median :0.0000 Median :0.0000
Mean : 1461070 Mean :0.2310 Mean :0.1399
3rd Qu.: 1945003 3rd Qu.:0.3484 3rd Qu.:0.2000
Max. :20002000 Max. :1.0000 Max. :1.0000
W3 W4 W5 W6
Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
Mean :0.1176 Mean :0.103 Mean :0.1188 Mean :0.1136
3rd Qu.:0.1333 3rd Qu.:0.000 3rd Qu.:0.1354 3rd Qu.:0.1111
Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
W7 r s f
Min. :0.0000 Min. : 1.00 Min. : 1.00 Min. : 1.000
1st Qu.:0.0000 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
Median :0.0000 Median :21.00 Median :68.00 Median : 2.000
Mean :0.1762 Mean :32.12 Mean :61.27 Mean : 3.089
3rd Qu.:0.2500 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
Max. :1.0000 Max. :92.00 Max. :92.00 Max. :60.000
m rev raw age
Min. : 8.0 Min. : 8 Min. : -742.0 D :5832
1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 C :5238
Median : 709.5 Median : 1566 Median : 218.0 E :4514
Mean : 1012.4 Mean : 2711 Mean : 420.8 F :3308
3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0 B :2802
Max. :10634.0 Max. :99597 Max. :15565.0 G :1940
(Other):4950
area amount buy prodPrice
E :9907 Min. : 8 Mode :logical Min. : 5.00
F :7798 1st Qu.: 454 FALSE:15342 1st Qu.: 63.36
C :3169 Median : 993 TRUE :13242 Median : 86.44
G :3052 Mean : 1499 Mean : 115.97
D :1778 3rd Qu.: 1955 3rd Qu.: 123.20
H :1295 Max. :28089 Max. :3939.00
(Other):1585 NA's :15342
weekday CHR k areaEFH
Sunday :6761 Mode :logical Min. : 1.000 Mode :logical
Monday :3463 FALSE:27066 1st Qu.: 1.000 FALSE:9584
Tuesday :2900 TRUE :1518 Median : 5.571 TRUE :19000
Wednesday:2727 Mean : 8.550
Thursday :3878 3rd Qu.:14.000
Friday :3617 Max. :45.000
Saturday :5238
areaACDEF mon1 mon2 mon3
Mode :logical Min. : 0.000 Min. : 0.0000 Min. : 0.000
FALSE:5130 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
TRUE :23454 Median : 1.000 Median : 1.0000 Median : 1.000
Mean : 1.112 Mean : 0.9339 Mean : 1.043
3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 1.000
Max. :26.000 Max. :21.0000 Max. :25.000
ageGroup weekend hot5 hot10 hot20
1: 9445 1: 5337 Mode :logical Mode :logical Mode :logical
2:13654 2: 3205 FALSE:19029 FALSE:16425 FALSE:13336
3: 5485 3:20042 TRUE :9555 TRUE :12159 TRUE :15248
p4711271000014 p4714981010038 p4719090900065
Min. :0.0000 Min. : 0.0000 Min. :0.00000
1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000
Median :0.0000 Median : 0.0000 Median :0.00000
Mean :0.1837 Mean : 0.1631 Mean :0.07679
3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.:0.00000
Max. :5.0000 Max. :16.0000 Max. :6.00000
p4711080010112 p4710908131589 p4713985863121
Min. : 0.00000 Min. :0.0000 Min. :0.000
1st Qu.: 0.00000 1st Qu.:0.0000 1st Qu.:0.000
Median : 0.00000 Median :0.0000 Median :0.000
Mean : 0.06014 Mean :0.0543 Mean :0.053
3rd Qu.: 0.00000 3rd Qu.:0.0000 3rd Qu.:0.000
Max. :21.00000 Max. :8.0000 Max. :9.000
p4710114128038 p4710421090059 p4710291112172
Min. :0.00000 Min. : 0.0000 Min. :0.0000
1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000
Median :0.00000 Median : 0.0000 Median :0.0000
Mean :0.05153 Mean : 0.0515 Mean :0.0508
3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000
Max. :5.00000 Max. :13.0000 Max. :5.0000
p4712425010712 p4710583996008 p4710011401128
Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.05076 Mean :0.04894 Mean :0.04807
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :5.00000 Max. :7.00000 Max. :8.00000
p4710088410139 p4719090900058 p4710094097768
Min. :0.00000 Min. : 0.00000 Min. : 0.00000
1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000
Median :0.00000 Median : 0.00000 Median : 0.00000
Mean :0.04803 Mean : 0.04628 Mean : 0.04429
3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. :5.00000 Max. :13.00000 Max. :17.00000
p4710088410610 p4710085120628 p4710265849066
Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.04394 Mean :0.04121 Mean :0.04058
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :6.00000 Max. :7.00000 Max. :4.00000
p4710018004605 p4710908131534
Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000
Median :0.00000 Median :0.00000
Mean :0.03985 Mean :0.03778
3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :6.00000 Max. :5.00000
A$buy = factor(ifelse(A$buy, "yes", "no")) # comply to the rule of caret
TR = A[spl, ]
TS = A[!spl, ]
# 把"cust"及"amount"欄位拿掉
TR1 = subset(TR, select = -c(cust, amount))
TS1 = subset(TS, select = -c(cust, amount))
#做平行運算
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
ctrl = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final", classProbs=TRUE,
summaryFunction=twoClassSummary)
rpart(), Classification Treectrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
buy ~ ., data=TR1, method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid(.cp = seq(0.0002,0.001,0.0001)))
Warning in .Internal(gc(verbose, reset)) :
closing unused connection 6 (<-LAPTOP-3J628RQI:11576)
Warning in .Internal(gc(verbose, reset)) :
closing unused connection 5 (<-LAPTOP-3J628RQI:11576)
Warning in .Internal(gc(verbose, reset)) :
closing unused connection 4 (<-LAPTOP-3J628RQI:11576)
Warning in .Internal(gc(verbose, reset)) :
closing unused connection 3 (<-LAPTOP-3J628RQI:11576)
Sys.time() - t0
Time difference of 37.65453 secs
plot(cv.rpart);cv.rpart
CART
20008 samples
49 predictor
2 classes: 'no', 'yes'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 2 times)
Summary of sample sizes: 18007, 18007, 18007, 18007, 18007, 18007, ...
Resampling results across tuning parameters:
cp ROC Sens Spec
2e-04 0.6945636 0.7386622 0.5807015
3e-04 0.7063938 0.7728375 0.5760073
4e-04 0.7161350 0.7787498 0.5792441
5e-04 0.7234962 0.7929951 0.5694282
6e-04 0.7256986 0.7994671 0.5675938
7e-04 0.7275238 0.8028204 0.5660290
8e-04 0.7258432 0.8104117 0.5597687
9e-04 0.7236823 0.8129257 0.5577191
1e-03 0.7224932 0.8109231 0.5605789
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 7e-04.
#Create a CART model
rpart1 = rpart(buy ~ ., TR1, method="class", cp=0.0008)
PredictCART1 = predict(rpart1, TS, type="prob")[,2]
colAUC(PredictCART1,TS$buy,T) #0.7433228
[,1]
no vs. yes 0.7433228
# CART model
rpart2 = rpart(buy ~ ., data = TR1, method="class")
# Make predictions
PredictCART2 = predict(rpart2, newdata = TS1, type = "prob")[,2]
colAUC(PredictCART2, TS$buy,T) #0.6983998
[,1]
no vs. yes 0.6983998
glm2 = glm(buy ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -age -areaEFH -areaACDEF -weekday -weekend -hot20 -hot10 -hot5 -ageGroup - CHR -mon1 -mon2 -mon3 , TR1[,1:30], family=binomial())
summary(glm2)
Call:
glm(formula = buy ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 - age -
areaEFH - areaACDEF - weekday - weekend - hot20 - hot10 -
hot5 - ageGroup - CHR - mon1 - mon2 - mon3, family = binomial(),
data = TR1[, 1:30])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.5126 -0.8816 -0.7090 1.0349 2.3789
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.034e+00 1.083e-01 -9.551 < 2e-16 ***
r -2.108e-02 2.319e-03 -9.091 < 2e-16 ***
s 1.821e-02 2.341e-03 7.776 7.47e-15 ***
f 2.252e-01 2.215e-02 10.170 < 2e-16 ***
m -3.110e-06 2.780e-05 -0.112 0.91093
rev 4.874e-05 1.958e-05 2.489 0.01281 *
raw -2.850e-04 8.663e-05 -3.290 0.00100 **
areaB -3.508e-02 1.321e-01 -0.266 0.79059
areaC -2.072e-01 1.044e-01 -1.985 0.04716 *
areaD 3.703e-02 1.109e-01 0.334 0.73850
areaE 2.606e-01 9.672e-02 2.695 0.00705 **
areaF 1.773e-01 9.738e-02 1.821 0.06859 .
areaG -4.751e-02 1.043e-01 -0.456 0.64869
areaH -1.878e-01 1.215e-01 -1.545 0.12234
prodPrice -7.209e-04 1.491e-04 -4.836 1.32e-06 ***
k -2.034e-02 5.019e-03 -4.054 5.04e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23270 on 19992 degrees of freedom
AIC: 23302
Number of Fisher Scoring iterations: 6
predGLM = predict(glm2, TS1, type="response")
colAUC(predGLM, TS$buy,T) #0.7578518
[,1]
no vs. yes 0.7578518
TS$buy %>% table
.
no yes
4603 3973
cm = table(actual = TS1$buy, predict = predGLM > 0.5); cm
predict
actual FALSE TRUE
no 3822 781
yes 1759 2214
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts
[1] 0.7038246
A$f_group = ifelse(A$f == 1,"1",
ifelse(A$f == 2,"2",
ifelse(A$f == 3,"3",
ifelse(A$f == 4,"4",">=5"))))
A <- A %>% mutate(r_group = ntile(r, 5))
#將f_group和r_group轉為factor
A[,c("f_group","r_group")] <- lapply(A[,c("f_group","r_group")], factor)
#A$buy = factor(ifelse(A$buy, "yes", "no")) # comply to the rule of caret
TR_inter = A[spl, ]
TS_inter = A[!spl, ]
# 把"cust"及"amount"欄位拿掉
TR1_inter = subset(TR_inter, select = -c(cust, amount))
TS1_inter = subset(TS_inter, select = -c(cust, amount))
glm2 = glm(buy ~ s+m+rev+k+prodPrice+raw+area+f_group:r_group, TR1_inter, family=binomial())
# glm2 = glm(buy ~ area, TR1, family = binomial)
# area E,F,H 很顯著
summary(glm2)
Call:
glm(formula = buy ~ s + m + rev + k + prodPrice + raw + area +
f_group:r_group, family = binomial(), data = TR1_inter)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4786 -0.8656 -0.6795 1.0043 2.5030
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.481e+00 4.428e-01 -3.344 0.000827 ***
s 8.204e-03 2.160e-03 3.798 0.000146 ***
m -6.267e-05 2.595e-05 -2.415 0.015729 *
rev 9.694e-05 1.803e-05 5.376 7.61e-08 ***
k -1.718e-02 5.365e-03 -3.203 0.001361 **
prodPrice -7.830e-04 1.529e-04 -5.122 3.02e-07 ***
raw -3.517e-04 8.547e-05 -4.115 3.87e-05 ***
areaB -4.696e-02 1.332e-01 -0.353 0.724401
areaC -2.152e-01 1.053e-01 -2.045 0.040885 *
areaD 2.457e-02 1.119e-01 0.220 0.826238
areaE 2.527e-01 9.749e-02 2.592 0.009541 **
areaF 1.601e-01 9.820e-02 1.631 0.102992
areaG -5.046e-02 1.052e-01 -0.480 0.631382
areaH -1.473e-01 1.212e-01 -1.216 0.224102
f_group1:r_group1 5.614e-01 4.383e-01 1.281 0.200248
f_group2:r_group1 1.095e+00 4.401e-01 2.488 0.012840 *
f_group3:r_group1 1.508e+00 4.282e-01 3.522 0.000428 ***
f_group4:r_group1 1.881e+00 4.230e-01 4.448 8.67e-06 ***
f_group>=5:r_group1 2.662e+00 4.078e-01 6.527 6.70e-11 ***
f_group1:r_group2 4.779e-01 4.308e-01 1.109 0.267317
f_group2:r_group2 1.101e+00 4.334e-01 2.540 0.011074 *
f_group3:r_group2 1.481e+00 4.220e-01 3.509 0.000450 ***
f_group4:r_group2 1.728e+00 4.203e-01 4.111 3.95e-05 ***
f_group>=5:r_group2 2.423e+00 4.103e-01 5.905 3.52e-09 ***
f_group1:r_group3 2.899e-01 4.216e-01 0.688 0.491614
f_group2:r_group3 1.050e+00 4.240e-01 2.476 0.013269 *
f_group3:r_group3 1.429e+00 4.176e-01 3.422 0.000622 ***
f_group4:r_group3 1.541e+00 4.203e-01 3.666 0.000246 ***
f_group>=5:r_group3 2.015e+00 4.128e-01 4.881 1.05e-06 ***
f_group1:r_group4 9.406e-02 4.092e-01 0.230 0.818212
f_group2:r_group4 8.120e-01 4.111e-01 1.975 0.048219 *
f_group3:r_group4 1.180e+00 4.136e-01 2.853 0.004332 **
f_group4:r_group4 1.284e+00 4.260e-01 3.015 0.002571 **
f_group>=5:r_group4 1.533e+00 4.275e-01 3.586 0.000335 ***
f_group1:r_group5 -2.060e-01 4.043e-01 -0.510 0.610385
f_group2:r_group5 3.573e-01 4.093e-01 0.873 0.382713
f_group3:r_group5 4.309e-01 4.295e-01 1.003 0.315796
f_group4:r_group5 1.519e-01 5.224e-01 0.291 0.771228
f_group>=5:r_group5 NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23270 on 19970 degrees of freedom
AIC: 23346
Number of Fisher Scoring iterations: 4
pred = predict(glm2, TS1_inter, type="response")
prediction from a rank-deficient fit may be misleading
colAUC(pred, TS$buy) # 0.7466507
[,1]
no vs. yes 0.7577671
TS$buy %>% table
.
no yes
4603 3973
cm = table(actual = TS1_inter$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
no 3794 809
yes 1701 2272
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts
[1] 0.7073228
#刪除f_group,r_group 避免影響後面
A$f_group <- NULL
A$r_group <- NULL
RFM分群參考:
集群分析的模型與預測方法參考:
#colnames(TR1)
LTR = TR1[,c(8:13,17,20)]
LTS = TS1[,c(8:13,17,20)]
#LTR = TR1[,c(1:13,16,18:24)]
#LTS = TS1[,c(1:13,16,18:24)]
#LTR = TR1[,c(2:13,16,18:24,27:49)]
#LTS = TS1[,c(2:13,16,18:24,27:49)]
library(caret)
preproc = preProcess(LTR)
NTR = predict(preproc, LTR)
NTS = predict(preproc, LTS)
#summary(NTR)
set.seed(144)
km <- kmeans(NTR, 7)
table(km$cluster)
1 2 3 4 5 6 7
303 2251 4159 1354 6052 5701 188
NTR$group = km$cluster
df = group_by(NTR, group) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
total_revenue = sum(rev),
group_size = n(),
avg_recency = mean(r)
#avg_hot5 = mean(hot5),
#avg_hot10 = mean(hot10),
#avg_hot20 = mean(hot20)
#profit = sum(raw)
) %>% ungroup %>%
mutate(
#pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
#pc_profit = round(100*profit/sum(profit),1), # % profit contrib.
dummy = 2001 # to fit googleViz's data format
) %>% data.frame
plot( gvisMotionChart(
df, "group", "dummy",
options=list(width=800, height=600)
))
library(flexclust)
km.kcca = as.kcca(km, NTR[,1:8]) #NTR不取group
CTR = predict(km.kcca)
CTS = predict(km.kcca, newdata=NTS)
#colnames(TR1)
M = lapply(split(TR1[,c(8:13,17,20,16)], CTR), function(x)
glm(buy~., data=x, family=binomial) )
#sapply(M, coef)
#glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
Pred = lapply(1:7, function(i)
predict(M[[i]], TS1[CTS==i,], type='response') )
sapply(1:7, function(i)
colAUC(Pred[[i]], TS1$buy[CTS==i]))
[1] 0.8928571 0.7162432 0.6366921 0.6616960 0.6222446 0.6843932
[7] 0.6673993
a = cbind(TS1$buy[CTS==1],Pred[[1]])
b = cbind(TS1$buy[CTS==2],Pred[[2]])
c = cbind(TS1$buy[CTS==3],Pred[[3]])
d = cbind(TS1$buy[CTS==4],Pred[[4]])
e = cbind(TS1$buy[CTS==5],Pred[[5]])
f = cbind(TS1$buy[CTS==6],Pred[[6]])
g = cbind(TS1$buy[CTS==7],Pred[[7]])
CF = rbind(a,b,c,d,e,f,g)
colAUC(CF[,2],CF[,1])
[,1]
1 vs. 2 0.7564442
# GLM + CART1
result = (predGLM + PredictCART1)/2
colAUC(result, TS$buy,T) #0.7584242
[,1]
no vs. yes 0.7584242
# GLM + CART2
result = (predGLM + PredictCART2)/2
colAUC(result, TS$buy,T) # 0.7584827 higher
[,1]
no vs. yes 0.7584827
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2) %>% subset(., select = -c(cust, buy))
TS2 = subset(A2, !spl2) %>% subset(., select = -c(cust, buy))
lm1 = lm(amount ~ . -W7 -W1 -W2 -W3 -W4 -W5 -W6 -areaACDEF-area -weekday -hot5-hot10-hot20-ageGroup-mon1-mon2 ,TR2)
summary(lm1)
Call:
lm(formula = amount ~ . - W7 - W1 - W2 - W3 - W4 - W5 - W6 -
areaACDEF - area - weekday - hot5 - hot10 - hot20 - ageGroup -
mon1 - mon2, data = TR2)
Residuals:
Min 1Q Median 3Q Max
-1.8497 -0.2264 0.0478 0.2769 1.5540
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.335e+00 4.704e-02 28.376 < 2e-16 ***
r -1.546e-03 5.262e-04 -2.937 0.00332 **
s 2.038e-03 5.155e-04 3.954 7.73e-05 ***
f 1.170e-02 2.705e-03 4.325 1.54e-05 ***
m 5.367e-01 4.345e-02 12.350 < 2e-16 ***
rev -3.019e-02 4.133e-02 -0.731 0.46505
raw 6.186e-05 8.795e-06 7.033 2.16e-12 ***
ageB 7.682e-02 2.492e-02 3.082 0.00206 **
ageC 1.194e-01 2.289e-02 5.218 1.85e-07 ***
ageD 1.262e-01 2.259e-02 5.588 2.37e-08 ***
ageE 1.321e-01 2.312e-02 5.712 1.15e-08 ***
ageF 1.054e-01 2.416e-02 4.365 1.29e-05 ***
ageG 8.057e-02 2.635e-02 3.058 0.00223 **
ageH 7.178e-02 3.106e-02 2.311 0.02083 *
ageI 6.980e-02 3.196e-02 2.184 0.02899 *
ageJ -1.970e-02 2.806e-02 -0.702 0.48279
ageK 1.130e-01 3.768e-02 2.999 0.00272 **
prodPrice -3.198e-04 5.026e-05 -6.364 2.07e-10 ***
CHRTRUE -4.254e-02 1.746e-02 -2.437 0.01484 *
k -5.485e-03 1.073e-03 -5.111 3.26e-07 ***
areaEFHTRUE -1.471e-02 1.029e-02 -1.430 0.15289
mon3 1.524e-02 4.845e-03 3.145 0.00166 **
weekend2 -3.722e-03 1.755e-02 -0.212 0.83209
weekend3 -4.709e-03 1.262e-02 -0.373 0.70897
p4711271000014 -1.239e-02 9.222e-03 -1.343 0.17922
p4714981010038 -1.616e-02 8.396e-03 -1.925 0.05428 .
p4719090900065 -1.011e-02 1.227e-02 -0.824 0.40996
p4711080010112 1.684e-03 9.343e-03 0.180 0.85697
p4710908131589 1.263e-02 1.461e-02 0.864 0.38752
p4713985863121 2.320e-02 1.386e-02 1.674 0.09424 .
p4710114128038 1.809e-02 1.445e-02 1.252 0.21066
p4710421090059 -1.361e-02 1.319e-02 -1.031 0.30247
p4710291112172 3.389e-02 1.499e-02 2.262 0.02374 *
p4712425010712 3.541e-02 1.513e-02 2.340 0.01929 *
p4710583996008 1.054e-02 1.377e-02 0.766 0.44376
p4710011401128 -1.081e-03 1.237e-02 -0.087 0.93037
p4710088410139 4.659e-03 1.527e-02 0.305 0.76035
p4719090900058 2.458e-02 1.613e-02 1.524 0.12764
p4710094097768 1.946e-02 1.103e-02 1.765 0.07760 .
p4710088410610 1.353e-02 1.377e-02 0.982 0.32598
p4710085120628 6.767e-04 1.616e-02 0.042 0.96661
p4710265849066 -1.987e-02 1.750e-02 -1.135 0.25628
p4710018004605 1.555e-02 1.562e-02 0.996 0.31935
p4710908131534 -7.272e-04 1.635e-02 -0.044 0.96453
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4179 on 9225 degrees of freedom
Multiple R-squared: 0.305, Adjusted R-squared: 0.3017
F-statistic: 94.14 on 43 and 9225 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3049726 0.2732476
library(caret)
library(e1071)
library(rpart)
# Define cross-validation experiment
# k=10
numFolds = trainControl( method = "cv", number = 10 )
# pick the posiible values for cp
cpGrid = expand.grid( .cp = seq(0.001,0.1,0.001))
# Perform the cross validation
cv = train(amount ~ ., data = TR2, method = "rpart", trControl = numFolds, tuneGrid = cpGrid )
# method="rpart" : we wanto to validate a CART model
plot(cv);cv
CART
9269 samples
49 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 8343, 8343, 8340, 8344, 8342, 8342, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.001 0.4280068 0.2711698 0.3286290
0.002 0.4236615 0.2828408 0.3262958
0.003 0.4242213 0.2806896 0.3266302
0.004 0.4262393 0.2737030 0.3284240
0.005 0.4267641 0.2717313 0.3289417
0.006 0.4269306 0.2712072 0.3292381
0.007 0.4269095 0.2712740 0.3293730
0.008 0.4289031 0.2646008 0.3304202
0.009 0.4298855 0.2611819 0.3311835
0.010 0.4328875 0.2505936 0.3339821
0.011 0.4328975 0.2505476 0.3338670
0.012 0.4333520 0.2489805 0.3344444
0.013 0.4348697 0.2436760 0.3355859
0.014 0.4367141 0.2372062 0.3372679
0.015 0.4368929 0.2365914 0.3373805
0.016 0.4374052 0.2348192 0.3375237
0.017 0.4374052 0.2348192 0.3375237
0.018 0.4374052 0.2348192 0.3375237
0.019 0.4374052 0.2348192 0.3375237
0.020 0.4374052 0.2348192 0.3375237
0.021 0.4374052 0.2348192 0.3375237
0.022 0.4374052 0.2348192 0.3375237
0.023 0.4374052 0.2348192 0.3375237
0.024 0.4374052 0.2348192 0.3375237
0.025 0.4374052 0.2348192 0.3375237
0.026 0.4374052 0.2348192 0.3375237
0.027 0.4374052 0.2348192 0.3375237
0.028 0.4374052 0.2348192 0.3375237
0.029 0.4374052 0.2348192 0.3375237
0.030 0.4386075 0.2305706 0.3383498
0.031 0.4415920 0.2202157 0.3402422
0.032 0.4435006 0.2135814 0.3417392
0.033 0.4441417 0.2112573 0.3422859
0.034 0.4441417 0.2112573 0.3422859
0.035 0.4441417 0.2112573 0.3422859
0.036 0.4441417 0.2112573 0.3422859
0.037 0.4441417 0.2112573 0.3422859
0.038 0.4441417 0.2112573 0.3422859
0.039 0.4466367 0.2024459 0.3438247
0.040 0.4466367 0.2024459 0.3438247
0.041 0.4466367 0.2024459 0.3438247
0.042 0.4466367 0.2024459 0.3438247
0.043 0.4466367 0.2024459 0.3438247
0.044 0.4466367 0.2024459 0.3438247
0.045 0.4466367 0.2024459 0.3438247
0.046 0.4466367 0.2024459 0.3438247
0.047 0.4479206 0.1978418 0.3448467
0.048 0.4527458 0.1803652 0.3490769
0.049 0.4542481 0.1749889 0.3503303
0.050 0.4555422 0.1704642 0.3515812
0.051 0.4574262 0.1635717 0.3535963
0.052 0.4581809 0.1606468 0.3543684
0.053 0.4581809 0.1606468 0.3543684
0.054 0.4581809 0.1606468 0.3543684
0.055 0.4581809 0.1606468 0.3543684
0.056 0.4581809 0.1606468 0.3543684
0.057 0.4581809 0.1606468 0.3543684
0.058 0.4581809 0.1606468 0.3543684
0.059 0.4581809 0.1606468 0.3543684
0.060 0.4581809 0.1606468 0.3543684
0.061 0.4581809 0.1606468 0.3543684
0.062 0.4581809 0.1606468 0.3543684
0.063 0.4581809 0.1606468 0.3543684
0.064 0.4581809 0.1606468 0.3543684
0.065 0.4581809 0.1606468 0.3543684
0.066 0.4581809 0.1606468 0.3543684
0.067 0.4581809 0.1606468 0.3543684
0.068 0.4581809 0.1606468 0.3543684
0.069 0.4581809 0.1606468 0.3543684
0.070 0.4581809 0.1606468 0.3543684
0.071 0.4581809 0.1606468 0.3543684
0.072 0.4581809 0.1606468 0.3543684
0.073 0.4581809 0.1606468 0.3543684
0.074 0.4581809 0.1606468 0.3543684
0.075 0.4581809 0.1606468 0.3543684
0.076 0.4581809 0.1606468 0.3543684
0.077 0.4581809 0.1606468 0.3543684
0.078 0.4581809 0.1606468 0.3543684
0.079 0.4581809 0.1606468 0.3543684
0.080 0.4581809 0.1606468 0.3543684
0.081 0.4581809 0.1606468 0.3543684
0.082 0.4581809 0.1606468 0.3543684
0.083 0.4581809 0.1606468 0.3543684
0.084 0.4581809 0.1606468 0.3543684
0.085 0.4581809 0.1606468 0.3543684
0.086 0.4581809 0.1606468 0.3543684
0.087 0.4581809 0.1606468 0.3543684
0.088 0.4581809 0.1606468 0.3543684
0.089 0.4581809 0.1606468 0.3543684
0.090 0.4581809 0.1606468 0.3543684
0.091 0.4581809 0.1606468 0.3543684
0.092 0.4581809 0.1606468 0.3543684
0.093 0.4581809 0.1606468 0.3543684
0.094 0.4581809 0.1606468 0.3543684
0.095 0.4581809 0.1606468 0.3543684
0.096 0.4581809 0.1606468 0.3543684
0.097 0.4581809 0.1606468 0.3543684
0.098 0.4581809 0.1606468 0.3543684
0.099 0.4581809 0.1606468 0.3543684
0.100 0.4581809 0.1606468 0.3543684
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.002.
rpart2 = rpart(amount ~ ., data = TR2, cp = 0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2521828
predLM = predict(lm1, newdata=TS2)
predCART = predict(rpart2, newdata=TS2)
result = (predLM + predCART)/2
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((result - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2740926
stopCluster(clust)