Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
library(lubridate)
library(corrplot)
load(file='data/tf2.Rdata')
TR=subset(A,spl)
TS=subset(A,!spl)
is.na(TR) %>% colSums()   # count the number of NAs in each column of TR
  cust      r      s      f      m    rev    raw    age   area amount    buy
     0      0      0      0      0      0      0      0      0  10739      0
cx=c(2:9,11)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev" "raw" "age" "area" "buy"
glm1 = glm(buy ~ ., TR[,cx], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, cx])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.7931 -0.8733 -0.6991 1.0384 1.8735
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
m -3.028e-05 2.777e-05 -1.090 0.27559
rev 4.086e-05 1.940e-05 2.106 0.03521 *
raw -2.306e-04 8.561e-05 -2.693 0.00708 **
ageB -4.194e-02 8.666e-02 -0.484 0.62838
ageC 1.772e-02 7.992e-02 0.222 0.82456
ageD 7.705e-02 7.921e-02 0.973 0.33074
ageE 8.699e-02 8.132e-02 1.070 0.28476
ageF 1.928e-02 8.457e-02 0.228 0.81962
ageG 1.745e-02 9.323e-02 0.187 0.85155
ageH 1.752e-01 1.094e-01 1.602 0.10926
ageI 6.177e-02 1.175e-01 0.526 0.59904
ageJ 2.652e-01 1.047e-01 2.533 0.01131 *
ageK -1.419e-01 1.498e-01 -0.947 0.34347
areaB -4.105e-02 1.321e-01 -0.311 0.75603
areaC -2.075e-01 1.045e-01 -1.986 0.04703 *
areaD 3.801e-02 1.111e-01 0.342 0.73214
areaE 2.599e-01 9.682e-02 2.684 0.00727 **
areaF 1.817e-01 9.753e-02 1.863 0.06243 .
areaG -4.677e-02 1.045e-01 -0.448 0.65435
areaH -1.695e-01 1.232e-01 -1.375 0.16912
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23295 on 19984 degrees of freedom
AIC: 23343
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.6999767
[1] 0.6999767
colAUC(pred, TS$buy) #0.7556038
[,1]
FALSE vs. TRUE 0.7556038
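The same evaluation steps (confusion matrix at a 0.5 cutoff, accuracy, AUC) are repeated for every model below, so a small helper keeps them consistent. A minimal sketch, assuming pred is a vector of predicted probabilities and actual the logical response; eval_model is a hypothetical name, not part of the original script:
eval_model = function(pred, actual, cutoff = 0.5) {
  cm  = table(actual = actual, predict = pred > cutoff)  # confusion matrix at the cutoff
  acc = sum(diag(cm)) / sum(cm)                          # overall accuracy
  auc = colAUC(pred, actual)[1]                          # area under the ROC curve
  list(cm = cm, acc = acc, auc = auc)
}
# usage: eval_model(predict(glm1, TS, type = "response"), TS$buy)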
join_df=group_by(X, cust) %>% summarise(
T11=(month(date)==11) %>% sum ,
T12=(month(date)==12) %>% sum ,
T1=(month(date)==1) %>% sum
) %>% data.frame   # 28584 rows, one per customer
join_df
Note: T11, T12, and T1 are each customer's total number of store visits in November, December, and January, respectively.
A = merge(A, join_df, by="cust", all.x=T)
A
# each customer's purchases in November
Nov = filter(X, month(date)==11 ) %>%
group_by(cust) %>%
summarise(
amount_nov = sum(total),
items_nov=sum(items),
pieces_nov=sum(pieces),
gross_nov=sum(gross)
)
Nov
# each customer's purchases in December
Dec = filter(X, month(date)==12 ) %>%
group_by(cust) %>%
summarise(
amount_dec = sum(total),
items_dec=sum(items),
pieces_dec=sum(pieces),
gross_dec=sum(gross)
)
Dec
# each customer's purchases in January
Jan = filter(X, month(date)==1 ) %>%
group_by(cust) %>%
summarise(
amount_m1 = sum(total),
items_m1=sum(items),
pieces_m1=sum(pieces),
gross_m1=sum(gross),
price_m1=sum(gross)   # note: identical to gross_m1 (also sum(gross))
)
Jan
A = merge(A, Nov, by="cust", all.x=T)
A = merge(A, Dec, by="cust", all.x=T)
A = merge(A, Jan, by="cust", all.x=T)
A
for(i in 15:27){
  mean_col <- mean(A[, i], na.rm = T)   # mean of the i-th column
  na.rows <- is.na(A[, i])              # rows where the i-th column is NA
  A[na.rows, i] <- mean_col             # fill the NAs with the column mean
}
Fig-1: A after the NA values in the monthly columns are filled in.
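The same mean imputation can be written more idiomatically with dplyr; an equivalent sketch (not part of the original run), assuming the monthly summary columns run contiguously from amount_nov to price_m1 and dplyr >= 1.0 is available:
# replace NAs in the monthly columns with each column's mean (same effect as the loop above)
A = A %>%
  mutate(across(amount_nov:price_m1, ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))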
A$amount_total=A$amount_nov+A$amount_dec+A$amount_m1
A$gross_total=A$gross_nov+A$gross_dec+A$gross_m1
A$items_total=A$items_nov+A$items_dec+A$items_m1
A$pieces_total=A$pieces_nov+A$pieces_dec+A$pieces_m1
A$f_itemtotal=A$f*A$items_total
A$f_amounttotal=A$f*A$amount_total
TR=subset(A,spl)
TS=subset(A,!spl)
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev"
[6] "raw" "age" "area" "buy" "T1"
[11] "amount_m1" "amount_total" "items_total" "pieces_total" "f_itemtotal"
[16] "f_amounttotal"
glm1 = glm(buy ~ ., TR[,cx], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, cx])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.5203 -0.8705 -0.6974 1.0348 2.0108
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.003e+00 1.672e-01 -5.999 1.98e-09 ***
r -5.582e-03 1.506e-03 -3.707 0.000210 ***
s 5.529e-03 1.352e-03 4.091 4.29e-05 ***
f 2.529e-01 1.823e-02 13.871 < 2e-16 ***
m -5.899e-05 3.048e-05 -1.935 0.052931 .
rev 2.226e-04 3.897e-05 5.712 1.12e-08 ***
raw -3.356e-04 8.815e-05 -3.807 0.000140 ***
ageB -3.865e-02 8.665e-02 -0.446 0.655527
ageC 1.282e-02 7.996e-02 0.160 0.872605
ageD 6.121e-02 7.927e-02 0.772 0.440067
ageE 6.887e-02 8.144e-02 0.846 0.397774
ageF -1.709e-03 8.474e-02 -0.020 0.983913
ageG 9.365e-04 9.343e-02 0.010 0.992003
ageH 1.677e-01 1.095e-01 1.532 0.125559
ageI 5.406e-02 1.178e-01 0.459 0.646179
ageJ 2.578e-01 1.049e-01 2.457 0.013991 *
ageK -1.481e-01 1.497e-01 -0.990 0.322220
areaB -4.090e-02 1.324e-01 -0.309 0.757343
areaC -2.099e-01 1.047e-01 -2.004 0.045059 *
areaD 4.644e-02 1.113e-01 0.417 0.676577
areaE 2.566e-01 9.707e-02 2.643 0.008208 **
areaF 1.730e-01 9.781e-02 1.769 0.076952 .
areaG -4.291e-02 1.047e-01 -0.410 0.681866
areaH -1.677e-01 1.235e-01 -1.358 0.174384
T1 1.090e-01 2.983e-02 3.655 0.000257 ***
amount_m1 3.149e-05 1.913e-05 1.646 0.099769 .
amount_total -1.971e-04 3.865e-05 -5.100 3.39e-07 ***
items_total 7.446e-03 3.536e-03 2.106 0.035231 *
pieces_total 8.636e-04 2.302e-03 0.375 0.707500
f_itemtotal -3.176e-04 3.707e-04 -0.857 0.391518
f_amounttotal -1.355e-06 2.907e-06 -0.466 0.641057
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23228 on 19977 degrees of freedom
AIC: 23290
Number of Fisher Scoring iterations: 8
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3755 848
TRUE 1695 2278
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.7034748
[1] 0.7034748
colAUC(pred, TS$buy) # 0.7575633
[,1]
FALSE vs. TRUE 0.7575633
glm1_step=step(glm1,direction = 'backward')
Start: AIC=23290.28
buy ~ r + s + f + m + rev + raw + age + area + T1 + amount_m1 +
amount_total + items_total + pieces_total + f_itemtotal +
f_amounttotal
glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.fit: algorithm did not converge
glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.fit: algorithm did not converge
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- age 10 23246 23288
- pieces_total 1 23228 23288
- f_amounttotal 1 23228 23288
<none> 23228 23290
- amount_m1 1 23231 23291
- m 1 23232 23292
- items_total 1 23233 23293
- T1 1 23242 23302
- r 1 23242 23302
- raw 1 23243 23303
- s 1 23245 23305
- amount_total 1 23255 23315
- area 7 23329 23377
- f 1 23408 23468
- rev 1 23955 24015
- f_itemtotal 1 476281 476341
Step: AIC=23287.96
buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
items_total + pieces_total + f_itemtotal + f_amounttotal
glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.fit: algorithm did not converge
glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.fit: algorithm did not converge
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- f_amounttotal 1 23246 23286
- pieces_total 1 23246 23286
<none> 23246 23288
- amount_m1 1 23249 23289
- m 1 23250 23290
- items_total 1 23250 23290
- T1 1 23260 23300
- r 1 23260 23300
- raw 1 23261 23301
- s 1 23262 23302
- amount_total 1 23274 23314
- area 7 23350 23378
- f 1 23426 23466
- rev 1 23966 24006
- f_itemtotal 1 27848 27888
Step: AIC=23286.14
buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
items_total + pieces_total + f_itemtotal
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- pieces_total 1 23246 23284
<none> 23246 23286
- amount_m1 1 23249 23287
- m 1 23250 23288
- items_total 1 23252 23290
- f_itemtotal 1 23259 23297
- T1 1 23260 23298
- r 1 23260 23298
- raw 1 23261 23299
- s 1 23263 23301
- amount_total 1 23279 23317
- rev 1 23280 23318
- area 7 23350 23376
- f 1 23427 23465
Step: AIC=23284.4
buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
items_total + f_itemtotal
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
<none> 23246 23284
- amount_m1 1 23249 23285
- m 1 23250 23286
- f_itemtotal 1 23259 23295
- T1 1 23260 23296
- r 1 23260 23296
- raw 1 23262 23298
- s 1 23263 23299
- items_total 1 23273 23309
- amount_total 1 23279 23315
- rev 1 23280 23316
- area 7 23350 23374
- f 1 23428 23464
pred = predict(glm1_step, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3757 846
TRUE 1707 2266
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.7023088
[1] 0.7023088
colAUC(pred, TS$buy) # 0.7577661
[,1]
FALSE vs. TRUE 0.7577661
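To see which predictors survived the backward elimination without reading the whole trace, the final formula can be pulled from the fitted object (a quick check, not in the original run):
formula(glm1_step)
# buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
#   items_total + f_itemtotal   (age, f_amounttotal and pieces_total were dropped)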
TR$amount_new=log(1+TR$amount)       # log-transformed amount (still NA where amount is NA)
TR$amount[is.na(TR$amount)]=0        # set missing amounts to 0 before computing correlations
cx=c(2:7,10)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev" "raw" "amount"
cor(TR[,cx]) %>% corrplot()
TR=subset(A,spl)
TS=subset(A,!spl)
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev"
[6] "raw" "age" "area" "buy" "T1"
[11] "amount_m1" "amount_total" "items_total" "pieces_total" "f_itemtotal"
[16] "f_amounttotal"
rpart1 = rpart(buy ~ ., TR, # simplify the formula
method="class", cp=0.001)
library(rpart.plot)
rpart.plot(rpart1,cex=0.6) # cex controls the font size; this plot style shows more detail
pred = predict(rpart1, TS, type = "class") # predict classes; to get probabilities, use type = "prob"
x = table(actual = TS$buy, pred); x # when predicting probabilities, use pred > 0.5 here instead
pred
actual FALSE TRUE
FALSE 3884 719
TRUE 1805 2168
sum(diag(x))/sum(x) # 0.7056903
[1] 0.7056903
library(ROCR)
Loading required package: gplots
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
PredictROC = predict(rpart1, TS) # predict probabilities; the tree returns one column per class, so this extra step extracts them (glm returns a single column)
head(PredictROC)
FALSE TRUE
16 0.6870852 0.3129148
22 0.3313911 0.6686089
26 0.6870852 0.3129148
34 0.1608973 0.8391027
35 0.6870852 0.3129148
37 0.6870852 0.3129148
perf = prediction(PredictROC[,2], TS$buy) # the ROC curve only needs the probability of y = TRUE
perf = performance(perf, "tpr", "fpr")
plot(perf)
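ROCR can also report the AUC directly from the same prediction object, which should agree with the colAUC value below; an optional cross-check (not in the original run):
rocr.pred = prediction(PredictROC[,2], TS$buy)   # rebuild the prediction object (perf was overwritten above)
performance(rocr.pred, "auc")@y.values[[1]]      # AUC via ROCR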
pred = predict(rpart1, TS)[,2] # prob. of buy = TRUE
colAUC(pred, TS$buy, plotROC=TRUE) # AUC = 0.7144386
[,1]
FALSE vs. TRUE 0.7144386
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
TR=subset(A,spl)
TS=subset(A,!spl)
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev"
[6] "raw" "age" "area" "buy" "T1"
[11] "amount_m1" "amount_total" "items_total" "pieces_total" "f_itemtotal"
[16] "f_amounttotal"
TR$buy = factor(ifelse(TR$buy, "yes", "no"))
TS$buy = factor(ifelse(TS$buy, "yes", "no"))
ctrl = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final", classProbs=TRUE,
summaryFunction=twoClassSummary)
rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
buy ~ ., data=TR[,cx], method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
Time difference of 1.736375 mins
plot(cv.rpart)
cv.rpart$results
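The winning complexity parameter can be read straight off the tuning object; a quick check (not in the original transcript) before refitting with cp = 0.0006 below:
cv.rpart$bestTune   # cp with the best cross-validated ROC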
rpart1 = rpart(buy ~ ., TR[,cx], method="class", cp=0.0006)
predict(rpart1, TS, type="prob")[,2] %>%
colAUC(TS$buy)
[,1]
no vs. yes 0.7162952
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
buy ~ ., data=TR[,cx], method="glm",
trControl=ctrl, metric="ROC")
Sys.time() - t0
Time difference of 23.91107 secs
cv.glm$results
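Since both caret runs use the same trControl and the same seed before train(), their resamples should line up and the two models can be compared directly; an optional sketch (not in the original run):
resamps = resamples(list(rpart = cv.rpart, glm = cv.glm))   # assumes matching CV folds
summary(resamps)   # ROC / Sens / Spec over the 20 resamples for both models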
glm(), Final Model
glm1 = glm(buy ~ ., TR[,cx], family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy) #0.7575633
[,1]
no vs. yes 0.7575633
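Once the caret runs are finished, the parallel backend registered earlier should be shut down; standard doParallel cleanup (not shown in the original transcript):
stopCluster(clust)   # stop the worker processes created with makeCluster()
registerDoSEQ()      # fall back to sequential foreach execution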