Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(lubridate)
library(rpart.plot)
library(corrplot)
library(ggplot2)
load(file='data/tf2.Rdata')
TR=subset(A,spl)
TS=subset(A,!spl)
is.na(TR) %>% colSums() #計算TR的NA數量
cust r s f m rev raw age area amount
0 0 0 0 0 0 0 0 0 10739
buy
0
cx=c(2:9,11)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev" "raw" "age" "area" "buy"
glm1 = glm(buy ~ ., TR[,cx], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, cx])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.7931 -0.8733 -0.6991 1.0384 1.8735
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
m -3.028e-05 2.777e-05 -1.090 0.27559
rev 4.086e-05 1.940e-05 2.106 0.03521 *
raw -2.306e-04 8.561e-05 -2.693 0.00708 **
ageB -4.194e-02 8.666e-02 -0.484 0.62838
ageC 1.772e-02 7.992e-02 0.222 0.82456
ageD 7.705e-02 7.921e-02 0.973 0.33074
ageE 8.699e-02 8.132e-02 1.070 0.28476
ageF 1.928e-02 8.457e-02 0.228 0.81962
ageG 1.745e-02 9.323e-02 0.187 0.85155
ageH 1.752e-01 1.094e-01 1.602 0.10926
ageI 6.177e-02 1.175e-01 0.526 0.59904
ageJ 2.652e-01 1.047e-01 2.533 0.01131 *
ageK -1.419e-01 1.498e-01 -0.947 0.34347
areaB -4.105e-02 1.321e-01 -0.311 0.75603
areaC -2.075e-01 1.045e-01 -1.986 0.04703 *
areaD 3.801e-02 1.111e-01 0.342 0.73214
areaE 2.599e-01 9.682e-02 2.684 0.00727 **
areaF 1.817e-01 9.753e-02 1.863 0.06243 .
areaG -4.677e-02 1.045e-01 -0.448 0.65435
areaH -1.695e-01 1.232e-01 -1.375 0.16912
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23295 on 19984 degrees of freedom
AIC: 23343
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts
[1] 0.6999767
colAUC(pred, TS$buy) #0.7556038
[,1]
FALSE vs. TRUE 0.7556038
cx=c(2:7,11)
colnames(TR[,cx])
[1] "r" "s" "f" "m" "rev" "raw" "buy"
cor(TR[,cx]) %>% corrplot.mixed()
join_df=group_by(X, cust) %>% summarise(
T11=(month(date)==11) %>% sum ,
T12=(month(date)==12) %>% sum ,
T1=(month(date)==1) %>% sum
) %>% data.frame # 28584
join_df
note
T11、T12、T1:將顧客分別在11、12、1月來店次數的總和。
A = merge(A, join_df, by="cust", all.x=T)
A
tapply(A$buy, A$T11, mean) %>% barplot
tapply(A$buy, A$T12, mean) %>% barplot
tapply(A$buy, A$T1, mean) %>% barplot
#顧客在11月的消費
Nov = filter(X, month(date)==11 ) %>%
group_by(cust) %>%
summarise(
amount_nov = sum(total),#消費總額
items_nov=sum(items),#交易件數
pieces_nov=sum(pieces),#購買商品個數
gross_nov=sum(gross)
)
Nov
#顧客在12月的消費
Dec = filter(X, month(date)==12 ) %>%
group_by(cust) %>%
summarise(
amount_dec = sum(total),
items_dec=sum(items),
pieces_dec=sum(pieces),
gross_dec=sum(gross)
)
Dec
#顧客在1月的消費
Jan = filter(X, month(date)==1 ) %>%
group_by(cust) %>%
summarise(
amount_m1 = sum(total),#消費總額
items_m1=sum(items),#交易件數
pieces_m1=sum(pieces),#購買商品個數
gross_m1=sum(gross)
)
Jan
note
A = merge(A, Nov, by="cust", all.x=T)
A = merge(A, Dec, by="cust", all.x=T)
A = merge(A, Jan, by="cust", all.x=T)
A
for(i in 15:24){
mean_col <- mean(A[, i], na.rm = T) # mean of col ith
na.rows <- is.na(A[, i]) #col ith na data
A[na.rows, i] <- mean_col
}
Figure - 填補NA
A$amount_total=A$amount_nov+A$amount_dec+A$amount_m1
A$gross_total=A$gross_nov+A$gross_dec+A$gross_m1
A$items_total=A$items_nov+A$items_dec+A$items_m1
A$pieces_total=A$pieces_nov+A$pieces_dec+A$pieces_m1
A$f_itemtotal=A$f*A$items_total
A$f_amounttotal=A$f*A$amount_total
A$f2=A$f^4*A$m^4
A$f3=A$r^4
A$f4=A$s^4
TR=subset(A,spl)
TS=subset(A,!spl)
cx=c(2:9,11,14,23,27,29,31,32,33,34)
colnames(TR[,cx])
[1] "r" "s" "f" "m"
[5] "rev" "raw" "age" "area"
[9] "buy" "T1" "amount_m1" "amount_total"
[13] "items_total" "f_itemtotal" "f_amounttotal" "f2"
[17] "f3"
glm1 = glm(buy ~ ., TR[,cx], family=binomial())
glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(glm1)
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3738 865
TRUE 1692 2281
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts #0.7017257
[1] 0.7018424
colAUC(pred, TS$buy)
[,1]
FALSE vs. TRUE 0.7578508
#0.7579886
glm1_step=step(glm1,direction = 'backward')
Start: AIC=23259.64
buy ~ r + s + f + m + rev + raw + age + area + T1 + amount_m1 +
amount_total + items_total + f_itemtotal + f_amounttotal +
f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- age 10 23213 23257
- r 1 23196 23258
- amount_m1 1 23197 23259
<none> 23196 23260
- f_amounttotal 1 23199 23261
- f_itemtotal 1 23203 23265
- m 1 23205 23267
- f2 1 23208 23270
- raw 1 23210 23272
- f3 1 23214 23276
- amount_total 1 23214 23276
- s 1 23214 23276
- items_total 1 23217 23279
- T1 1 23223 23285
- rev 1 23233 23295
- area 7 23296 23346
- f 1 23355 23417
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=23256.62
buy ~ r + s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
items_total + f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- r 1 23213 23255
- amount_m1 1 23214 23256
<none> 23213 23257
- f_amounttotal 1 23216 23258
- f_itemtotal 1 23220 23262
- m 1 23222 23264
- f2 1 23225 23267
- raw 1 23227 23269
- s 1 23231 23273
- f3 1 23232 23274
- amount_total 1 23232 23274
- items_total 1 23234 23276
- T1 1 23241 23283
- rev 1 23251 23293
- area 7 23315 23345
- f 1 23372 23414
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=23254.77
buy ~ s + f + m + rev + raw + area + T1 + amount_m1 + amount_total +
items_total + f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- amount_m1 1 23214 23254
<none> 23213 23255
- f_amounttotal 1 23216 23256
- f_itemtotal 1 23220 23260
- m 1 23223 23263
- f2 1 23226 23266
- raw 1 23228 23268
- items_total 1 23234 23274
- amount_total 1 23235 23275
- s 1 23238 23278
- f3 1 23244 23284
- T1 1 23251 23291
- rev 1 23260 23300
- area 7 23315 23343
- f 1 23377 23417
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=23253.94
buy ~ s + f + m + rev + raw + area + T1 + amount_total + items_total +
f_itemtotal + f_amounttotal + f2 + f3
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
<none> 23214 23254
- f_amounttotal 1 23217 23255
- f_itemtotal 1 23221 23259
- m 1 23224 23262
- f2 1 23227 23265
- raw 1 23228 23266
- amount_total 1 23235 23273
- items_total 1 23235 23273
- s 1 23241 23279
- f3 1 23245 23283
- T1 1 23258 23296
- rev 1 23260 23298
- area 7 23316 23342
- f 1 23377 23415
pred = predict(glm1_step, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3741 862
TRUE 1694 2279
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts #0.7028918
[1] 0.701959
colAUC(pred, TS$buy)
[,1]
FALSE vs. TRUE 0.7578848
#0.7581489
cx=c(2:9,11,14,23,28,30:33)
colnames(TR[,cx])
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
buy ~ ., data=TR[,cx], method="glm",
trControl=ctrl, metric="ROC")
Sys.time() - t0
cv.glm$results
##### glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)