rm(list=ls(all=T))
There were 50 or more warnings (use warnings() to see the first 50)
Sys.setlocale("LC_ALL","C")
[1] "C/C/C/C/C/zh_TW.UTF-8"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
library(corrplot)
library(randomForest)
A : 將 X 中單個顧客的所有交易綁起來
過程中並沒有更動到次序,因此 spl 及 spl2 並不會有資料欄位錯誤的問題。
load("data/tf2.rdata")
rm(A)
rm(X)
rm(Z)
load("data/tf0.rdata")
CopyA <- A0
CopyX <- X0
CopyZ <- Z0
加入新變數
grossmargin = ( 商品總價 - 商品總成本 ) / 商品總價
Z = subset(CopyZ, date < as.Date("2001-02-01")) # 618212
Z$grossmargin = (Z$price-Z$cost)/Z$price
加入新變數
month : 為該筆交易的月份。
spec = 單筆交易總毛利率 / 該筆交易總商品數量
poorbitch = 單筆交易商品毛利率 <0 的總數 / 該筆交易總商品數量
weekday : 為該筆交易的星期數值,並由星期一排到星期日。
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
month = months(date),
spec = sum(grossmargin*qty)/sum(qty),
poorbitch = sum(as.numeric(grossmargin<0)*qty)/sum(qty),
weekday = factor(weekdays(date),levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday","Saturday","Sunday")),
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
X = subset(X, items<=64 & pieces<=98 & total<=11260)
summary(X)
tid date month spec poorbitch
Min. : 1 Min. :2000-11-01 Length:88295 Min. :-1.93732 Min. :0.0000
1st Qu.:22086 1st Qu.:2000-11-23 Class :character 1st Qu.: 0.09309 1st Qu.:0.0000
Median :44181 Median :2000-12-12 Mode :character Median : 0.16258 Median :0.0000
Mean :44184 Mean :2000-12-15 Mean : 0.12344 Mean :0.1263
3rd Qu.:66270 3rd Qu.:2001-01-12 3rd Qu.: 0.20933 3rd Qu.:0.1176
Max. :88387 Max. :2001-01-31 Max. : 1.00000 Max. :1.0000
weekday cust age area items pieces
Monday :12615 Min. : 1069 D :17515 E :37482 Min. : 1.000 Min. : 1.000
Tuesday :11288 1st Qu.: 923873 C :14616 F :25383 1st Qu.: 2.000 1st Qu.: 3.000
Wednesday: 9898 Median : 1607000 E :14560 G : 6769 Median : 5.000 Median : 6.000
Thursday :11245 Mean : 1395102 F :10340 C : 6319 Mean : 6.948 Mean : 9.372
Friday :10651 3rd Qu.: 1888840 B : 7811 H : 5519 3rd Qu.: 9.000 3rd Qu.:12.000
Saturday :14587 Max. :20002000 G : 6306 D : 3647 Max. :64.000 Max. :98.000
Sunday :18011 (Other):17147 (Other): 3176
total gross
Min. : 5.0 Min. :-1645.0
1st Qu.: 230.0 1st Qu.: 23.0
Median : 522.0 Median : 72.0
Mean : 877.6 Mean : 136.2
3rd Qu.: 1116.0 3rd Qu.: 174.0
Max. :10980.0 Max. : 3217.0
加入新變數
month1f , month2f , month3f : 為十一月至一月,該顧客各月份的交易次數。
frise = ( 十一月至十二月交易次數上升 ) && ( 十二月至一月交易次數上升 )
weeksum : 單一顧客於週間來店的次數總和。
weekendsum : 單一顧客於週末來店的次數總和。
realspec = 單筆交易 spec * 單筆交易商品總數量 / 交易之總商品數量
realpoorbitch = 單筆交易 poorbitch * 單筆交易商品總數量 / 交易之總商品數量
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
month1f = sum(month == "November"),
month2f = sum(month == "December"),
month3f = sum(month == "January"),
weeksum = sum(weekday!="Saturday" & weekday!="Sunday"),
weekendsum = sum(weekday=="Saturday")+sum(weekday=="Sunday"),
frise = isTRUE(month2f - month1f > 0 && month3f - month2f > 0),
realspec = sum(spec*pieces)/sum(pieces),
realpoorbitch = sum(poorbitch*pieces)/sum(pieces),
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area) # area code
) %>% data.frame # 28584
feb = filter(CopyX, date>= as.Date("2001-02-01")) %>% group_by(cust) %>% summarise(amount = sum(total)) # 16899
A = merge(A, feb, by="cust", all.x=T)
A$buy = !is.na(A$amount)
summary(A)
cust r s f month1f month2f
Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000 Min. : 0.000 Min. : 0.0000
1st Qu.: 1060898 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000 1st Qu.: 0.000 1st Qu.: 0.0000
Median : 1654100 Median :21.00 Median :68.00 Median : 2.000 Median : 1.000 Median : 1.0000
Mean : 1461070 Mean :32.12 Mean :61.27 Mean : 3.089 Mean : 1.112 Mean : 0.9339
3rd Qu.: 1945003 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000 3rd Qu.: 1.000 3rd Qu.: 1.0000
Max. :20002000 Max. :92.00 Max. :92.00 Max. :60.000 Max. :26.000 Max. :21.0000
month3f weeksum weekendsum frise realspec realpoorbitch
Min. : 0.000 Min. : 0.000 Min. : 0.00 Mode :logical Min. :-1.93732 Min. :0.00000
1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.: 0.00 FALSE:27724 1st Qu.: 0.09716 1st Qu.:0.00000
Median : 1.000 Median : 1.000 Median : 1.00 TRUE :860 Median : 0.16064 Median :0.03333
Mean : 1.043 Mean : 1.949 Mean : 1.14 Mean : 0.12562 Mean :0.12825
3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 1.00 3rd Qu.: 0.20159 3rd Qu.:0.15152
Max. :25.000 Max. :44.000 Max. :18.00 Max. : 0.47778 Max. :1.00000
m rev raw age area amount
Min. : 8.0 Min. : 8 Min. : -742.0 D :5832 E :9907 Min. : 8
1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 C :5238 F :7798 1st Qu.: 454
Median : 709.5 Median : 1566 Median : 218.0 E :4514 C :3169 Median : 993
Mean : 1012.4 Mean : 2711 Mean : 420.8 F :3308 G :3052 Mean : 1499
3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0 B :2802 D :1778 3rd Qu.: 1955
Max. :10634.0 Max. :99597 Max. :15565.0 G :1940 H :1295 Max. :28089
(Other):4950 (Other):1585 NA's :15342
buy
Mode :logical
FALSE:15342
TRUE :13242
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
加入新變數
avgbuy = 所有交易之商品總價 / 所有交易之商品總數量
Y = group_by(Z, cust) %>% summarise(
avgbuy=sum(price)/sum(qty)
)%>% data.frame # 119422
A$avgbuy = Y$avgbuy
加入新變數
( 並無更動 Tony 的程式碼 )
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
0 1 2 3 4 5 6
18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584 7
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 1 1 0 0 0 0 0
1113 2 1 0 0 0 0 1
1359 0 1 0 0 0 0 0
1823 0 1 0 1 1 0 0
2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000 0.2500000
1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000 0.0000000
2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
加入新變數
之所以會這樣設定是因為我們觀察到 E/F 區的顧客於二月來消費的機率較高。
之所以會這樣設定是因為我們觀察到 H/I/J 年齡區段的顧客於二月來消費的機率較高。
之所以會這樣設定是因為我們懷疑 r , s , f 與應變數的關係可能並非線性相關。
A$ef <- ifelse(as.character(A$area) == "E"| as.character(A$area) =="F",T,F)
A$age_hij <- ifelse(A$age == "H" | A$age == "I" | A$age == "J",T,F)
A$r2 <- A$r^2
A$s2 <- A$s^2
A$f2 <- A$f^2
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR = subset(A, spl)
TS = subset(A, !spl)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
可以看到模型summary中還有許多變數並不顯著或是係數異常
AUC = 0.7592
logis_m1_original = glm(buy ~ . - m - rev + log(m) + log(rev) + ef*month1f + ef*month2f + ef*month3f , TR[,c(9:24,26:32)], family=binomial())
summary(logis_m1_original)
Call:
glm(formula = buy ~ . - m - rev + log(m) + log(rev) + ef * month1f +
ef * month2f + ef * month3f, family = binomial(), data = TR[,
c(9:24, 26:32)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6911 -0.8712 -0.6653 1.0064 2.4708
Coefficients: (4 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.742e+00 2.063e-01 -8.440 < 2e-16 ***
r 1.551e-03 3.397e-03 0.457 0.647996
s 4.490e-04 3.544e-03 0.127 0.899195
f 4.118e-01 5.782e-02 7.121 1.07e-12 ***
month1f -1.796e-01 5.533e-02 -3.246 0.001171 **
month2f -1.367e-01 5.203e-02 -2.627 0.008621 **
month3f NA NA NA NA
weeksum -2.135e-02 1.870e-02 -1.142 0.253630
weekendsum NA NA NA NA
friseTRUE 1.394e-01 1.149e-01 1.213 0.225027
realspec -7.509e-01 1.970e-01 -3.811 0.000138 ***
realpoorbitch -4.236e-02 1.273e-01 -0.333 0.739366
raw -8.544e-05 4.735e-05 -1.804 0.071188 .
ageB -1.412e-02 8.735e-02 -0.162 0.871581
ageC 2.873e-02 8.065e-02 0.356 0.721698
ageD 7.974e-02 8.001e-02 0.997 0.318974
ageE 8.086e-02 8.211e-02 0.985 0.324700
ageF 1.500e-03 8.538e-02 0.018 0.985983
ageG 8.496e-03 9.404e-02 0.090 0.928017
ageH 1.633e-01 1.103e-01 1.480 0.138977
ageI 4.426e-02 1.186e-01 0.373 0.708951
ageJ 2.390e-01 1.057e-01 2.261 0.023738 *
ageK -1.291e-01 1.510e-01 -0.855 0.392388
areaB -4.611e-02 1.337e-01 -0.345 0.730237
areaC -2.085e-01 1.057e-01 -1.973 0.048491 *
areaD 4.039e-02 1.125e-01 0.359 0.719483
areaE 3.207e-01 1.054e-01 3.042 0.002348 **
areaF 2.476e-01 1.059e-01 2.338 0.019379 *
areaG -3.204e-02 1.058e-01 -0.303 0.761946
areaH -1.777e-01 1.254e-01 -1.418 0.156320
avgbuy -7.833e-04 1.791e-04 -4.374 1.22e-05 ***
efTRUE NA NA NA NA
age_hijTRUE NA NA NA NA
r2 -8.019e-05 3.713e-05 -2.160 0.030769 *
s2 6.577e-05 3.475e-05 1.893 0.058374 .
f2 -4.040e-03 9.720e-04 -4.156 3.24e-05 ***
log(m) -3.667e-01 1.038e-01 -3.531 0.000414 ***
log(rev) 4.258e-01 1.011e-01 4.211 2.55e-05 ***
month1f:efTRUE -9.498e-02 3.310e-02 -2.869 0.004117 **
month2f:efTRUE 2.268e-02 3.752e-02 0.604 0.545577
month3f:efTRUE -9.113e-03 3.767e-02 -0.242 0.808831
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23114 on 19971 degrees of freedom
AIC: 23188
Number of Fisher Scoring iterations: 6
logis_m1_original_pred = predict(logis_m1_original, TS, type="response")
prediction from a rank-deficient fit may be misleading
colAUC(logis_m1_original_pred, TS$buy) # AUC = 0.759249
[,1]
FALSE vs. TRUE 0.7592495
step 演算法目的是在一個完整的回歸中逐一移除變數,當移除特定變數時,模型損失了過多的解釋力就停止
AUC = 0.7591
logis_m1_original_steped <- step(logis_m1_original, direction = "backward")
Start: AIC=23188.15
buy ~ (r + s + f + month1f + month2f + month3f + weeksum + weekendsum +
frise + realspec + realpoorbitch + m + rev + raw + age +
area + avgbuy + ef + age_hij + r2 + s2 + f2) - m - rev +
log(m) + log(rev) + ef * month1f + ef * month2f + ef * month3f
Step: AIC=23188.15
buy ~ r + s + f + month1f + month2f + month3f + weeksum + weekendsum +
frise + realspec + realpoorbitch + raw + age + area + avgbuy +
ef + r2 + s2 + f2 + log(m) + log(rev) + month1f:ef + month2f:ef +
month3f:ef
Step: AIC=23188.15
buy ~ r + s + f + month1f + month2f + month3f + weeksum + frise +
realspec + realpoorbitch + raw + age + area + avgbuy + ef +
r2 + s2 + f2 + log(m) + log(rev) + month1f:ef + month2f:ef +
month3f:ef
Step: AIC=23188.15
buy ~ r + s + month1f + month2f + month3f + weeksum + frise +
realspec + realpoorbitch + raw + age + area + avgbuy + ef +
r2 + s2 + f2 + log(m) + log(rev) + month1f:ef + month2f:ef +
month3f:ef
Df Deviance AIC
- age 10 23129 23183
- s 1 23114 23186
- month3f:ef 1 23114 23186
- realpoorbitch 1 23114 23186
- r 1 23114 23186
- month2f:ef 1 23114 23186
- weeksum 1 23116 23188
- frise 1 23116 23188
<none> 23114 23188
- raw 1 23117 23189
- s2 1 23118 23190
- r2 1 23119 23191
- area 6 23131 23193
- f2 1 23122 23194
- month1f:ef 1 23122 23194
- log(m) 1 23127 23199
- realspec 1 23129 23201
- log(rev) 1 23132 23204
- avgbuy 1 23137 23209
Step: AIC=23182.9
buy ~ r + s + month1f + month2f + month3f + weeksum + frise +
realspec + realpoorbitch + raw + area + avgbuy + ef + r2 +
s2 + f2 + log(m) + log(rev) + month1f:ef + month2f:ef + month3f:ef
Df Deviance AIC
- s 1 23129 23181
- month3f:ef 1 23129 23181
- realpoorbitch 1 23129 23181
- r 1 23129 23181
- month2f:ef 1 23129 23181
- weeksum 1 23130 23182
- frise 1 23130 23182
<none> 23129 23183
- raw 1 23132 23184
- s2 1 23132 23184
- r2 1 23134 23186
- area 6 23146 23188
- f2 1 23136 23188
- month1f:ef 1 23137 23189
- log(m) 1 23142 23194
- realspec 1 23144 23196
- log(rev) 1 23147 23199
- avgbuy 1 23152 23204
Step: AIC=23180.92
buy ~ r + month1f + month2f + month3f + weeksum + frise + realspec +
realpoorbitch + raw + area + avgbuy + ef + r2 + s2 + f2 +
log(m) + log(rev) + month1f:ef + month2f:ef + month3f:ef
Df Deviance AIC
- month3f:ef 1 23129 23179
- realpoorbitch 1 23129 23179
- r 1 23129 23179
- month2f:ef 1 23129 23179
- weeksum 1 23130 23180
- frise 1 23130 23180
<none> 23129 23181
- raw 1 23132 23182
- r2 1 23134 23184
- area 6 23146 23186
- f2 1 23136 23186
- month1f:ef 1 23137 23187
- log(m) 1 23143 23193
- realspec 1 23145 23195
- log(rev) 1 23148 23198
- avgbuy 1 23152 23202
- s2 1 23164 23214
Step: AIC=23178.96
buy ~ r + month1f + month2f + month3f + weeksum + frise + realspec +
realpoorbitch + raw + area + avgbuy + ef + r2 + s2 + f2 +
log(m) + log(rev) + month1f:ef + month2f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- realpoorbitch 1 23129 23177
- r 1 23129 23177
- month2f:ef 1 23129 23177
- weeksum 1 23130 23178
- frise 1 23130 23178
<none> 23129 23179
- raw 1 23132 23180
- r2 1 23134 23182
- area 6 23146 23184
- f2 1 23136 23184
- month1f:ef 1 23137 23185
- log(m) 1 23143 23191
- realspec 1 23145 23193
- log(rev) 1 23149 23197
- avgbuy 1 23152 23200
- s2 1 23164 23212
- month3f 1 23182 23230
Step: AIC=23177.1
buy ~ r + month1f + month2f + month3f + weeksum + frise + realspec +
raw + area + avgbuy + ef + r2 + s2 + f2 + log(m) + log(rev) +
month1f:ef + month2f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- r 1 23129 23175
- month2f:ef 1 23130 23176
- weeksum 1 23130 23176
- frise 1 23130 23176
<none> 23129 23177
- raw 1 23132 23178
- r2 1 23135 23181
- area 6 23146 23182
- f2 1 23137 23183
- month1f:ef 1 23137 23183
- log(m) 1 23143 23189
- log(rev) 1 23149 23195
- avgbuy 1 23153 23199
- s2 1 23164 23210
- realspec 1 23165 23211
- month3f 1 23182 23228
Step: AIC=23175.44
buy ~ month1f + month2f + month3f + weeksum + frise + realspec +
raw + area + avgbuy + ef + r2 + s2 + f2 + log(m) + log(rev) +
month1f:ef + month2f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- month2f:ef 1 23130 23174
- weeksum 1 23131 23175
- frise 1 23131 23175
<none> 23129 23175
- raw 1 23133 23177
- f2 1 23137 23181
- area 6 23147 23181
- month1f:ef 1 23138 23182
- log(m) 1 23144 23188
- log(rev) 1 23150 23194
- avgbuy 1 23153 23197
- r2 1 23153 23197
- s2 1 23164 23208
- realspec 1 23166 23210
- month3f 1 23185 23229
Step: AIC=23173.84
buy ~ month1f + month2f + month3f + weeksum + frise + realspec +
raw + area + avgbuy + ef + r2 + s2 + f2 + log(m) + log(rev) +
month1f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- weeksum 1 23131 23173
- frise 1 23131 23173
<none> 23130 23174
- raw 1 23133 23175
- f2 1 23137 23179
- area 6 23147 23179
- month1f:ef 1 23138 23180
- log(m) 1 23144 23186
- log(rev) 1 23150 23192
- avgbuy 1 23154 23196
- r2 1 23154 23196
- month2f 1 23164 23206
- s2 1 23164 23206
- realspec 1 23166 23208
- month3f 1 23186 23228
Step: AIC=23173.07
buy ~ month1f + month2f + month3f + frise + realspec + raw +
area + avgbuy + ef + r2 + s2 + f2 + log(m) + log(rev) + month1f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- frise 1 23132 23172
<none> 23131 23173
- raw 1 23134 23174
- f2 1 23139 23179
- area 6 23149 23179
- month1f:ef 1 23139 23179
- log(m) 1 23145 23185
- log(rev) 1 23151 23191
- avgbuy 1 23155 23195
- r2 1 23155 23195
- month2f 1 23164 23204
- s2 1 23167 23207
- realspec 1 23167 23207
- month3f 1 23186 23226
Step: AIC=23172.52
buy ~ month1f + month2f + month3f + realspec + raw + area + avgbuy +
ef + r2 + s2 + f2 + log(m) + log(rev) + month1f:ef
glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
<none> 23132 23172
- raw 1 23136 23174
- f2 1 23140 23178
- area 6 23150 23178
- month1f:ef 1 23140 23178
- log(m) 1 23148 23186
- log(rev) 1 23155 23193
- r2 1 23155 23193
- avgbuy 1 23157 23195
- month2f 1 23165 23203
- s2 1 23167 23205
- realspec 1 23169 23207
- month3f 1 23189 23227
summary(logis_m1_original_steped)
Call:
glm(formula = buy ~ month1f + month2f + month3f + realspec +
raw + area + avgbuy + ef + r2 + s2 + f2 + log(m) + log(rev) +
month1f:ef, family = binomial(), data = TR[, c(9:24, 26:32)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6511 -0.8696 -0.6676 1.0068 2.4838
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.667e+00 1.726e-01 -9.655 < 2e-16 ***
month1f 2.038e-01 4.861e-02 4.192 2.76e-05 ***
month2f 2.765e-01 3.929e-02 7.038 1.95e-12 ***
month3f 3.871e-01 4.052e-02 9.554 < 2e-16 ***
realspec -7.206e-01 1.189e-01 -6.059 1.37e-09 ***
raw -8.198e-05 4.704e-05 -1.743 0.08138 .
areaB -5.089e-02 1.335e-01 -0.381 0.70312
areaC -2.161e-01 1.054e-01 -2.049 0.04041 *
areaD 2.558e-02 1.121e-01 0.228 0.81957
areaE 3.137e-01 1.009e-01 3.110 0.00187 **
areaF 2.421e-01 1.015e-01 2.385 0.01709 *
areaG -4.431e-02 1.054e-01 -0.420 0.67416
areaH -2.128e-01 1.231e-01 -1.728 0.08399 .
avgbuy -8.041e-04 1.798e-04 -4.473 7.70e-06 ***
efTRUE NA NA NA NA
r2 -6.096e-05 1.279e-05 -4.768 1.86e-06 ***
s2 6.851e-05 1.167e-05 5.873 4.29e-09 ***
f2 -4.015e-03 9.574e-04 -4.193 2.75e-05 ***
log(m) -3.893e-01 9.864e-02 -3.947 7.91e-05 ***
log(rev) 4.499e-01 9.573e-02 4.700 2.60e-06 ***
month1f:efTRUE -9.019e-02 3.273e-02 -2.756 0.00585 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23133 on 19988 degrees of freedom
AIC: 23173
Number of Fisher Scoring iterations: 6
logis_m1_original_steped_pred = predict(logis_m1_original_steped, TS, type="response")
prediction from a rank-deficient fit may be misleading
colAUC(logis_m1_original_steped_pred, TS$buy) # AUC = 0.75914
[,1]
FALSE vs. TRUE 0.759137
例如 step 演算法中只有保留 month1f*ef 此變數,然而直覺上如果 month1f*ef 存在,則 month2*ef 及 month3*ef 等同質性變數都應存在於模型中。所以我們手動加入 month2f*ef 及 month3f*ef。
AUC = 0.7592
logis_m1_handcraft = glm(buy ~ month1f + month2f + month3f + realspec +
raw + area + avgbuy + ef + r2 + s2 + weekendsum + log(m) + log(rev) +
month1f*ef + month2f*ef + month3f*ef , TR[,c(9:24,26:32)], family=binomial())
summary(logis_m1_handcraft)
Call:
glm(formula = buy ~ month1f + month2f + month3f + realspec +
raw + area + avgbuy + ef + r2 + s2 + weekendsum + log(m) +
log(rev) + month1f * ef + month2f * ef + month3f * ef, family = binomial(),
data = TR[, c(9:24, 26:32)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.9239 -0.8650 -0.6661 1.0016 2.4703
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.570e+00 1.730e-01 -9.074 < 2e-16 ***
month1f 1.041e-01 3.826e-02 2.722 0.00649 **
month2f 1.562e-01 3.962e-02 3.943 8.04e-05 ***
month3f 2.894e-01 4.257e-02 6.797 1.06e-11 ***
realspec -7.168e-01 1.190e-01 -6.025 1.69e-09 ***
raw -8.454e-05 4.750e-05 -1.780 0.07513 .
areaB -4.620e-02 1.335e-01 -0.346 0.72937
areaC -2.115e-01 1.055e-01 -2.004 0.04505 *
areaD 3.063e-02 1.122e-01 0.273 0.78492
areaE 3.026e-01 1.060e-01 2.856 0.00429 **
areaF 2.303e-01 1.064e-01 2.165 0.03041 *
areaG -3.793e-02 1.055e-01 -0.360 0.71918
areaH -2.031e-01 1.235e-01 -1.645 0.10005
avgbuy -7.928e-04 1.797e-04 -4.413 1.02e-05 ***
efTRUE NA NA NA NA
r2 -5.880e-05 1.279e-05 -4.596 4.30e-06 ***
s2 6.678e-05 1.170e-05 5.707 1.15e-08 ***
weekendsum 2.061e-02 1.864e-02 1.106 0.26877
log(m) -5.637e-01 8.377e-02 -6.729 1.71e-11 ***
log(rev) 6.215e-01 8.063e-02 7.708 1.27e-14 ***
month1f:efTRUE -8.953e-02 3.349e-02 -2.674 0.00750 **
month2f:efTRUE 2.813e-02 3.793e-02 0.742 0.45826
month3f:efTRUE -1.275e-03 3.796e-02 -0.034 0.97321
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23138 on 19986 degrees of freedom
AIC: 23182
Number of Fisher Scoring iterations: 6
logis_m1_handcraft_pred = predict(logis_m1_handcraft, TS, type="response")
prediction from a rank-deficient fit may be misleading
colAUC(logis_m1_handcraft_pred, TS$buy) # AUC = 0.759227
[,1]
FALSE vs. TRUE 0.7592278
嘗試使用決策樹方法建立模型
AUC = 0.7448
tree_m1 = rpart(buy ~ . - m - rev + log(m) + log(rev) , TR[,c(9:24,26:32)], method="class",cp=0.001)
tree_1_pred = predict(tree_m1, TS)[,2] # predict prob
colAUC(tree_1_pred, TS$buy) # AUC = 0.74478
[,1]
FALSE vs. TRUE 0.7447816
rpart.plot(tree_m1,cex=0.6)
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
同時建立隨機森林,目的是透過 model ensemble 來觀察組合後的影響。
AUC = 0.7503
rf_m1 = randomForest(buy ~ . , TR[,c(9:24,26:32)], ntree=200, nodesize=25)
The response has five or fewer unique values. Are you sure you want to do regression?
rf_m1_pred = predict(rf_m1, TS, type="response")
colAUC(rf_m1_pred, TS$buy)
[,1]
FALSE vs. TRUE 0.7502786
# AUC = 0.7490941
我們混合了先前嘗試做過的手動挑選變數、決策樹、step 所選擇的羅吉斯回歸
AUC = 0.7591
logisHandcraft_tree_logisStep_m1 <- (logis_m1_handcraft_pred + tree_1_pred + logis_m1_original_steped_pred ) / 3
colAUC(logisHandcraft_tree_logisStep_m1, TS$buy)
[,1]
FALSE vs. TRUE 0.7591006
# AUC = 0.75910
雖然全部變數丟進去的羅吉斯模型 AUC 稍高一些,但變數解釋力較差,故不以該模型為最終成果。
AUC = 0.759227
將變數的交互作用、非線性關係以及共線性問題一併考慮後,手動挑選變數。
Testing R^2 = 0.2729
lm_m2_handcraft = lm(amount ~ . - weeksum - weekendsum - area - frise - f2 - r2 - s2 - s + f*avgbuy + f*ef + realspec*realpoorbitch - age_hij , TR2[,c(9:25,27:32)])
summary(lm_m2_handcraft)
Call:
lm(formula = amount ~ . - weeksum - weekendsum - area - frise -
f2 - r2 - s2 - s + f * avgbuy + f * ef + realspec * realpoorbitch -
age_hij, data = TR2[, c(9:25, 27:32)])
Residuals:
Min 1Q Median 3Q Max
-1.86836 -0.22775 0.04597 0.27865 1.52612
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.284e+00 5.314e-02 24.169 < 2e-16 ***
r 6.973e-04 2.628e-04 2.653 0.007990 **
f 3.230e-02 4.114e-03 7.852 4.56e-15 ***
month1f -1.977e-02 5.285e-03 -3.741 0.000185 ***
month2f -1.188e-02 5.461e-03 -2.176 0.029588 *
month3f NA NA NA NA
realspec 3.591e-01 8.043e-02 4.465 8.11e-06 ***
realpoorbitch -5.123e-03 3.901e-02 -0.131 0.895525
m 4.260e-01 2.969e-02 14.352 < 2e-16 ***
rev 6.775e-02 2.597e-02 2.609 0.009103 **
raw 6.366e-05 9.582e-06 6.644 3.23e-11 ***
ageB 7.573e-02 2.488e-02 3.043 0.002346 **
ageC 1.188e-01 2.284e-02 5.202 2.01e-07 ***
ageD 1.257e-01 2.255e-02 5.573 2.57e-08 ***
ageE 1.341e-01 2.308e-02 5.810 6.45e-09 ***
ageF 1.058e-01 2.409e-02 4.390 1.15e-05 ***
ageG 7.996e-02 2.627e-02 3.044 0.002342 **
ageH 7.600e-02 3.099e-02 2.453 0.014200 *
ageI 7.362e-02 3.185e-02 2.311 0.020844 *
ageJ -2.115e-02 2.798e-02 -0.756 0.449689
ageK 1.113e-01 3.903e-02 2.852 0.004349 **
avgbuy -3.170e-04 7.677e-05 -4.129 3.67e-05 ***
efTRUE -1.539e-02 1.246e-02 -1.235 0.216923
f:avgbuy -3.696e-05 1.959e-05 -1.887 0.059223 .
f:efTRUE 1.260e-03 2.048e-03 0.615 0.538497
realspec:realpoorbitch -3.027e-01 9.989e-02 -3.030 0.002451 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4176 on 9244 degrees of freedom
Multiple R-squared: 0.3045, Adjusted R-squared: 0.3027
F-statistic: 168.7 on 24 and 9244 DF, p-value: < 2.2e-16
r2.tr = summary(lm_m2_handcraft)$r.sq
lm_m2_handcraft_pred <- predict(lm_m2_handcraft, TS2)
prediction from a rank-deficient fit may be misleading
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((lm_m2_handcraft_pred - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.3045320 0.2728831
# TR R^2 = 0.3045 , TS R^2 = 0.2728831
table(TR2$area) # why we need ef
A B C D E F G H
178 199 715 470 3784 2696 820 407
同樣利用 step 函式跑一次系統自動挑選變數的流程。
Testing R^2 = 0.2724
lm_m2_steped <- step(lm_m2_handcraft, direction = "backward")
Start: AIC=-16163.53
amount ~ (r + s + f + month1f + month2f + month3f + weeksum +
weekendsum + frise + realspec + realpoorbitch + m + rev +
raw + age + area + avgbuy + ef + age_hij + r2 + s2 + f2) -
weeksum - weekendsum - area - frise - f2 - r2 - s2 - s +
f * avgbuy + f * ef + realspec * realpoorbitch - age_hij
Step: AIC=-16163.53
amount ~ r + f + month1f + month2f + realspec + realpoorbitch +
m + rev + raw + age + avgbuy + ef + f:avgbuy + f:ef + realspec:realpoorbitch
Df Sum of Sq RSS AIC
- f:ef 1 0.066 1612.0 -16165
<none> 1612.0 -16164
- f:avgbuy 1 0.621 1612.6 -16162
- month2f 1 0.826 1612.8 -16161
- rev 1 1.187 1613.2 -16159
- r 1 1.227 1613.2 -16158
- realspec:realpoorbitch 1 1.601 1613.6 -16156
- month1f 1 2.440 1614.4 -16152
- raw 1 7.697 1619.7 -16121
- age 10 15.338 1627.3 -16096
- m 1 35.919 1647.9 -15961
Step: AIC=-16165.15
amount ~ r + f + month1f + month2f + realspec + realpoorbitch +
m + rev + raw + age + avgbuy + ef + f:avgbuy + realspec:realpoorbitch
Df Sum of Sq RSS AIC
- ef 1 0.206 1612.2 -16166
<none> 1612.0 -16165
- f:avgbuy 1 0.651 1612.7 -16163
- month2f 1 0.824 1612.9 -16162
- rev 1 1.187 1613.2 -16160
- r 1 1.218 1613.2 -16160
- realspec:realpoorbitch 1 1.608 1613.6 -16158
- month1f 1 2.391 1614.4 -16153
- raw 1 7.819 1619.8 -16122
- age 10 15.335 1627.4 -16097
- m 1 35.886 1647.9 -15963
Step: AIC=-16165.96
amount ~ r + f + month1f + month2f + realspec + realpoorbitch +
m + rev + raw + age + avgbuy + f:avgbuy + realspec:realpoorbitch
Df Sum of Sq RSS AIC
<none> 1612.2 -16166
- f:avgbuy 1 0.645 1612.9 -16164
- month2f 1 0.803 1613.0 -16163
- rev 1 1.089 1613.3 -16162
- r 1 1.234 1613.5 -16161
- realspec:realpoorbitch 1 1.652 1613.9 -16158
- month1f 1 2.397 1614.6 -16154
- raw 1 7.841 1620.1 -16123
- age 10 15.515 1627.8 -16097
- m 1 37.188 1649.4 -15957
summary(lm_m2_steped)
Call:
lm(formula = amount ~ r + f + month1f + month2f + realspec +
realpoorbitch + m + rev + raw + age + avgbuy + f:avgbuy +
realspec:realpoorbitch, data = TR2[, c(9:25, 27:32)])
Residuals:
Min 1Q Median 3Q Max
-1.86889 -0.22851 0.04612 0.28001 1.52183
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.272e+00 5.217e-02 24.379 < 2e-16 ***
r 6.989e-04 2.628e-04 2.660 0.00783 **
f 3.319e-02 3.870e-03 8.576 < 2e-16 ***
month1f -1.953e-02 5.267e-03 -3.708 0.00021 ***
month2f -1.171e-02 5.459e-03 -2.146 0.03192 *
realspec 3.611e-01 8.041e-02 4.491 7.17e-06 ***
realpoorbitch -4.117e-03 3.901e-02 -0.106 0.91595
m 4.299e-01 2.944e-02 14.604 < 2e-16 ***
rev 6.445e-02 2.579e-02 2.499 0.01248 *
raw 6.412e-05 9.562e-06 6.706 2.12e-11 ***
ageB 7.631e-02 2.488e-02 3.068 0.00216 **
ageC 1.197e-01 2.283e-02 5.242 1.62e-07 ***
ageD 1.268e-01 2.253e-02 5.628 1.87e-08 ***
ageE 1.353e-01 2.306e-02 5.868 4.55e-09 ***
ageF 1.067e-01 2.408e-02 4.429 9.58e-06 ***
ageG 8.072e-02 2.626e-02 3.074 0.00212 **
ageH 7.649e-02 3.098e-02 2.469 0.01358 *
ageI 7.401e-02 3.184e-02 2.324 0.02014 *
ageJ -2.023e-02 2.797e-02 -0.723 0.46952
ageK 1.109e-01 3.758e-02 2.952 0.00317 **
avgbuy -3.145e-04 7.671e-05 -4.100 4.17e-05 ***
f:avgbuy -3.759e-05 1.954e-05 -1.923 0.05446 .
realspec:realpoorbitch -3.073e-01 9.982e-02 -3.078 0.00209 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4176 on 9246 degrees of freedom
Multiple R-squared: 0.3044, Adjusted R-squared: 0.3028
F-statistic: 183.9 on 22 and 9246 DF, p-value: < 2.2e-16
r3.tr = summary(lm_m2_steped)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm_m2_steped, TS2) - TS2$amount)^2)
r3.ts = 1 - (SSE/SST)
c(r3.tr, r3.ts)
[1] 0.3044146 0.2724408
# TR R^2 = 0.3044 , TS R^2 = 0.2724408
利用 rpart 跑一次決策樹模型。
Testing R^2 = 0.2563
tree_m2 = rpart(amount ~ . , TR2[,c(9:25,27:32)], cp=0.002)
tree_m2_pred <- predict(tree_m2, TS2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((tree_m2_pred - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2562628
# TS R^2 = 0.2562628
利用 randomForest 跑一次 200 棵樹的隨機森林模型。
Testing R^2 = 0.2748
rf_m2 = randomForest(amount ~ . , TR2[,c(9:25,27:32)], ntree=200, nodesize=25)
rf_m2_pred = predict(rf_m2, TS2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((rf_m2_pred - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2747813
# TS R^2 = 0.2738538
將手動挑選模型、決策樹模型以及隨機森林模型之預測值相加後除以三,作為混合模型
Testing R^2 = 0.2813
lmHandcraft_tree_rf_m2 <- (lm_m2_handcraft_pred + tree_m2_pred + rf_m2_pred ) / 3
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((lmHandcraft_tree_rf_m2 - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.281278
# TS R^2 = 0.2810461
最後我們將三種模型組合起來,得到了最終答案。
Testing R^2 = 0.2813