模型訓練與測試流程

Fig-1: The First Model

Fig-1: The First Model


Loading & Preparing Data

Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
Loading Data
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
Spliting for Classification
TR = subset(A, spl)
TS = subset(A, !spl)


Classification Model

glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial()) 
summary(glm1)

Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
    11)])

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-3.793  -0.873  -0.699   1.038   1.873  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.2593126  0.1261221   -9.98   <2e-16 ***
r           -0.0122705  0.0008951  -13.71   <2e-16 ***
s            0.0095658  0.0009101   10.51   <2e-16 ***
f            0.2905112  0.0159334   18.23   <2e-16 ***
m           -0.0000303  0.0000278   -1.09   0.2756    
rev          0.0000409  0.0000194    2.11   0.0352 *  
raw         -0.0002306  0.0000856   -2.69   0.0071 ** 
ageB        -0.0419432  0.0866577   -0.48   0.6284    
ageC         0.0177179  0.0799239    0.22   0.8246    
ageD         0.0770457  0.0792142    0.97   0.3307    
ageE         0.0869859  0.0813197    1.07   0.2848    
ageF         0.0192837  0.0845670    0.23   0.8196    
ageG         0.0174470  0.0932309    0.19   0.8516    
ageH         0.1751644  0.1093734    1.60   0.1093    
ageI         0.0617744  0.1174900    0.53   0.5990    
ageJ         0.2652147  0.1047072    2.53   0.0113 *  
ageK        -0.1419182  0.1498084   -0.95   0.3435    
areaB       -0.0410521  0.1321265   -0.31   0.7560    
areaC       -0.2074662  0.1044624   -1.99   0.0470 *  
areaD        0.0380134  0.1110612    0.34   0.7321    
areaE        0.2598788  0.0968223    2.68   0.0073 ** 
areaF        0.1817243  0.0975301    1.86   0.0624 .  
areaG       -0.0467700  0.1044621   -0.45   0.6544    
areaH       -0.1694538  0.1232346   -1.38   0.1691    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27629  on 20007  degrees of freedom
Residual deviance: 23295  on 19984  degrees of freedom
AIC: 23343

Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3730  873
  TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
[1] 0.69998
colAUC(pred, TS$buy)                                   # 0.7556
                 [,1]
FALSE vs. TRUE 0.7556


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)

Call:
lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8330 -0.2281  0.0485  0.2810  1.6424 

Coefficients:
              Estimate Std. Error t value     Pr(>|t|)    
(Intercept)  1.1403704  0.0504979   22.58      < 2e-16 ***
r            0.0000702  0.0003090    0.23       0.8203    
s            0.0001173  0.0003123    0.38       0.7072    
f            0.0256836  0.0017965   14.30      < 2e-16 ***
m            0.5045943  0.0372711   13.54      < 2e-16 ***
rev          0.0450307  0.0360945    1.25       0.2122    
ageB         0.0737926  0.0251165    2.94       0.0033 ** 
ageC         0.1204660  0.0230651    5.22 0.0000001800 ***
ageD         0.1264592  0.0227496    5.56 0.0000000279 ***
ageE         0.1382214  0.0232522    5.94 0.0000000029 ***
ageF         0.1085828  0.0242698    4.47 0.0000077690 ***
ageG         0.0787808  0.0264917    2.97       0.0029 ** 
ageH         0.0703242  0.0312462    2.25       0.0244 *  
ageI         0.0694822  0.0321119    2.16       0.0305 *  
ageJ        -0.0284007  0.0282282   -1.01       0.3144    
ageK         0.1124434  0.0395589    2.84       0.0045 ** 
areaB        0.0789586  0.0435321    1.81       0.0697 .  
areaC        0.0375241  0.0353641    1.06       0.2887    
areaD       -0.0111101  0.0371762   -0.30       0.7651    
areaE        0.0111809  0.0325803    0.34       0.7315    
areaF        0.0147066  0.0328141    0.45       0.6540    
areaG        0.0249228  0.0349567    0.71       0.4759    
areaH        0.0105550  0.0388962    0.27       0.7861    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.422 on 9246 degrees of freedom
Multiple R-squared:  0.291, Adjusted R-squared:  0.289 
F-statistic:  172 on 22 and 9246 DF,  p-value: <2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.29099 0.25760







LS0tDQp0aXRsZTogIkZpcnN0IE1vZGVsLCBUYS1GZW5nIg0KYXV0aG9yOiAi5Y2T6ZuN54S2LCDkuK3lsbHlpKflrbgg566h55CG5a246KGT56CU56m25Lit5b+DIg0KZGF0ZTogImByIFN5cy50aW1lKClgIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KDQojIyMg5qih5Z6L6KiT57e06IiH5ris6Kmm5rWB56iLDQoNCjxjZW50ZXI+DQoNCiFbRmlnLTE6IFRoZSBGaXJzdCBNb2RlbF0oZmlnL21vZGVsaW5nLmpwZykNCg0KPC9jZW50ZXI+DQoNCjxocj4NCg0KIyMjIExvYWRpbmcgJiBQcmVwYXJpbmcgRGF0YQ0KDQpgYGB7ciBlY2hvPVQsIG1lc3NhZ2U9RiwgY2FjaGU9Riwgd2FybmluZz1GfQ0KU3lzLnNldGxvY2FsZSgiTENfQUxMIiwiQyIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjYVRvb2xzKQ0KYGBgDQoNCiMjIyMjIExvYWRpbmcgRGF0YQ0KYGBge3J9DQpybShsaXN0PWxzKGFsbD1UUlVFKSkNCmxvYWQoImRhdGEvdGYyLnJkYXRhIikNCmBgYA0KDQojIyMjIyBTcGxpdGluZyBmb3IgQ2xhc3NpZmljYXRpb24gDQpgYGB7cn0NClRSID0gc3Vic2V0KEEsIHNwbCkNClRTID0gc3Vic2V0KEEsICFzcGwpDQpgYGANCjxicj48aHI+DQoNCiMjIyBDbGFzc2lmaWNhdGlvbiBNb2RlbA0KYGBge3J9DQpnbG0xID0gZ2xtKGJ1eSB+IC4sIFRSWyxjKDI6OSwgMTEpXSwgZmFtaWx5PWJpbm9taWFsKCkpIA0Kc3VtbWFyeShnbG0xKQ0KcHJlZCA9ICBwcmVkaWN0KGdsbTEsIFRTLCB0eXBlPSJyZXNwb25zZSIpDQpjbSA9IHRhYmxlKGFjdHVhbCA9IFRTJGJ1eSwgcHJlZGljdCA9IHByZWQgPiAwLjUpOyBjbQ0KYWNjLnRzID0gY20gJT4lIHtzdW0oZGlhZyguKSkvc3VtKC4pfTsgYWNjLnRzICAgICAgICAgICMgMC42OTk5OA0KY29sQVVDKHByZWQsIFRTJGJ1eSkgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgMC43NTU2DQpgYGANCjxicj48aHI+DQoNCiMjIyBSZWdyZXNzaW9uIE1vZGVsDQpgYGB7cn0NCkEyID0gc3Vic2V0KEEsIEEkYnV5KSAlPiUgbXV0YXRlX2F0KGMoIm0iLCJyZXYiLCJhbW91bnQiKSwgbG9nMTApDQpUUjIgPSBzdWJzZXQoQTIsIHNwbDIpDQpUUzIgPSBzdWJzZXQoQTIsICFzcGwyKQ0KYGBgDQoNCmBgYHtyfQ0KbG0xID0gbG0oYW1vdW50IH4gLiwgVFIyWyxjKDI6Niw4OjEwKV0pDQpzdW1tYXJ5KGxtMSkNCmBgYA0KDQpgYGB7cn0NCnIyLnRyID0gc3VtbWFyeShsbTEpJHIuc3ENClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KGxtMSwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KcjIudHMgPSAxIC0gKFNTRS9TU1QpDQpjKHIyLnRyLCByMi50cykNCmBgYA0KPGJyPjxicj48YnI+PGhyPjxicj48YnI+PGJyPg0KDQo8c3R5bGU+DQoNCi5jYXB0aW9uIHsNCiAgY29sb3I6ICM3Nzc7DQogIG1hcmdpbi10b3A6IDEwcHg7DQp9DQpwIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnByZSB7DQogIHdvcmQtYnJlYWs6IG5vcm1hbDsNCiAgd29yZC13cmFwOiBub3JtYWw7DQogIGxpbmUtaGVpZ2h0OiAxOw0KfQ0KcHJlIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnAsbGkgew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KLnJ7DQogIGxpbmUtaGVpZ2h0OiAxLjI7DQp9DQoNCi5xaXogew0KICBsaW5lLWhlaWdodDogMS43NTsNCiAgYmFja2dyb3VuZDogI2YwZjBmMDsNCiAgYm9yZGVyLWxlZnQ6IDEycHggc29saWQgI2NjZmZjYzsNCiAgcGFkZGluZzogNHB4Ow0KICBwYWRkaW5nLWxlZnQ6IDEwcHg7DQogIGNvbG9yOiAjMDA5OTAwOw0KfQ0KDQp0aXRsZXsNCiAgY29sb3I6ICNjYzAwMDA7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpib2R5ew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KaDEsaDIsaDMsaDQsaDV7DQogIGNvbG9yOiAjMDA2NmZmOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KDQpoM3sNCiAgY29sb3I6ICMwMDg4MDA7DQogIGJhY2tncm91bmQ6ICNlNmZmZTY7DQogIGxpbmUtaGVpZ2h0OiAyOw0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KaDV7DQogIGNvbG9yOiAjMDA2MDAwOw0KICBiYWNrZ3JvdW5kOiAjZjhmOGY4Ow0KICBsaW5lLWhlaWdodDogMS41Ow0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg0K