Fig-1: The First Model
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
11)])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.793 -0.873 -0.699 1.038 1.873
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.2593126 0.1261221 -9.98 <2e-16 ***
r -0.0122705 0.0008951 -13.71 <2e-16 ***
s 0.0095658 0.0009101 10.51 <2e-16 ***
f 0.2905112 0.0159334 18.23 <2e-16 ***
m -0.0000303 0.0000278 -1.09 0.2756
rev 0.0000409 0.0000194 2.11 0.0352 *
raw -0.0002306 0.0000856 -2.69 0.0071 **
ageB -0.0419432 0.0866577 -0.48 0.6284
ageC 0.0177179 0.0799239 0.22 0.8246
ageD 0.0770457 0.0792142 0.97 0.3307
ageE 0.0869859 0.0813197 1.07 0.2848
ageF 0.0192837 0.0845670 0.23 0.8196
ageG 0.0174470 0.0932309 0.19 0.8516
ageH 0.1751644 0.1093734 1.60 0.1093
ageI 0.0617744 0.1174900 0.53 0.5990
ageJ 0.2652147 0.1047072 2.53 0.0113 *
ageK -0.1419182 0.1498084 -0.95 0.3435
areaB -0.0410521 0.1321265 -0.31 0.7560
areaC -0.2074662 0.1044624 -1.99 0.0470 *
areaD 0.0380134 0.1110612 0.34 0.7321
areaE 0.2598788 0.0968223 2.68 0.0073 **
areaF 0.1817243 0.0975301 1.86 0.0624 .
areaG -0.0467700 0.1044621 -0.45 0.6544
areaH -0.1694538 0.1232346 -1.38 0.1691
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23295 on 19984 degrees of freedom
AIC: 23343
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
[1] 0.69998
colAUC(pred, TS$buy) # 0.7556
[,1]
FALSE vs. TRUE 0.7556
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
Call:
lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
Residuals:
Min 1Q Median 3Q Max
-1.8330 -0.2281 0.0485 0.2810 1.6424
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1403704 0.0504979 22.58 < 2e-16 ***
r 0.0000702 0.0003090 0.23 0.8203
s 0.0001173 0.0003123 0.38 0.7072
f 0.0256836 0.0017965 14.30 < 2e-16 ***
m 0.5045943 0.0372711 13.54 < 2e-16 ***
rev 0.0450307 0.0360945 1.25 0.2122
ageB 0.0737926 0.0251165 2.94 0.0033 **
ageC 0.1204660 0.0230651 5.22 0.0000001800 ***
ageD 0.1264592 0.0227496 5.56 0.0000000279 ***
ageE 0.1382214 0.0232522 5.94 0.0000000029 ***
ageF 0.1085828 0.0242698 4.47 0.0000077690 ***
ageG 0.0787808 0.0264917 2.97 0.0029 **
ageH 0.0703242 0.0312462 2.25 0.0244 *
ageI 0.0694822 0.0321119 2.16 0.0305 *
ageJ -0.0284007 0.0282282 -1.01 0.3144
ageK 0.1124434 0.0395589 2.84 0.0045 **
areaB 0.0789586 0.0435321 1.81 0.0697 .
areaC 0.0375241 0.0353641 1.06 0.2887
areaD -0.0111101 0.0371762 -0.30 0.7651
areaE 0.0111809 0.0325803 0.34 0.7315
areaF 0.0147066 0.0328141 0.45 0.6540
areaG 0.0249228 0.0349567 0.71 0.4759
areaH 0.0105550 0.0388962 0.27 0.7861
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.422 on 9246 degrees of freedom
Multiple R-squared: 0.291, Adjusted R-squared: 0.289
F-statistic: 172 on 22 and 9246 DF, p-value: <2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.29099 0.25760