rm(list=ls(all=T))
options(digits=3, scipen=40)
library(caTools)
library(magrittr)
\[ \sigma = \sqrt{\frac{s^2 \pi^2}{3}} \quad ;\quad s = \sqrt{\frac{3\sigma^2}{\pi^2}}\]
par(cex=0.7, mfcol=c(2,2), mar=c(4,4,4,2))
mu=100; sigma=10; s = sqrt(3 * sigma^2 / pi^2)
curve(dlogis(x,mu,s),60,140,col='blue',lwd=2,xlab="",ylab="",
main="Logistic Dist. over WTP")
curve(1-plogis(x,mu,s),60,140,col='cyan',lwd=2,xlab="",ylab="",
main="Logistic Price Response Function")
curve(dnorm(x,mu,sigma),60,140,col='blue',lwd=2,xlab="",ylab="",
main="Normal Dist. over WTP")
curve(1-pnorm(x,mu,sigma),60,140,col='cyan',lwd=2,xlab="",ylab="",
main="Probit Price Response Function")
\(value = utility - price = V\)
\(V \sim \mathit{logistic} \rightarrow Pr[buy] \sim \mathit{Logistic}\)
\(Pr[buy] = \frac{1}{1+exp(-V)}\)
par(cex=0.8, mar=c(4,4,4,2))
curve(plogis(x,0,4),-30,30,col='cyan',lwd=3,xlab="Value",ylab="Prob[buy]",
main="Logistic Function")
abline(v=seq(-30,30,5), h=seq(0,1,0.1), col='lightgray',lty=3)
points(0,0.5,pch=20,cex=2.5,col='orange')
Value as a function of product attribures - \(V(x_1, x_2, ..., price)\) where \(x\)’s are product option items - - -
1.將資料中要加入的變數先做線性迴歸模型,得出linear function: \(f(x)=y=b_0+b_1 x_1+⋯+b_k x_k\) 這裡的概念是透過一些屬性\((x_1,x_2,x_3…,x_k)\)來得出顧客對這次訂單的效用(y)
2.由剛剛線性迴歸得出的效用(y)代入logistic function可轉換出「購買機率(p)」 p= 1/(1+Exp(-f(X)))
接下來以一份顧客資料來實際操作模型。
bd <- read.csv("RawShort1.csv")
head(bd)
mod1 <- glm(Order ~ Time+Quantity+PricePerLb, data=bd, family=binomial)
summary(mod1)
Call:
glm(formula = Order ~ Time + Quantity + PricePerLb, family = binomial,
data = bd)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.332 -1.083 -0.805 1.221 3.175
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.64640 0.05509 11.7 <0.0000000000000002 ***
Time -0.03106 0.00259 -12.0 <0.0000000000000002 ***
Quantity -0.48567 0.02522 -19.3 <0.0000000000000002 ***
PricePerLb -0.19365 0.01514 -12.8 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 21532 on 15886 degrees of freedom
Residual deviance: 20702 on 15883 degrees of freedom
AIC: 20710
Number of Fisher Scoring iterations: 5
coef(mod1)
(Intercept) Time Quantity PricePerLb
0.6464 -0.0311 -0.4857 -0.1936
\[V(x) = 0.646 - 0.031 Time - 0.4857 Qty - 0.1937 Price\] \[\text{Logit (log odd):} \quad V(x) = b_0 + b_1 x_1 + ... + b_k x_k = logit = log(odd) = log(\frac{p}{1-p})\]
\[\text{Prob. of Buying:} \quad p = \mathfrak{L}(logit) = \frac{1}{1+exp(-V)}\]
bd$logOdd = predict(mod1, bd)
bd$ProbBuy = predict(mod1, bd, type='response')
head(bd)
1/(1+exp(-bd$logOdd[1:6]))
[1] 0.149 0.367 0.472 0.362 0.505 0.387
從迴歸係數估計預測變數對購買機率的邊際效果
\[ \text{odd} = e^{b_0+b_1x_1} \\ e^{b_0+b_1(x_1+1)} = e^{b_0+b_1x_1+b_1} = e^{b_0+b_1x_1} \times e^{b_1}\]
迴歸係數可以透過指數函數轉換為勝率比
coef(mod1) %>% exp
(Intercept) Time Quantity PricePerLb
1.909 0.969 0.615 0.824
注意:勝率和機率之間沒有固定的比率關係
\[ o = \frac{p}{1-p} \quad ; \quad p = \frac{o}{1+o} \]
par(mfcol= c(1,2), cex=0.8)
curve(x/(1-x), 0.01, 0.99, xlab="prob", ylab="odd")
curve(x/(1+x), 0.01, 100, xlab="odd", ylab="prob")
k = 2
p = seq(0.01, 0.99, 0.01)
o = p/(1-p)
pko = (k*o)/(1+k*o)
plot(p, pko - p, type='l', xlab="prob", ylab="delta prob", lwd=2, col="cyan")
par(cex=0.8)
plot(0.5,0,pch=20,col='gray',xlim=c(0,1),ylim=c(-0.8,0.8),
xlab="base prob.",ylab="prob. increment",
main = "Odd Ratio = 50, 10, 5, 2, 1, 1/2, 1/5, 1/10, 1/50")
abline(h=seq(-0.8,0.8,0.1), v=seq(0,1,0.05), col='lightgray', lty=3 )
p = seq(0.01, 0.99, 0.01)
for(k in c(1/50, 1/10, 1/5, 1/2, 1, 2, 5, 10, 50)) {
o = p/(1-p)
pko = (k*o)/(1+k*o)
lines(p, pko - p, lwd=2)
}
par(cex=0.8)
plot(0.5,0.5,pch=20,col='gray',xlim=c(0,1),ylim=c(0,1),
xlab="base prob.",ylab="resultant prob.",
main = "Odd Ratio = 50, 10, 5, 2, 1, 1/2, 1/5, 1/10, 1/50")
abline(h=seq(0,1,0.1), v=seq(0,1,0.1), col='lightgray', lty=3 )
p = seq(0.01, 0.99, 0.01)
for(k in c(1/50, 1/10, 1/5, 1/2, 1, 2, 5, 10, 50)) {
o = p/(1-p)
pko = (k*o)/(1+k*o)
lines(p, pko, lwd=2)
}
expProfit = function(price, mod, data) {
data$PricePerLb = price
(price - data$CostPerLb) * predict(mod, data, type="response")
}
expProfit(4.5, mod1, bd[1,])
1
0.29
mean(bd$PricePerLb)
[1] 3.15
optim(4, expProfit, method="BFGS", control=list(fnscale=-1),
mod=mod1, data=bd[1,] )
$`par`
[1] 7.09
$value
[1] 0.345
$counts
function gradient
8 7
$convergence
[1] 0
$message
NULL
bd[1,]
price = seq(4, 10, 0.2)
prob = predict(mod1, data.frame(Time=9, Quantity=3.5, PricePerLb=price), type="response")
expReturn = prob * (price - 1.58)
plot(price, expReturn , type='l')
bd$gain = (bd$LagPrice - bd$PricePerLb)*(bd$LagPrice > bd$PricePerLb)
bd$loss = (bd$PricePerLb - bd$LagPrice)*(bd$PricePerLb > bd$LagPrice)
mod2 = glm(Order ~ Time+Quantity+gain+loss+Quantity:gain+Quantity:loss,
data=bd, family=binomial)
summary(mod2)
Call:
glm(formula = Order ~ Time + Quantity + gain + loss + Quantity:gain +
Quantity:loss, family = binomial, data = bd)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.544 -1.081 -0.751 1.206 3.176
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.13223 0.02789 4.74 0.00000213 ***
Time -0.03248 0.00261 -12.43 < 0.0000000000000002 ***
Quantity -0.50230 0.03079 -16.31 < 0.0000000000000002 ***
gain 0.10646 0.02124 5.01 0.00000054 ***
loss -0.32350 0.02513 -12.87 < 0.0000000000000002 ***
Quantity:gain 0.05545 0.01818 3.05 0.0023 **
Quantity:loss -0.13806 0.07437 -1.86 0.0634 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 21532 on 15886 degrees of freedom
Residual deviance: 20475 on 15880 degrees of freedom
AIC: 20489
Number of Fisher Scoring iterations: 5
par(cex=0.8)
data.frame(
model1 = predict(mod1, bd, type="response"),
model2 = predict(mod2, bd, type="response")
) %>% colAUC(bd$Order, plotROC=TRUE)
model1 model2
0 vs. 1 0.614 0.633
expProfit2 = function(price, mod, data) {
data$gain = (price - data$PricePerLb)*(data$LagPrice > price)
data$loss = (price - data$LagPrice)*(price > data$LagPrice)
(price - data$CostPerLb) * predict(mod, data, type="response")
}
expProfit2(4.5, mod2, bd[1,])
1
0.0655
optim(4, expProfit2, method="BFGS", control=list(fnscale=-1),
mod=mod2, data=bd[1,] )
$`par`
[1] 2.93
$value
[1] 0.101
$counts
function gradient
10 7
$convergence
[1] 0
$message
NULL