rm(list=ls(all=T))
options(digits=4, scipen=12)
library(dplyr); library(ggplot2)
Odd = \(p/(1-p)\)
Logit = \(log(odd)\) = \(log(\frac{p}{1=p})\)
\(o = p/(1-p)\) ; \(p = o/(1+o)\) ; \(logit = log(o)\)
par(cex=0.8, mfcol=c(1,2))
curve(x/(1-x), 0.02, 0.98, col='cyan',lwd=2, main='odd')
abline(v=seq(0,1,0.1), h=seq(0,50,5), col='lightgray', lty=3)
curve(log(x/(1-x)), 0.005, 0.995, lwd=2, col='purple', main="logit")
abline(v=seq(0,1,0.1), h=seq(-5,5,1), col='lightgray', lty=3)
Linear Model: \(y = f(x) = b_0 + b_1x_1 + b_2x_2 + ...\)
General Linear Model(GLM): \(y = Link(f(x))\)
Logistic Regression: \(logit(y) = log(\frac{p}{1-p}) = f(x) \text{ where } p = prob[y=1]\)
Logistic Function: \(Logistic(F_x) = \frac{1}{1+Exp(-F_x)} = \frac{Exp(F_x)}{1+Exp(F_x)}\)
par(cex=0.8)
curve(1/(1+exp(-x)), -5, 5, col='blue', lwd=2,main="Logistic Function",
xlab="f(x): the logit of y = 1", ylab="the probability of y = 1")
abline(v=-5:5, h=seq(0,1,0.1), col='lightgray', lty=2)
abline(v=0,h=0.5,col='pink')
points(0,0.5,pch=20,cex=1.5,col='red')
【Q】What are the definiion of ligit & logistic function? What is the relationship between them?
【Ans】logistic function 是把ligit轉換成機率的公式
glm(, family=binomial)glm()的功能:在 \(\{x\}\) 的空間之中,找出區隔 \(y\) 的(類別)界線
Q = read.csv("data/quality.csv") # Read in dataset
glm1 = glm(PoorCare~OfficeVisits+Narcotics, Q, family=binomial)
summary(glm1)
Call:
glm(formula = PoorCare ~ OfficeVisits + Narcotics, family = binomial,
data = Q)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.377 -0.627 -0.510 -0.154 2.119
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.5402 0.4500 -5.64 0.000000017 ***
OfficeVisits 0.0627 0.0240 2.62 0.00892 **
Narcotics 0.1099 0.0326 3.37 0.00076 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 147.88 on 130 degrees of freedom
Residual deviance: 116.45 on 128 degrees of freedom
AIC: 122.4
Number of Fisher Scoring iterations: 5
b = coef(glm1); b # extract the regression coef
(Intercept) OfficeVisits Narcotics
-2.54021 0.06273 0.10990
Given OfficeVisits=3, Narcotics=4, what are the predicted logit, odd and probability? #代表x(變數)值
logit = sum(b * c(1, 3, 4)) #係數b*x(變數)的值c
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
odd = exp(logit) #ln(odd)=logit
prob = odd/(1+odd)
c(logit=logit, odd=odd, prob=prob)
logit odd prob
-1.9124 0.1477 0.1287
【Q】What if OfficeVisits=2, Narcotics=3?
【Ans】-2.08, 0.12 0.11
#-2.08, 0.12 0.11
logit = sum(b * c(1, 2, 3))
odd = exp(logit)
prob = odd/(1+odd)
c(logit=logit, odd=odd, prob=prob)
logit odd prob
-2.0851 0.1243 0.1106
We can plot the line of logit = 0 or prob = 0.5 on the plane of \(X\)
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
plot(Q$OfficeVisits, Q$Narcotics, col=1+Q$PoorCare,pch=20) #R的顏色從1開始算,但PoorCare是factor(o,1)而0無法辨識,所以給他加1 #pch是點的形狀
abline(-b[1]/b[3], -b[2]/b[3])
Furthermore, we can translate probability, logit and coefficents to intercept & slope …
\[f(x) = b_1 + b_2 x_2 + b_3 x_3 = g \Rightarrow x_3 = \frac{g - b_1}{b_3} - \frac{b_2}{b_3}x_2\]
p = seq(0.1,0.9,0.1) #連續做,從0.1到0.9再到0.1
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
logit = log(p/(1-p))
data.frame(prob = p, logit)
prob logit
1 0.1 -2.1972
2 0.2 -1.3863
3 0.3 -0.8473
4 0.4 -0.4055
5 0.5 0.0000
6 0.6 0.4055
7 0.7 0.8473
8 0.8 1.3863
9 0.9 2.1972
then mark the contours of proabilities into the scatter plot
par(cex=0.7)
plot(Q$OfficeVisits, Q$Narcotics, col=1+Q$PoorCare,
pch=20, cex=1.3, xlab='X2', ylab='X3')
for(g in logit) {
abline((g-b[1])/b[3], -b[2]/b[3], col=ifelse(g==0,'blue','cyan')) }
【Q】What do the blue/cyan lines means?
【Ans】Threshold機率=0.1,0.2,0.3,…,0.9
【Q】Given any point in the figure above, how can you tell its (predicted) probability approximately?
【Ans】0.7
Figure 1 - Confusion Matrix
Confusion matrix is not fixed. It changes by Threshold …
Figure 2 - Dist. Prediected Prob.
library(caTools)
DPP2 = function(pred,class,tvalue,breaks=0.01) {
mx = table(class == tvalue, pred > 0.5)
tn = sum(class != tvalue & pred <= 0.5)
fn = sum(class == tvalue & pred <= 0.5)
fp = sum(class != tvalue & pred > 0.5)
tp = sum(class == tvalue & pred > 0.5)
acc = (tn + tp)/length(pred)
sens = tp/(fn+tp)
spec = tn/(tn+fp)
auc = colAUC(pred,class)
data.frame(pred,class) %>%
ggplot(aes(x=pred, fill=class)) +
geom_histogram(col='gray',alpha=0.5,breaks=seq(0,1,breaks)) +
xlim(0,1) + theme_bw() + xlab("predicted probability") +
ggtitle(
sprintf("Distribution of Prob[class = \'%s\']", tvalue),
sprintf("AUC=%.3f, Acc=%.3f, Sens=%.3f, Spec=%.3f",
auc, acc, sens, spec) )
}
N1 = 300; N2 = 100 #sample size
DPP2(pred = c(rnorm(N1,0.125,0.03), rnorm(N2,0.375,0.03)), #變數名,平均值,標準差
class = c(rep('B',N1), rep('A',N2)),
tvalue = 'A')
【Q】Is it possible to have AUC = ACC = SENS = SPEC = 1? Can you modify the code to make it happen?
【Ans】
# 0.25, 0.75
N1 = 300; N2 = 100
DPP2(pred = c(rnorm(N1,0.25,0.03), rnorm(N2,0.75,0.03)),
class = c(rep('B',N1), rep('A',N2)),
tvalue = 'A')
【Q】Is it possible to have AUC = ACC = SENS = SPEC = 0? Can you modify the code to make that happen?
【Ans】
#
N1 = 300; N2 = 100
DPP2(pred = c(rnorm(N1,0.9,0.03), rnorm(N2,0.1,0.03)),
class = c(rep('B',N1), rep('A',N2)),
tvalue = 'A')
set.seed(88)
split = sample.split(Q$PoorCare, SplitRatio = 0.75)
table(split) %>% prop.table()
split
FALSE TRUE
0.2443 0.7557
table(y = Q$PoorCare, split) %>% prop.table(2)
split
y FALSE TRUE
0 0.7500 0.7475
1 0.2500 0.2525
TR = subset(Q, split == TRUE)
TS = subset(Q, split == FALSE)
glm1 = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
summary(glm1)
Call:
glm(formula = PoorCare ~ OfficeVisits + Narcotics, family = binomial,
data = TR)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0630 -0.6316 -0.5050 -0.0969 2.1669
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.6461 0.5236 -5.05 0.00000043 ***
OfficeVisits 0.0821 0.0305 2.69 0.0072 **
Narcotics 0.0763 0.0321 2.38 0.0173 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 111.888 on 98 degrees of freedom
Residual deviance: 89.127 on 96 degrees of freedom
AIC: 95.13
Number of Fisher Scoring iterations: 4
pred = predict(glm1, type='response')
mx = table(TR$PoorCare, pred > 0.5); mx
FALSE TRUE
0 70 4
1 15 10
c(accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
accuracy sensitivity specificity
0.8081 0.4000 0.9459
library(ROCR)
ROCRpred = prediction(pred, TR$PoorCare)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
par(cex=0.8)
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,0.1))
as.numeric(performance(ROCRpred, "auc")@y.values)
[1] 0.7746
caTools::colAUC(pred, TR$PoorCare)
[,1]
0 vs. 1 0.7746
source("DPP.R")
F = read.csv("data/framingham.csv")
set.seed(1000)
split = sample.split(F$TenYearCHD, SplitRatio = 0.65)
TR = subset(F, split==TRUE)
TS = subset(F, split==FALSE)
glm2 = glm(TenYearCHD ~ ., TR, family=binomial)
summary(glm2)
Call:
glm(formula = TenYearCHD ~ ., family = binomial, data = TR)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.849 -0.601 -0.426 -0.284 2.837
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.88657 0.89073 -8.85 < 2e-16 ***
male 0.52846 0.13544 3.90 0.0000955212349 ***
age 0.06206 0.00834 7.44 0.0000000000001 ***
education -0.05892 0.06243 -0.94 0.3453
currentSmoker 0.09324 0.19401 0.48 0.6308
cigsPerDay 0.01501 0.00783 1.92 0.0551 .
BPMeds 0.31122 0.28741 1.08 0.2789
prevalentStroke 1.16579 0.57121 2.04 0.0413 *
prevalentHyp 0.31582 0.17176 1.84 0.0660 .
diabetes -0.42149 0.40799 -1.03 0.3016
totChol 0.00384 0.00138 2.79 0.0053 **
sysBP 0.01134 0.00457 2.48 0.0130 *
diaBP -0.00474 0.00800 -0.59 0.5535
BMI 0.01072 0.01616 0.66 0.5069
heartRate -0.00810 0.00531 -1.52 0.1274
glucose 0.00893 0.00284 3.15 0.0016 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2020.7 on 2384 degrees of freedom
Residual deviance: 1792.3 on 2369 degrees of freedom
(371 observations deleted due to missingness)
AIC: 1824
Number of Fisher Scoring iterations: 5
pred = predict(glm2, TS, type="response")
y = TS$TenYearCHD[!is.na(pred)] # remove NA
pred = pred[!is.na(pred)]
mx = table(y, pred > 0.5); mx
y FALSE TRUE
0 1069 6
1 187 11
c(accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
accuracy sensitivity specificity
0.84839 0.05556 0.99442
par(cex=0.7)
auc = DPP(pred, y, 1, b=seq(0,1,0.02)) # 0.74211
Figure 3 - Startegic Optimization
range(pred)
[1] 0.01641 0.72958
payoff = matrix(c(0,-100,-10,-60),2,2)
cutoff = seq(0.02, 0.64, 0.01)
result = sapply(cutoff, function(p) sum(table(y,pred>p)*payoff) )
i = which.max(result)
par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.05),h=seq(-23000,-17000,500),col='lightgray',lty=3)
abline(v=cutoff[i],col='red')
【Q】如果什麼都不做,期望報酬是多少?
【Ans】-$19800
TN=1069
TP=11
FN=187
FP=6
risk=-100
havepill=-60
cost=0
TN*0 + TP*risk + FN*risk + FP*cost
[1] -19800
【Q】如果每位病人都做呢?
【Ans】-$32090
TN=1069
TP=11
FN=187
FP=6
Total=1069+11+187+6
risk=-100
havepill=-60
cost=-10
TN*0 + TP*havepill + FN*risk + Total*cost
[1] -32090
【Q】以上哪一種做法期望報酬比較高?
【Ans】每個人都不吃
【Q】在所有的商務情境都是這種狀況嗎?
【Ans】不是,根據不同商務情況會有不同報酬矩陣
【Q】你可以模擬出「全做」比「全不做」還要好的狀況、並舉出一個會發生這種狀況的商務情境嗎?
【Ans】比起全部單一化政策,讓有病的人吃藥,沒病的人不吃藥是最好的;將藥的成本調降(控制在低於$0.3)或是研發出功效更強的藥。
TN=1069
TP=11
FN=187
FP=6
Total=1069+11+187+6
risk=-100
havepill=-60
cost=-0.3
TN*0 + TP*havepill + FN*risk + Total*cost
[1] -19742
library(manipulate)
p0 = par(mfrow=c(2,1),cex=0.8)
manipulate({
Y0 = -22000; Y1 = -12000
mx = matrix(c(true_neg, false_neg, false_pos, true_pos),2,2)
cx = seq(0.02, 0.64, 0.01)
rx = sapply(cx, function(p) sum(table(y, pred>p)*mx) )
i = which.max(rx)
plot(cx, rx, type='l',col='cyan',lwd=2,main=sprintf(
"Optomal Expected Result: $%d @ %.2f, T:%d",rx[i],cx[i],sum(pred>cx[i])),
ylim=c(Y0,Y1))
abline(v=cx[i],col='red')
abline(v=seq(0,1,0.1),h=seq(Y0,Y1,2000),col='lightgray',lty=3)
DPP(pred, y, 1, b=seq(0,1,0.02))
abline(v=cx[i],col='red')
},
true_neg = slider(-100,100,0,step=5),
false_neg = slider(-100,100,-100,step=5),
false_pos = slider(-100,100,-10,step=5),
true_pos = slider(-100,100,-60,step=5)
)
par(p0)
【Q】有五種成本分別為 $5, $10, $15, $20, $30 的藥,它們分別可以將風險成本從 $100 降低到 $70, $60, $50, $40, $25,哪一種藥的期望效益是最大的呢?
【Ans】$20的藥
#1 (-5,-70) Expected Result:-17415
#2 (10,-60) Expected Result:-17500
#3 (15,-50) Expected Result:-17450
#4 (20,-40) Expected Result:-17400
#5 (30,-25) Expected Result:-17655
Figure 4 - 資料、模型、預測、決策