本週上課將介紹二元迴歸的基本原理,描述自變數與二元變數之間的關係。例如:
\(\blacksquare\) 什麼樣的人會無法償還信用卡的債務?
library(ISLR); library(stargazer)
Default$default.n <- as.numeric(Default$default)
Default$default.n <- Default$default.n-1 #No=0, Yes=1
M1 <- lm(default.n ~ balance, data=Default)
Default$predicted<-predict(M1)
stargazer (Default, type='html')
Statistic | N | Mean | St. Dev. | Min | Max |
balance | 10,000 | 835.375 | 483.715 | 0.000 | 2,654.323 |
income | 10,000 | 33,516.980 | 13,336.640 | 771.968 | 73,554.230 |
default.n | 10,000 | 0.033 | 0.179 | 0 | 1 |
predicted | 10,000 | 0.033 | 0.063 | -0.075 | 0.270 |
\[exp^{log(a)} = a\]
\[\mathrm{log}(exp^{a})=a \] \[exp^{a-b}=\frac{exp^{a}}{exp^{b}} \] \[exp^{a+b}=exp^{a}exp^{b} \] 我們實際計算看看:
a=100
exp(log(a))
## [1] 100
log(exp(a))
## [1] 100
進一步計算指數:
a=1
exp(a)
## [1] 2.718282
exp(-a)
## [1] 0.3678794
\(\blacksquare\) \(20\%\)的打擊率,表示打出安打跟無法上壘的勝算為1:4,因為\(\frac{0.2}{1-0.2}=\frac{1}{4}\),或者說無法上壘的機率是打安打的4倍。
\(\blacksquare\) \(90\%\)的合格率,表示合格是不合格機率的9倍,因為\(\frac{0.9}{1-0.9}=9\)。
\(\blacksquare\) 什麼樣的人會無法償還信用卡的債務?
library(ISLR); library(stargazer)
M2 <- glm(default ~ balance, data=Default, family=binomial(logit))
stargazer (M2, type='html')
Dependent variable: | |
default | |
balance | 0.005*** |
(0.0002) | |
Constant | -10.651*** |
(0.361) | |
Observations | 10,000 |
Log Likelihood | -798.226 |
Akaike Inf. Crit. | 1,600.452 |
Note: ***:p<0.01 |
M3 <- glm(default ~ student, data=Default, family=binomial(logit))
stargazer (M3, type='html')
Dependent variable: | |
default | |
studentYes | 0.405*** |
(0.115) | |
Constant | -3.504*** |
(0.071) | |
Observations | 10,000 |
Log Likelihood | -1,454.342 |
Akaike Inf. Crit. | 2,912.683 |
Note: | p<0.05; p<0.01; p<0.001 |
library(ISLR); library(stargazer)
Default$default.n <- as.numeric(Default$default)
Default$default.n <- Default$default.n-1 #No=0, Yes=1
M1 <- lm(default.n ~ balance, data=Default)
logM <- glm(default.n ~ balance, data=Default,
family=binomial(logit))
stargazer(M1, logM, type="html")
Dependent variable: | ||
default.n | ||
OLS | logistic | |
(1) | (2) | |
balance | 0.0001*** | 0.005*** |
(0.00000) | (0.0002) | |
Constant | -0.075*** | -10.651*** |
(0.003) | (0.361) | |
Observations | 10,000 | 10,000 |
R2 | 0.123 | |
Adjusted R2 | 0.122 | |
Log Likelihood | -798.226 | |
Akaike Inf. Crit. | 1,600.452 | |
Residual Std. Error | 0.168 (df = 9998) | |
F Statistic | 1,396.816*** (df = 1; 9998) | |
Note: | p<0.1; p<0.05; p<0.01 |
可用R
計算:
pnorm(-2, 0, 1)
## [1] 0.02275013
pnorm(-1, 0, 1)
## [1] 0.1586553
pnorm(2, 0, 1)
## [1] 0.9772499
計算全部X\(\beta\)從-3到3在常態分佈時\(N(0,1)\)的累積機率密度:
j<-c()
k<-seq(-3, 3, 0.1)
n <- length(k)
for (i in 1:n)
{
j[i] <- pnorm (k[i], 0, 1)
}
plot(k, j, main="CDF", xlab=expression(paste('X',beta)), ylab="Probability",
type="l", lwd=1.5)
Probit的累積機率(cdf)可畫成如下:
參考UCLA的介紹:
Admit<-read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
Admit$rank <- factor(Admit$rank)
fit1 <- glm(admit ~ gre + gpa + rank, family = binomial(link = "probit"),
data = Admit)
stargazer(fit1, type='html')
Dependent variable: | |
admit | |
gre | 0.001** |
(0.001) | |
gpa | 0.478** |
(0.197) | |
rank2 | -0.415** |
(0.195) | |
rank3 | -0.812*** |
(0.208) | |
rank4 | -0.936*** |
(0.245) | |
Constant | -2.387*** |
(0.674) | |
Observations | 400 |
Log Likelihood | -229.207 |
Akaike Inf. Crit. | 470.413 |
Note: | p<0.1; p<0.05; p<0.01 |
newdata <- data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4 * 4), gpa = rep(c(2.5, 3, 3.5, 4), each = 100 * 4), rank = factor(rep(rep(1:4,
each = 100), 4)))
newdata[, c("p", "se")] <- predict(fit1, newdata,
type = "response", se.fit = TRUE)[-3]
library(ggplot2)
ggplot(newdata, aes(x = gre, y = p, colour = rank)) + geom_line() + facet_wrap(~gpa)
\[\pi=\frac{\sum_{i=1}^n y_{i}}{n} \]
\[\hat{\beta}\sim \mathcal{N(\beta, \phi(X^{T}WX)^{-1})} \]
R
一樣的估計
\[\lambda=\frac{\mathcal{L}(\theta)}{\mathcal{L}(\hat{\theta})} \]
\[-2\hspace{.1em}\text{log}\hspace{.1em}\lambda\hspace{.5em}\xrightarrow{d}\hspace{.5em}\chi^2\]
\(\blacksquare\)哪些因素影響信用卡違約?
library(ISLR); library(stargazer)
M5 <-glm(default ~ balance+income+student, data=Default, family=binomial(logit))
stargazer(M5, type='html')
Dependent variable: | |
default | |
balance | 0.006*** |
(0.0002) | |
income | 0.00000 |
(0.00001) | |
studentYes | -0.647*** |
(0.236) | |
Constant | -10.869*** |
(0.492) | |
Observations | 10,000 |
Log Likelihood | -785.772 |
Akaike Inf. Crit. | 1,579.545 |
Note: | p<0.1; p<0.05; p<0.01 |
以下指令可建立\(\text{exp}^{\beta}\):
M5.or <- exp(coef(M5))
stargazer(M5, coef=list(M5.or), type='html')
Dependent variable: | |
default | |
balance | 1.006*** |
(0.0002) | |
income | 1.000*** |
(0.00001) | |
studentYes | 0.524** |
(0.236) | |
Constant | 0.00002 |
(0.492) | |
Observations | 10,000 |
Log Likelihood | -785.772 |
Akaike Inf. Crit. | 1,579.545 |
Note: | p<0.1; p<0.05; p<0.01 |
先建立一個資料框,包含三個自變數的平均值、最大值或者眾數。
allcoef <-data.frame(balance=rep(mean(Default$balance),2),
income=rep(mean(Default$income),2),
student=as.factor(c("Yes", "No")))
allcoef
## balance income student
## 1 835.3749 33516.98 Yes
## 2 835.3749 33516.98 No
代入模型M5
allcoef$pred <-predict(M5, newdata=allcoef, type="response")
allcoef<-cbind(allcoef, allcoef$pred)
allcoef
## balance income student pred allcoef$pred
## 1 835.3749 33516.98 Yes 0.001328976 0.001328976
## 2 835.3749 33516.98 No 0.002534451 0.002534451
library(ISLR); library(stargazer)
M5 <-glm(default ~ balance+income+student, data=Default, family=binomial(logit))
M6 <-glm(default ~ balance+income+student, data=Default, family=binomial(probit))
stargazer(M5, M6, type='html')
Dependent variable: | ||
default | ||
logistic | probit | |
(1) | (2) | |
balance | 0.006*** | 0.003*** |
(0.0002) | (0.0001) | |
income | 0.00000 | 0.00000 |
(0.00001) | (0.00000) | |
studentYes | -0.647*** | -0.296** |
(0.236) | (0.119) | |
Constant | -10.869*** | -5.475*** |
(0.492) | (0.238) | |
Observations | 10,000 | 10,000 |
Log Likelihood | -785.772 | -791.609 |
Akaike Inf. Crit. | 1,579.545 | 1,591.217 |
Note: | p<0.1; p<0.05; p<0.01 |
library(kableExtra)
DT <-data.frame(admission=c(rep(1,10),rep(0,10)),
gender=c(rep(1,7), rep(0,3),rep(1,3),rep(0,7)))
DT %>%
kable('html') %>%
kable_styling()
admission | gender |
---|---|
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 0 |
1 | 0 |
1 | 0 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
library(kableExtra)
DT <-data.frame(Surv=c(rep(1,11),rep(0,22)),
Ag=c(rep(1,9), rep(0,2),rep(1,8),rep(0,14)))
DT %>%
kable('html') %>%
kable_styling()
Surv | Ag |
---|---|
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 0 |
1 | 0 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |