線性迴歸適用於依變數是連續變數,自變數可以是連續或者類別變數。以最小平方法計算樣本的迴歸係數,推論母體的迴歸係數。 社會科學的研究對象經常是類別變數,例如有去投票跟沒去投票,民主與非民主國家,有接受外援與沒有接受等等。 本週上課將介紹二元變數迴歸的基本原理,描述自變數與二元變數之間的關係。例如信用卡是否違約與信用卡未償金額的關係。並且介紹如何用R
運算最大概似法函數。
\(\blacksquare\) 什麼樣的人會無法償還信用卡的債務?
library(ISLR); library(stargazer)
Default$default.n <- as.numeric(Default$default)
Default$default.n <- Default$default.n-1 #No=0, Yes=1
M1 <- lm(default.n ~ balance, data=Default)
Default$predicted<-predict(M1)
stargazer (Default, type='html')
Statistic | N | Mean | St. Dev. | Min | Max |
balance | 10,000 | 835.375 | 483.715 | 0.000 | 2,654.323 |
income | 10,000 | 33,516.980 | 13,336.640 | 771.968 | 73,554.230 |
default.n | 10,000 | 0.033 | 0.179 | 0 | 1 |
predicted | 10,000 | 0.033 | 0.063 | -0.075 | 0.270 |
\[\mathrm{log}1=0\] \[\mathrm{log}_{a}a=1\] \[\mathrm{log}(a)+\mathrm{log}(b)=\mathrm{log}(a*b)\] \[\mathrm{log}(a)-\mathrm{log}(b)=\mathrm{log}(a/b)\] \[\mathrm{log}(a) =\infty\hspace{.1cm} \rm{if}\hspace{.1cm} a\approx \infty\] \[\mathrm{log}0=-\mathrm{Inf}\] \[\mathrm{log}a^k=k*\mathrm{log}a\] \[\rm{exp}^{log(a)} = a\] \[\mathrm{log}(\rm{exp}(a))=a\]
我們實際計算看看:
## [1] 1
## [1] 3.912023
## [1] 3.912023
## [1] 0.6931472
## [1] 0.6931472
## [1] 6.907755
## [1] 6.907755
## [1] 10
## [1] 5
\[\rm{exp}^{0}=1\] \[\rm{exp}^{1}=2.718282\] \[\rm{exp}^{a-b}=\frac{\rm{exp}^{a}}{\rm{exp}^{b}} \] \[\rm{exp}^{a+b}=\rm{exp}^{a}\rm{exp}^{b} \]
指數運算的例子:
## [1] 1
## [1] 8103.084
## [1] 8103.084
## [1] 59874.14
## [1] 59874.14
## [1] 1
\(\blacksquare\) \(20\%\)的打擊率,表示打出安打跟無法上壘的勝算為1:4,因為\(\frac{0.2}{1-0.2}=\frac{1}{4}\),或者說無法上壘的機率是打安打的4倍。
\(\blacksquare\) \(90\%\)的合格率,表示合格是不合格機率的9倍,因為\(\frac{0.9}{1-0.9}=9\)。
curve(dnorm(x, 0.35, 0.1), from=-1.5, to=1, col='#3322EE', lwd=2)
curve(dnorm(x, -0.8, 0.2), from=-1.5, to=1, col='#FF22EE', lwd=2, add=T)
x<-c(-1, -0.85, -0.6,-0.50, -0.45, -0.30, -0.25, -0.20, -0.15, -0.09, 0.00, 0.10, 0.16, 0.30, 0.50)
y<-rep(0, 15)
points(x,y, pch=16, cex=2, col='gray30')
curve(dnorm(x, -0.22, 0.38), from=-1.5, to=1, col='#FF3333', lwd=2, xlab='y')
y<-c(-1, -0.85, -0.6,-0.50, -0.45, -0.30, -0.25, -0.20, -0.15, -0.09, 0.00, 0.10, 0.16, 0.30, 0.50)
y2<-rep(0, 15)
points(y, y2, pch=16, cex=2, col='gray30')
abline(v=-0.22, lty=2, lwd=1.5, add=T)
y<-20; sigma=2; mu<-seq(10, 30, length.out = 300)
L <- c()
n=length(mu)
for (i in 1: n){
L[i] <- 1/(sqrt(2*pi*sigma^2))*exp(-((y-mu[i])^2)/2*sigma^2)
}
plot(mu, L, type='l', lwd=2, col='#FF3322', xlab=expression(hat(mu)))
abline(h=max(L), lty=2, lwd=2, col='#998833')
\[ \begin{eqnarray} \rm{ln}[\mathcal{L}(\mu, \sigma|y_{1},\ldots,y_{n})]&=&\rm{ln}\mathcal{(}\frac{1}{\sqrt{2 \pi\sigma^2}}e^{-(y_{1}-\mu)^2/2\sigma^2}\times\ldots\times\frac{1}{\sqrt{2 \pi\sigma^2}}e^{-(y_{2}-\mu)^2/2\sigma^2}\mathcal{)}\\ & =&\rm{ln}\mathcal{(}\frac{1}{\sqrt{2 \pi\sigma^2}}e^{-(y_{1}-\mu)^2/2\sigma^2}\mathcal{)}+\ldots+\rm{ln}\mathcal{(}\frac{1}{\sqrt{2 \pi\sigma^2}}e^{-(y_{n}-\mu)^2/2\sigma^2}\mathcal{)} \label{eq:LL} \tag{1} \end{eqnarray} \]
其中
\[ \begin{eqnarray} \rm{ln}\mathcal{(}\frac{1}{\sqrt{2 \pi\sigma^2}}e^{-(y_{1}-\mu)^2/2\sigma^2}\mathcal{)}&=&\rm{ln}(\frac{1}{\sqrt{2 \pi\sigma^2}})+\rm{ln}(e^{-(y_{1}-\mu)^2/2\sigma^2}) \\ & = & \rm{ln}[(2\pi\sigma^2)^{-1/2}]-\frac{(y_{1}-\mu)^2}{2\sigma^2}\rm{ln}(e) \\ & =& -\frac{1}{2}\rm{ln}[(2\pi\sigma^2)-\frac{(y_{1}-\mu)^2}{2\sigma^2} \\ & = &-\frac{1}{2}\rm{ln}(2\pi)-\frac{1}{2}\rm{ln}(\sigma^2)-\frac{(y_{1}-\mu)^2}{2\sigma^2} \\ &=&-\frac{1}{2}\rm{ln}(2\pi)-\rm{ln}(\sigma)-\frac{(y_{1}-\mu)^2}{2\sigma^2} \label{eq:likelihood1} \tag{2} \end{eqnarray} \]\[ \begin{eqnarray} \rm{ln}[\mathcal{L}(\mu, \sigma|y_{1},\ldots,y_{n})]&=&-\frac{n}{2}\rm{ln}(2\pi)-\it{n}\rm{ln}(\sigma)-\frac{(y_{1}-\mu)^2}{2\sigma^2}-\ldots-\frac{(y_{n}-\mu)^2}{2\sigma^2} \label{eq:likelihood2} \tag{3} \end{eqnarray} \]
\[ \begin{eqnarray} 0&=& -\frac{n}{\sigma}+\frac{(y_{1}-\mu)^2}{\sigma^3}+\ldots+\frac{(y_{n}-\mu)^2}{\sigma^3} \\ &= & -n+\frac{(y_{1}-\mu)^2}{\sigma^2}+\ldots+\frac{(y_{n}-\mu)^2}{\sigma^2} \\ &= & -n+\frac{1}{\sigma^2}[(y_{1}-\mu)^2+\ldots+(y_{n}-\mu)^2] \\ n&=& \frac{1}{\sigma^2}[(y_{1}-\mu)^2+\ldots+(y_{n}-\mu)^2] \\ n\sigma^2& = & (y_{1}-\mu)^2+\ldots+(y_{n}-\mu)^2 \\ \sigma^2& = & \frac{\sum_{1}^{n} (y_{i}-\mu)^2}{n} \end{eqnarray} \label{eq:sigma_derivative2} \tag{7} \]
library(data.table); library(ggplot2)
set.seed(5000)
obs<-c(6, 5, 10, 3, 1, 8, 2, 9, 0, 2.5, 3.5, 4.2, 4.5, 4.9)
logL<-function(p) sum(log(dnorm(obs,p[1],p[2]))) # p=c(p[1],p[2])=c(mu,sigma)
library('plyr')
dLogL<-expand.grid(museq=seq(4,8, 0.1),sigmaseq=seq(2,4,0.1))
dLogL<-ddply(dLogL,.(museq, sigmaseq),
transform,logLike=logL(c(museq,sigmaseq)))
DT<-data.frame(dLogL)
is.na(DT) <- sapply(DT, is.infinite)
for (j in 1:ncol(DT)) set(DT, which(is.infinite(DT[[j]])), j, NA)
min_value<-min(DT$logLike)
max_value<-max(DT$logLike)
DT[which(DT$logLike==max_value),]
## museq sigmaseq logLike
## 114 4.5 2.8 -34.39078
dt <- DT %>% group_by(sigmaseq)
#mean(obs) =4.5
#fix mu=4.5
dt <- dt %>% filter(museq==4.5)
ggplot(data=dt, aes(x=sigmaseq, y=logLike))+
geom_line(col='#339999', size=1) +
geom_hline(aes(yintercept=max(logLike, na.rm=T)),
color="red", linetype="dashed", size=1) +
labs(x=expression(hat(sigma)),y='Log Likelihood') +
scale_x_continuous(labels=seq(2,4, by=0.1),
breaks=seq(2,4, by=0.1))
#Negative log-likelihood
negLogL<-function(p) -logL(p)
estPar<-optim(c(20,10),negLogL)
MLEparameters<-estPar$par
MLEparameters
## [1] 4.543086 2.821499
\[\begin{equation*} \mathcal{L}(y_{i}) =\begin{cases} \pi & \text{if}\hspace{.15cm} y_{i}=1 \\ 1-\pi & \text{if} \hspace{.15cm} y_{i}=0 \end{cases} \end{equation*}\]
\[ \begin{eqnarray} \text{max}\sum_{i=1}^{n}[\text{log}\mathcal{L}(y_{i}|\hat{\pi})]&=&\sum \text{log}[\hat{\pi}^{y_{i}}(1-\hat{\pi})^{1-y_{i}}] \nonumber \\ & = & \sum y_{i}\text{log}(\hat{\pi})+(1-y_{i})\text{log}(1-\hat{\pi}) \end{eqnarray} \]\[\hat{\pi}=\frac{\sum_{i=1}^n y_{i}}{n} \]
#http://www.johnmyleswhite.com/notebook/2010/04/21/doing-maximum-likelihood-estimation-by-hand-in-r/
set.seed(1000)
p.parameter <- 0.6
sequence <- rbinom(10, 1, p.parameter)
log.likelihood.sum <- function(sequence, p) {
log.likelihood <- sum(log(p)*(sequence==1)) + sum(log(1-p)*(sequence==0))
}
possible.p <-seq(0,1, length.out=100)
#jpeg('Likelihood_Concavity.jpg')
library('ggplot2')
g1<-qplot(possible.p,
sapply(possible.p, function (p) {log.likelihood.sum(sequence, p)}),
geom = 'line',
main = 'Likelihood as a Function of p',
xlab = 'p',
ylab = 'Likelihood')
g1 + scale_x_continuous(labels=seq(0,1, by=0.1),
breaks=seq(0,1, by=0.1))
#dev.off()
dt<-data.frame(p=possible.p,
value=sapply(possible.p, function (p) {log.likelihood.sum(sequence, p)}))
dt<-dt[-c(1,100),]
m.value=max(dt$value)
dt[which(dt$value==m.value),]
## p value
## 70 0.6969697 -6.108861
R
的optimize函數應用在二元變數的方式。首先寫下log函數。
def <- as.numeric(ISLR::Default$default)
y<-c()
y[def==2]<-1
y[def==1]<-0
N<-1
logL <- function(p) sum(log(dbinom(y, N, p)))
## [1] -1785.281
## $maximum
## [1] 0.03330411
##
## $objective
## [1] -1460.325
## [1] 0.0333
\[\mathcal{L}(X|\mu,\sigma^2)=\frac{-n}{2}\rm{ln}(2\pi\sigma^2)-\frac{1}{2\sigma^2}\sum (x_{i}-\mu)^2\]
# specify the single value normal probability function
norm_lik = function(x, mu, s){
y = 1/sqrt(2*pi*s^2)*exp((-1/(2*s^2))*(x-mu)^2)
}
# and plot it just to make sure
xval=seq(-3,3, length.out = 600)
yval=sapply(seq(-3,3, length.out=600),FUN=norm_lik,mu=0,s=1)
plot(xval, yval, type='l', ylab='',xlab='', main='Normal Distribution',
col='#FF22EE', lwd=1.5)
#Data
set.seed(270511)
xvec<-rnorm(600, 1.5, sqrt(pi))
fn <- function(theta) {
sum ( 0.5*(xvec - theta[1])^2/theta[2] + 0.5* log(theta[2]) )
}
library(stats4)
nlm(fn, theta <- c(0,1), hessian=TRUE)
## $minimum
## [1] 641.4347
##
## $estimate
## [1] 1.539712 3.120880
##
## $gradient
## [1] -2.215093e-07 5.828450e-07
##
## $hessian
## [,1] [,2]
## [1,] 192.253461808 -0.004693916
## [2,] -0.004693916 30.788876072
##
## $code
## [1] 1
##
## $iterations
## [1] 17
# Likelihood function
llik = function(x,par){
m=par[1]
s=par[2]
n=length(x)
# log of the normal likelihood
# -n/2 * log(2*pi*s^2) + (-1/(2*s^2)) * sum((x-m)^2)
ll = -(n/2)*(log(2*pi*s^2)) + (-1/(2*s^2)) * sum((x-m)^2) # return the negative to maximize rather than minimize
return(-ll) }
set.seed(270511)
xvec<-rnorm(600, 1.5, sqrt(pi))
# call optim with the starting values 'par', # the function (here 'llik'),
# and the observations 'x'
res0 = optim(par=c(.5,.5), llik, x=xvec)
#Result
print(kable( cbind('direct'=c('mean'=mean(xvec),
'sd'=sd(xvec)),
'optim'=res0$par),digits=3))
\(\blacksquare\) 什麼樣的人會無法償還信用卡的債務?我們用glm()函數估計模型,注意要設定family=binomial(logit)。
library(ISLR); library(stargazer)
M2 <- glm(default ~ balance, data=Default, family=binomial(logit))
stargazer (M2, type='html')
Dependent variable: | |
default | |
balance | 0.005*** |
(0.0002) | |
Constant | -10.651*** |
(0.361) | |
Observations | 10,000 |
Log Likelihood | -798.226 |
Akaike Inf. Crit. | 1,600.452 |
Note: ***:p<0.01 |
\[ \begin{eqnarray} \frac{p(X)}{1-p(X)} & = & exp^{-10.651+0.0055\cdot \mathtt{balance}}\\ & = & exp^{-10.651}\cdot exp^{0.0055\cdot \mathtt{balance}} \end{eqnarray} \]
我們畫圖表示\(\frac{p(X)}{1-p(X)}\)與自變數的關係:
a=-10.651; b=0.0055
y=exp(a)*exp(b*ISLR::Default$balance)
yi=sort(y)
xi=sort(ISLR::Default$balance)
dt <-data.frame(yi, xi)
library(ggplot2)
ggplot(dt, aes(x=xi, y=yi)) +
geom_point(col='#CC9933')
另外畫圖表示\(p(X)\)與\(\frac{e^\eta}{1+e^\eta}\)的關係:
a=-10.651; b=0.0055
xi=sort(ISLR::Default$balance)
ETA=a + b*xi
pi=exp(ETA)/(1+exp(ETA))
dt <-data.frame(x=xi, y=pi)
library(ggplot2)
ggplot(dt, aes(x=xi, y=pi)) +
geom_point(col='#FF9933')
可以看出\(p\)值隨著自變數的增加而上升,但是不會小於0或是大於1。中間有一段接近線性函數。當\(p\)接近0,\(\rm{log}\frac{p(X)}{1-p(X)}\approx -\infty\),當\(p\)接近1,\(\rm{log}\frac{p(X)}{1-p(X)}\approx \infty\)。
Dependent variable: | |
default | |
studentYes | 0.405*** |
(0.115) | |
Constant | -3.504*** |
(0.071) | |
Observations | 10,000 |
Log Likelihood | -1,454.342 |
Akaike Inf. Crit. | 2,912.683 |
Note: | p<0.05; p<0.01; p<0.001 |
我們也可以同時估計兩個自變數與依變數的關係:
M4 <- glm(default ~ balance + student, data=Default, family=binomial(logit))
stargazer (M4, type='html')
Dependent variable: | |
default | |
balance | 0.006*** |
(0.0002) | |
studentYes | -0.715*** |
(0.148) | |
Constant | -10.749*** |
(0.369) | |
Observations | 10,000 |
Log Likelihood | -785.841 |
Akaike Inf. Crit. | 1,577.682 |
Note: | p<0.1; p<0.05; p<0.01 |
library(ISLR); library(stargazer)
Default$default.n <- as.numeric(Default$default)
Default$default.n <- Default$default.n-1 #No=0, Yes=1
M1 <- lm(default.n ~ balance+student, data=Default)
logM <- glm(default.n ~ balance+student, data=Default,
family=binomial(logit))
stargazer(M1, logM, type="html")
Dependent variable: | ||
default.n | ||
OLS | logistic | |
(1) | (2) | |
balance | 0.0001*** | 0.006*** |
(0.00000) | (0.0002) | |
studentYes | -0.015*** | -0.715*** |
(0.004) | (0.148) | |
Constant | -0.073*** | -10.749*** |
(0.003) | (0.369) | |
Observations | 10,000 | 10,000 |
R2 | 0.124 | |
Adjusted R2 | 0.124 | |
Log Likelihood | -785.841 | |
Akaike Inf. Crit. | 1,577.682 | |
Residual Std. Error | 0.168 (df = 9997) | |
F Statistic | 707.060*** (df = 2; 9997) | |
Note: | p<0.1; p<0.05; p<0.01 |
上面的表格中的第一欄顯示,增加一個單位的信用卡未償金額,只會增加萬分之一單位的違約。但是第二欄顯示,增加一個單位的信用卡未償金額,違約對不違約的機率會增加1.005倍。
\(Z \sim \mathcal{N}(\mu = 0, \hspace{0.1cm}\sigma = 1)\)
例如我們想計算X<0.25的標準化CDF,而且\(\mu=1\)、\(\sigma^2=2\)。計算方式為: \(P(X < 0.25) = P(\frac{X - \mu}{\sigma} < \frac{0.25-\mu}{\sigma}) = P(Z < \frac{0.25 - 1}{2}) = \Phi\left(\frac{0.25 - 1}{2}\right) = 0.3538\)
## [1] 0.3538302
我們可以畫圖如下:
x<-seq(-5, 5, length.out=1000)
CDF <- c()
for (i in 1: 1000){
CDF[i] = pnorm(x[i], 1, 2)
}
dt <- data.frame(x, CDF)
library(ggplot2)
ggplot(data=dt, aes(x=x, y=CDF)) +
geom_line(col='#CC33FF', size=1.5)
set.seed(100)
n=3000; k=1000; p=runif(1,0.1,0.99); d=rnorm(1, 1, 0.01)
V=rbinom(n, k, p)
Vy<-c()
for (i in 1: n){
if(V[i]>=k*p*d)
Vy[i]=1
else(Vy[i]=0)
}
table(Vy)
## Vy
## 0 1
## 1289 1711
##
## Call:
## glm(formula = Vy ~ X, family = binomial(probit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.389 -1.291 1.024 1.063 1.161
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21068 0.02858 7.372 1.68e-13 ***
## X -0.09420 0.04757 -1.980 0.0476 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4099.3 on 2999 degrees of freedom
## Residual deviance: 4095.4 on 2998 degrees of freedom
## AIC: 4099.4
##
## Number of Fisher Scoring iterations: 3
接著代入係數,求出\(Z(X\beta)\),也就是\(Z\)分數。
set.seed(100)
n=3000; k=1000; p=runif(1,0.1,0.99); d=rnorm(1, 1, 0.01)
newCDF <- c()
xval=sort(X)
for (i in 1: n){
newCDF[i] = pnorm(0.210-0.094*X[i], p*d, sqrt(p*(1-p)))
}
Z <-sort(newCDF)
dt<-data.frame(Vy, X, xval, Z)
library(ggplot2)
G=ggplot(data=dt, aes(x=xval, y=Vy)) +
geom_point(size=1.5)
G+geom_line(data=dt, aes(x=xval, y=Z),
col='#33EEFF', size=1.2)
\[\rm{Pr}(y=1|X)=\Phi(X\beta)\]
可用R
計算:
## [1] 0.02275013
## [1] 0.1586553
## [1] 0.9772499
計算全部X\(\beta\)從-3到3在常態分佈時\(N(0,1)\)的累積機率密度:
j<-c()
k<-seq(-3, 3, 0.1)
n <- length(k)
for (i in 1:n)
{
j[i] <- pnorm (k[i], 0, 1)
}
plot(k, j, main="CDF", xlab=expression(paste('X',beta)), ylab="Probability",
type="l", lwd=1.5)
Probit的累積機率(cdf)可畫成如下:
Admit<-read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
Admit$rank.f<- factor(Admit$rank)
stargazer(Admit, type = 'html')
Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
admit | 400 | 0.318 | 0.466 | 0 | 0 | 1 | 1 |
gre | 400 | 587.700 | 115.517 | 220 | 520 | 660 | 800 |
gpa | 400 | 3.390 | 0.381 | 2.260 | 3.130 | 3.670 | 4.000 |
rank | 400 | 2.485 | 0.944 | 1 | 2 | 3 | 4 |
接下來用glm函數估計自變數的係數:
fit1 <- glm(admit ~ gre + gpa + rank.f,
family = binomial(link = "probit"), data = Admit)
stargazer(fit1, type='html')
Dependent variable: | |
admit | |
gre | 0.001** |
(0.001) | |
gpa | 0.478** |
(0.197) | |
rank.f2 | -0.415** |
(0.195) | |
rank.f3 | -0.812*** |
(0.208) | |
rank.f4 | -0.936*** |
(0.245) | |
Constant | -2.387*** |
(0.674) | |
Observations | 400 |
Log Likelihood | -229.207 |
Akaike Inf. Crit. | 470.413 |
Note: | p<0.1; p<0.05; p<0.01 |
\[F(-2.387+0.001*500)=0.0295\] 用R
計算:
## [1] 0.02958016
\[F(-2.387+0.001*600)=0.0369\]
## [1] 0.03696874
\[F(-2.387+0.001*600)=0.0458\]
## [1] 0.04580168
以上計算方式可以代入GPA的平均值,或者其他的GRE分數,得到不同的累積機率密度。
newdata <- data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4 * 4), gpa = rep(c(2.5, 3, 3.5, 4), each = 100 * 4), rank.f= factor(rep(rep(1:4,
each = 100), 4)))
newdata[, c("p", "se")] <- predict(fit1, newdata,
type = "response", se.fit = TRUE)[-3]
library(ggplot2)
ggplot(newdata, aes(x = gre, y = p, colour = rank.f)) + geom_line() +
facet_wrap(~gpa)
\[ \pi_{i}=\frac{exp(X\beta)}{1+exp(X\beta)} \]
\[\hat{\beta}\sim \mathcal{N(\beta, \phi(X^{T}WX)^{-1})} \]
R
一樣的估計
\[\lambda=\frac{\mathcal{L}(\theta)}{\mathcal{L}(\hat{\theta})} \]
\[-2\hspace{.1em}\text{log}\hspace{.1em}\lambda\hspace{.5em}\xrightarrow{d}\hspace{.5em}\chi^2\]
\(\blacksquare\)哪些因素影響信用卡違約?
library(ISLR); library(stargazer)
M5 <-glm(default ~ balance+income+student, data=Default, family=binomial(logit))
stargazer(M5, type='html')
Dependent variable: | |
default | |
balance | 0.006*** |
(0.0002) | |
income | 0.00000 |
(0.00001) | |
studentYes | -0.647*** |
(0.236) | |
Constant | -10.869*** |
(0.492) | |
Observations | 10,000 |
Log Likelihood | -785.772 |
Akaike Inf. Crit. | 1,579.545 |
Note: | p<0.1; p<0.05; p<0.01 |
以下指令可建立\(\text{exp}^{\beta}\):
Dependent variable: | |
default | |
balance | 1.006*** |
(0.0002) | |
income | 1.000*** |
(0.00001) | |
studentYes | 0.524** |
(0.236) | |
Constant | 0.00002 |
(0.492) | |
Observations | 10,000 |
Log Likelihood | -785.772 |
Akaike Inf. Crit. | 1,579.545 |
Note: | p<0.1; p<0.05; p<0.01 |
allcoef <-data.frame(balance=rep(mean(Default$balance),2),
income=rep(mean(Default$income),2),
student=as.factor(c("Yes", "No")))
allcoef
## balance income student
## 1 835.3749 33516.98 Yes
## 2 835.3749 33516.98 No
代入模型M5: \[\rm{Pr}(Y=1)=(e^{\eta})/(1+e^\eta)\] 其中 \[\eta=-10.869+0.006*\rm{balance}-0.647*\rm{student}\]
allcoef$pred <-predict(M5, newdata=allcoef, type="response")
allcoef<-cbind(allcoef, allcoef$pred)
allcoef
## balance income student pred allcoef$pred
## 1 835.3749 33516.98 Yes 0.001328976 0.001328976
## 2 835.3749 33516.98 No 0.002534451 0.002534451
library(ISLR); library(stargazer)
M5 <-glm(default ~ balance+income+student, data=Default, family=binomial(logit))
M6 <-glm(default ~ balance+income+student, data=Default, family=binomial(probit))
stargazer(M5, M6, type='html')
Dependent variable: | ||
default | ||
logistic | probit | |
(1) | (2) | |
balance | 0.006*** | 0.003*** |
(0.0002) | (0.0001) | |
income | 0.00000 | 0.00000 |
(0.00001) | (0.00000) | |
studentYes | -0.647*** | -0.296** |
(0.236) | (0.119) | |
Constant | -10.869*** | -5.475*** |
(0.492) | (0.238) | |
Observations | 10,000 | 10,000 |
Log Likelihood | -785.772 | -791.609 |
Akaike Inf. Crit. | 1,579.545 | 1,591.217 |
Note: | p<0.1; p<0.05; p<0.01 |
library(kableExtra)
DT <-data.frame(admission=c(rep(1,10),rep(0,10)),
gender=c(rep(1,7), rep(0,3),rep(1,3),rep(0,7)))
DT %>%
kable('html') %>%
kable_styling()
admission | gender |
---|---|
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 0 |
1 | 0 |
1 | 0 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
library(kableExtra)
DT <-data.frame(Surv=c(rep(1,11),rep(0,22)),
Ag=c(rep(1,9), rep(0,2),rep(1,8),rep(0,14)))
DT %>%
kable('html') %>%
kable_styling()
Surv | Ag |
---|---|
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 0 |
1 | 0 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 1 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
0 | 0 |
## 最後更新日期 06/28/2019