## simulate data
n = 500
beta = c(1, .5)
x = rep(0, n)   # placeholder vectors of length n, filled in the loop below
y = rep(0, n)
py1 = rep(0, n) # true P(Y = 1 | x)
set.seed(34)
for (i in 1:n) {
x[i] = rnorm(1)
py1[i] = 1/(1 + exp(-(beta[1] + beta[2] * x[i])))
y[i] = rbinom(1, 1, py1[i])
}
ddf = data.frame(x, y = as.factor(y), py1)
(head(ddf))
## x y py1
## 1 -0.1388900 0 0.7171879
## 2 -0.7113750 0 0.6557277
## 3 -0.5752482 1 0.6709259
## 4 0.1882496 1 0.7491577
## 5 0.6706200 1 0.7917176
## 6 0.8844074 1 0.8087957
summary(ddf)
## x y py1
## Min. :-3.32039 0:136 Min. :0.3407
## 1st Qu.:-0.66455 1:364 1st Qu.:0.6610
## Median :-0.06432 Median :0.7247
## Mean : 0.01974 Mean :0.7222
## 3rd Qu.: 0.73023 3rd Qu.:0.7966
## Max. : 2.94274 Max. :0.9221
rbinom(n, size, prob) generates n random draws from a binomial distribution with size trials and success probability prob. With size = 1, each draw is a single Bernoulli trial, which is how y is simulated above.
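For instance (illustrative values only, not part of the simulation above):
# ten Bernoulli draws (size = 1) with success probability 0.7
rbinom(10, size = 1, prob = 0.7)
# one draw counting successes out of 20 trials with probability 0.5
rbinom(1, size = 20, prob = 0.5)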
## plot
# x vs p(y=1|x)
plot(x, py1)
## marginal of y
(table(ddf$y)/n)
##
## 0 1
## 0.272 0.728
## x|y
boxplot(x ~ y, ddf)
From the marginal table (and the boxplot of x by y), y = 1 is more likely than y = 0 (about 73% vs 27%), which follows from the positive intercept: P(Y = 1 | x = 0) = 1/(1 + exp(-1)) ≈ 0.73.
## fit logit in R
lfit = glm(y ~ x, ddf, family=binomial())
summary(lfit)
##
## Call:
## glm(formula = y ~ x, family = binomial(), data = ddf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2264 -1.1725 0.6333 0.8171 1.5922
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0578 0.1075 9.838 < 2e-16 ***
## x 0.6274 0.1120 5.600 2.15e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 585.24 on 499 degrees of freedom
## Residual deviance: 549.92 on 498 degrees of freedom
## AIC: 553.92
##
## Number of Fisher Scoring iterations: 4
## plot fit
phat = predict(lfit, type="response")
plot(x, py1, col="blue", xlab="x", ylab="P(Y=1|x)")
points(x, phat, col="red", lwd=2)
glm(formula, family, data, ...) fits generalized linear models, specified by a symbolic description of the linear predictor and a description of the error distribution. The family argument gives the error distribution and the link function to be used in the model; binomial() with its default logit link gives logistic regression.
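For example (a minimal sketch; the logit link is the binomial default, so this refit reproduces lfit):
# spell out the link explicitly; coefficients should match lfit above
lfit2 = glm(y ~ x, data = ddf, family = binomial(link = "logit"))
coef(lfit2)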
Estimated coefficients: intercept 1.0578 and slope 0.6274, against the true values beta = (1, 0.5).
Blue: the true P(Y = 1 | x) used to simulate the data. Red: the fitted probabilities phat from the logit model.
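A legend makes the overlay easier to read (a small optional addition, assuming the plot above was just drawn):
# label the blue (true) and red (fitted) points
legend("bottomright", legend = c("true P(Y=1|x)", "fitted phat"),
col = c("blue", "red"), pch = 1)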
## - log likelihood
mLL1 = function(x,y,beta){
n = length(y)
mll = 0.0
for (i in 1:n) {
py1 = 1/(1+ exp(-(beta[1] + beta[2] * x[i])))
if (y[i] ==1) {
mll = mll - log(py1)
} else {
mll = mll - log(1 - py1)
}
}
return(mll)
}
Let’s compute the negative log-likelihood (and minimize it later). Typically you don’t optimize the likelihood itself; you optimize the log-likelihood. The likelihood is a product of per-observation probabilities, which quickly becomes numerically tiny, whereas the log-likelihood turns that product into a sum, which is more computationally stable.
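A quick numeric check (a sketch reusing y, py1, mLL1, and lfit from above; logLik comes from base R's stats package):
# per-observation likelihood contributions under the true parameters
li = ifelse(y == 1, py1, 1 - py1)
prod(li)     # product of 500 small terms: vanishingly small, underflows for larger n
sum(log(li)) # the log-likelihood is a well-scaled sum
# sanity check: mLL1 at the glm estimates should equal -logLik(lfit)
mLL1(x, y, lfit$coef)
-as.numeric(logLik(lfit))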
## get -LL on beta1 grid
nval = 1000
p = 2 # number of parameters
bMat = matrix(0.0, nval, p) # 1000*2 matrix full of 0s
bMat[,1] = lfit$coef[1] # hold the intercept fixed at its estimate
bMat[,2] = seq(from=0, to=1, length.out=nval) # equally spaced candidate slopes from 0 to 1
llv = rep(0, nval) # vector of length nval to hold the -LL values
# compute the sum of -LLs
for (i in 1:nval){
llv[i] = mLL1(x, y, bMat[i,])
}
plot(bMat[,2], llv)
abline(v=beta[2], col="blue")
ii = which.min(llv)
abline(v=bMat[ii,2], col="red") # min(-LL)
## row of bMat at min:
bMat[ii, ]
## [1] 1.0577788 0.6276276
## check
lfit$coef
## (Intercept) x
## 1.0577788 0.6274077
bMat is a matrix: the first column holds the intercept (fixed at the glm estimate) and the second column is a sequence of candidate slopes from 0 to 1. We compute the negative log-likelihood for each row of the matrix.
In the plot, blue marks the true slope (the intercept is held constant while the slope varies) and red marks the grid value that minimizes the negative log-likelihood.
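The row-by-row loop can also be written with apply (an equivalent sketch, reusing bMat, mLL1, x, and y from above):
# pass each row of bMat to mLL1; should reproduce llv
llv_alt = apply(bMat, 1, function(b) mLL1(x, y, b))
all.equal(llv, llv_alt)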
## get -LL on bivariate grid
nval = 100 # grid points per dimension (100 x 100 = 10,000 combinations)
b1g = seq(from=0, to=2, length.out=nval)
b2g = seq(from=0, to=1, length.out=nval)
bg = expand.grid(b1g, b2g)
nn = nrow(bg)
llv1 = rep(0, nn)
tm1 = system.time({
for (i in 1:nn){
if ((i%%100)==0) cat("i: ", i, "\n")
llv1[i] = mLL1(x, y, bg[i,])
}
})
# takes too long: indexing data.frame rows inside the loop is slow
expand.grid() creates a data frame from all combinations of the supplied vectors or factors.
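A small illustration of expand.grid (inputs chosen only for the example):
# every combination of the two vectors, as a 4-row data frame
expand.grid(a = 1:2, b = c("low", "high"))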
bgM = as.matrix(bg)
llv2 = rep(0, nn)
tm2 = system.time({
for (i in 1:nn) {
if ((i%%1000)==0) cat("i: ", i, "\n")
llv2[i] = mLL1(x, y, bgM[i, ])
}
})
bgMM = cbind(bgM[, 1], bgM[, 2]) # same values, rebuilt as a plain unnamed matrix
llv3 = rep(0, nn)
tm3 = system.time({
for (i in 1:nn){
if ((i%%1000)==0) cat("i: ", i, "\n")
llv3[i] = mLL1(x, y, bgMM[i, ])
}
})
mLL = function(x, y, beta){
  # vectorized negative log-likelihood (no loop over observations)
  py1 = 1/(1 + exp(-(beta[1] + beta[2]*x)))
  return(-sum(ifelse(y==1, log(py1), log(1-py1))))
}
llv4 = rep(0, nn)
tm4 = system.time({
for(i in 1:nn){
llv4[i] = mLL(x, y, bgMM[i, ])
}
})
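To compare the four runs side by side (a sketch; assumes tm1 through tm4 and the llv vectors above are still in memory):
# elapsed seconds: data.frame rows (tm1), matrix rows (tm2, tm3), vectorized mLL (tm4)
rbind(tm1, tm2, tm3, tm4)[, "elapsed"]
# the vectorized mLL should reproduce the looped mLL1 values
all.equal(llv3, llv4)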