Logistic Regression

Exercise 4.5

shuttle <- read.table("https://users.stat.ufl.edu/~aa/cat/data/Shuttle.dat", header=TRUE)
shuttle

##    Ft Temp TD
## 1   1   66  0
## 2   2   70  1
## 3   3   69  0
## 4   4   68  0
## 5   5   67  0
## 6   6   72  0
## 7   7   73  0
## 8   8   70  0
## 9   9   57  1
## 10 10   63  1
## 11 11   70  1
## 12 12   78  0
## 13 13   67  0
## 14 14   53  1
## 15 15   67  0
## 16 16   75  0
## 17 17   70  0
## 18 18   81  0
## 19 19   76  0
## 20 20   79  0
## 21 21   75  1
## 22 22   76  0
## 23 23   58  1

fit <- glm(TD ~ Temp, family=binomial, data=shuttle)
fit$coeff

## (Intercept)        Temp 
##  15.0429016  -0.2321627

predict(fit,data.frame(Temp=31),type="link")

##        1 
## 7.845857

predict(fit,data.frame(Temp=31),type="response")

##         1 
## 0.9996088

linear_fit <- lm(TD ~ Temp, data=shuttle)
linear_fit$coeff

## (Intercept)        Temp 
##  2.90476190 -0.03738095

head(cbind(predict(linear_fit, data.frame(Temp=shuttle$Temp)), predict(linear_fit, data.frame(Temp=shuttle$Temp), type="response")))

##        [,1]      [,2]
## 1 0.4376190 0.4376190
## 2 0.2880952 0.2880952
## 3 0.3254762 0.3254762
## 4 0.3628571 0.3628571
## 5 0.4002381 0.4002381
## 6 0.2133333 0.2133333

library(rootSolve)
linear_fit <- lm(TD ~ Temp, data=shuttle)
intercept <- linear_fit$coeff[1]
temp <- linear_fit$coeff[2]

f1 = function(x) {
  intercept + temp * x
}
uniroot.all(f1, c(0, 100))

## [1] 77.70701

solve(temp, -intercept)

## [1] 77.70701

middle_efficient <- -1 * intercept / temp
middle_efficient

## (Intercept) 
##    77.70701

middle_efficient <- -1 * intercept / temp
sample_mean <- mean(shuttle$Temp)
sample_mean_prob <- predict(fit,data.frame(Temp=sample_mean),type="response")
prob_change_ratio <- temp * sample_mean_prob * (1-sample_mean_prob)
sample_mean

## [1] 69.56522

sample_mean_prob

##         1 
## 0.2483279

prob_change_ratio

##         Temp 
## -0.006977572

middle_efficient <- -1 * intercept / temp
sample_min <- min(shuttle$Temp)
sample_min_prob <- predict(fit,data.frame(Temp=sample_min),type="response")
sample_min

## [1] 53

sample_min_prob

##         1 
## 0.9392478

summary(fit)

## 
## Call:
## glm(formula = TD ~ Temp, family = binomial, data = shuttle)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0611  -0.7613  -0.3783   0.4524   2.2175  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  15.0429     7.3786   2.039   0.0415 *
## Temp         -0.2322     0.1082  -2.145   0.0320 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28.267  on 22  degrees of freedom
## Residual deviance: 20.315  on 21  degrees of freedom
## AIC: 24.315
## 
## Number of Fisher Scoring iterations: 5

null_deviance <- 28.267
residual_deviance  <- 0.1082
likelihood_z <- null_deviance - residual_deviance
likelihood_z

## [1] 28.1588

beta <- -0.2322
std_error <- 0.1082
wald_z <- beta / std_error
wald_z

## [1] -2.146026

confint(fit, level=0.95)

## Waiting for profiling to be done...

##                  2.5 %      97.5 %
## (Intercept)  3.3305848 34.34215133
## Temp        -0.5154718 -0.06082076

confint.default(fit)

##                  2.5 %      97.5 %
## (Intercept)  0.5810523 29.50475096
## Temp        -0.4443022 -0.02002324

Exercise 4.12

mbti <- read.table("http://users.stat.ufl.edu/~aa/intro-cda/data/MBTI.dat", header=TRUE)
mbti$yes <- mbti$drink
mbti$no <- mbti$n - mbti$drink
mbti

##    EI SN TF JP smoke drink   n yes  no
## 1   e  s  t  j    13    10  77  10  67
## 2   e  s  t  p    11     8  42   8  34
## 3   e  s  f  j    16     5 106   5 101
## 4   e  s  f  p    19     7  79   7  72
## 5   e  n  t  j     6     3  23   3  20
## 6   e  n  t  p     4     2  18   2  16
## 7   e  n  f  j     6     4  31   4  27
## 8   e  n  f  p    23    15  80  15  65
## 9   i  s  t  j    32    17 140  17 123
## 10  i  s  t  p     9     3  52   3  49
## 11  i  s  f  j    34     6 138   6 132
## 12  i  s  f  p    29     4 106   4 102
## 13  i  n  t  j     4     1  13   1  12
## 14  i  n  t  p     9     5  35   5  30
## 15  i  n  f  j     4     1  31   1  30
## 16  i  n  f  p    22     6  79   6  73

fit <- glm(yes/(yes+no) ~ EI + SN + TF + JP, 
           weights = yes + no,
           family = binomial(link = "logit"), 
           data = mbti)
fit$coefficients

## (Intercept)         EIi         SNs         TFt         JPp 
##  -2.1140470  -0.5550115  -0.4291508   0.6873349   0.2022295

summary(fit)

## 
## Call:
## glm(formula = yes/(yes + no) ~ EI + SN + TF + JP, family = binomial(link = "logit"), 
##     data = mbti, weights = yes + no)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2712  -0.8062  -0.1063   0.1124   1.5807  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1140     0.2715  -7.788 6.82e-15 ***
## EIi          -0.5550     0.2170  -2.558  0.01053 *  
## SNs          -0.4292     0.2340  -1.834  0.06664 .  
## TFt           0.6873     0.2206   3.116  0.00184 ** 
## JPp           0.2022     0.2266   0.893  0.37209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 30.488  on 15  degrees of freedom
## Residual deviance: 11.149  on 11  degrees of freedom
## AIC: 73.99
## 
## Number of Fisher Scoring iterations: 4

pred.prob <- fitted(fit)
lp <- predict(fit, se.fit = TRUE)
LB <- lp$fit - 1.96*lp$se.fit
UB <- lp$fit + 1.96*lp$se.fit
LB.p <- exp(LB)/(1+exp(LB))
UB.p <- exp(UB)/(1+exp(UB))
out <- data.frame(mbti$EI, mbti$SN, mbti$TF, mbti$JP, 
                  pred.prob,
                  LB.p,
                  UB.p)

out

##    mbti.EI mbti.SN mbti.TF mbti.JP  pred.prob       LB.p       UB.p
## 1        e       s       t       j 0.13518600 0.09311660 0.19223318
## 2        e       s       t       p 0.16061850 0.10468282 0.23848133
## 3        e       s       f       j 0.07288479 0.04757773 0.11009687
## 4        e       s       f       p 0.08778634 0.05694528 0.13297541
## 5        e       n       t       j 0.19361150 0.12222927 0.29277650
## 6        e       n       t       p 0.22714857 0.15042946 0.32789340
## 7        e       n       f       j 0.10773900 0.06622939 0.17051419
## 8        e       n       f       p 0.12877681 0.08736742 0.18581683
## 9        i       s       t       j 0.08234722 0.05576477 0.11999096
## 10       i       s       t       p 0.09897686 0.06339960 0.15129366
## 11       i       s       f       j 0.04318118 0.02722891 0.06782757
## 12       i       s       f       p 0.05235266 0.03296411 0.08217593
## 13       i       n       t       j 0.12113523 0.07276292 0.19490558
## 14       i       n       t       p 0.14436562 0.09114324 0.22110598
## 15       i       n       f       j 0.06482402 0.03763115 0.10943247
## 16       i       n       f       p 0.07821656 0.05006819 0.12018772

Reference

[1] Alan Agresti, 『범주형 자료분석 개론』, 박태성, 이승연 옮긴이, 자유아카데미, 2020
[2] Alan Agresti, 『Categorical Data Analysis (3rd Edition)』, Wiley, 2018
[3] https://users.stat.ufl.edu/~aa/cat/data/Shuttle.dat
[4] https://users.stat.ufl.edu/~aa/cat/data/MBTI.dat