Logistic Regression

CHAPTER 8 OUTPUT

This shows the outputs from Chapter 8 using R. The textbook is “A Modern Approach to Regression with R” by Simon J. Sheather (2008). The R code provided with the book has been updated.

MichelinFood <- read_excel("MARData.xlsx", sheet  =  "MichelinFood")

attach(MichelinFood)
head(MichelinFood)

## # A tibble: 6 x 5
##    Food InMichelin NotInMichelin    mi proportion
##   <dbl>      <dbl>         <dbl> <dbl>      <dbl>
## 1    15          0             1     1       0   
## 2    16          0             1     1       0   
## 3    17          0             8     8       0   
## 4    18          2            13    15       0.13
## 5    19          5            13    18       0.28
## 6    20          8            25    33       0.24

#Figure 8.1 on page 266
plot(Food,proportion,ylab = "Sample proportion",xlab = "Zagat Food Rating")

#R output on page 267
m1 <- glm(cbind(InMichelin,NotInMichelin)~Food,family = binomial)
summary(m1)

## 
## Call:
## glm(formula = cbind(InMichelin, NotInMichelin) ~ Food, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4850  -0.7987  -0.1679   0.5913   1.5889  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.84154    1.86236  -5.821 5.84e-09 ***
## Food          0.50124    0.08768   5.717 1.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61.427  on 13  degrees of freedom
## Residual deviance: 11.368  on 12  degrees of freedom
## AIC: 41.491
## 
## Number of Fisher Scoring iterations: 4

#Figure 8.2 on page 268
x <- seq(15,28,0.05)
y <- 1/(1+exp(-1*(m1$coeff[1] + m1$coeff[2]*x)))
plot(Food,proportion,ylab = "Probability of inclusion in the Michelin Guide",xlab = "Zagat Food Rating")
lines(x,y)

#Table 8.2 on page 269
thetahat <- m1$fitted.values
odds_ratio <- m1$fitted.values/(1-m1$fitted.values)
cbind(Food,round(thetahat,3),round(odds_ratio,3))

##    Food             
## 1    15 0.035  0.036
## 2    16 0.056  0.060
## 3    17 0.089  0.098
## 4    18 0.140  0.162
## 5    19 0.211  0.268
## 6    20 0.306  0.442
## 7    21 0.422  0.729
## 8    22 0.546  1.204
## 9    23 0.665  1.988
## 10   24 0.766  3.281
## 11   25 0.844  5.416
## 12   26 0.899  8.941
## 13   27 0.937 14.759
## 14   28 0.961 24.364

#p-value on page 272
pchisq(m1$deviance,m1$df.residual,lower = FALSE)

## [1] 0.4976357

#Value of the difference in devinace and associated p-value on page 273
m1$null.deviance-m1$deviance

## [1] 50.05861

pchisq(m1$null.deviance-m1$deviance,1,lower = FALSE)

## [1] 1.492217e-12

#Logistic regression output on page 274
print(paste("Pearson's X^2  = ",round(sum(residuals(m1,type = "pearson")^2),3)))

## [1] "Pearson's X^2  =  11.999"

#Table 8.3 on page 276
cbind(round(residuals(m1,"response"),3),round(residuals(m1,"pearson"),3),round(residuals(m1,"deviance"),3))

##      [,1]   [,2]   [,3]
## 1  -0.035 -0.190 -0.266
## 2  -0.056 -0.244 -0.340
## 3  -0.089 -0.886 -1.224
## 4  -0.006 -0.069 -0.070
## 5   0.067  0.693  0.670
## 6  -0.064 -0.798 -0.815
## 7   0.155  1.602  1.589
## 8  -0.213 -1.482 -1.485
## 9   0.001  0.012  0.012
## 10  0.091  0.567  0.599
## 11  0.073  0.693  0.749
## 12 -0.399 -1.878 -1.426
## 13 -0.079 -0.862 -0.748
## 14  0.039  0.405  0.567

#Figure 8.3 on page 276
hvalues <- influence(m1)$hat
stanresDeviance <- residuals(m1)/sqrt(1-hvalues)
stanresPearson <- residuals(m1,"pearson")/sqrt(1-hvalues)
par(mfrow = c(1,2))
plot(Food,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Food Rating",ylim = c(-2,2))
plot(Food,stanresPearson,ylab = "Standardized Pearson Residuals",xlab = "Food Rating",ylim = c(-2,2))

detach(MichelinFood)

MichelinNY <- read_excel("MARData.xlsx", sheet  =  "MichelinNY")
attach(MichelinNY)
head(MichelinNY)

## # A tibble: 6 x 6
##   InMichelin RestaurantName  Food Decor Service Price
##        <dbl> <chr>          <dbl> <dbl>   <dbl> <dbl>
## 1          0 14 Wall Street    19    20      19    50
## 2          0 212               17    17      16    43
## 3          0 26 Seats          23    17      21    35
## 4          1 44                19    23      16    52
## 5          0 A                 23    12      19    24
## 6          0 A.O.C.            18    17      17    36

y <- InMichelin

#Figure 8.4 on page 278
par(mfrow = c(1,1))
plot(jitter(Food,amount = .15),jitter(y,amount = 0.03),xlab = "Food Rating",
ylab = "In Michelin Guide? (0 = No, 1 = Yes)")

#Figure 8.5 on page 279
boxplot(Food~y, ylab = "Food Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")

#Logistic regression output on page 279
m1 <- glm(y~Food,family = binomial(),data = MichelinNY)
summary(m1)

## 
## Call:
## glm(formula = y ~ Food, family = binomial(), data = MichelinNY)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3484  -0.8555  -0.4329   0.9028   1.9847  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.84154    1.86234  -5.821 5.83e-09 ***
## Food          0.50124    0.08767   5.717 1.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.79  on 163  degrees of freedom
## Residual deviance: 175.73  on 162  degrees of freedom
## AIC: 179.73
## 
## Number of Fisher Scoring iterations: 4

#Figure 8.6 on page 281
hvalues <- influence(m1)$hat
stanresDeviance <- residuals(m1)/sqrt(1-hvalues)
#Alternatively we could use 
#stanresDeviance < rstandard(m1)
stanresPearson <- residuals(m1,"pearson")/sqrt(1-hvalues)
par(mfrow = c(1,2))
plot(Food,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Food Rating",ylim = c(-4.1,4.1))
plot(Food,stanresPearson,ylab = "Standardized Pearson Residuals",xlab = "Food Rating",ylim = c(-4.1,4.1))

#Figure 8.7 on page 282
par(mfrow = c(1,1))
xx <- seq(15,28.2,0.05)
yy <- 1/(1+exp(-1*(m1$coeff[1] + m1$coeff[2]*xx)))
loessfit1 <- loess(y ~ Food,degree = 1,span = 2/3)
plot(jitter(Food,amount = .15),jitter(y,amount = 0.03),xlab = "Food Rating",
ylab = "In Michelin Guide? (0 = No, 1 = Yes)")
lines(xx,yy)
lines(xx,predict(loessfit1,data.frame(Food = xx)),lty = 2)

#Figure 8.8 on page 286
par(mfrow = c(2,2))
boxplot(Food~y, ylab = "Food Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Decor~y, ylab = "Decor Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Service~y, ylab = "Service Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Price~y, ylab = "Price",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")

#Figure 8.9 on page 288
m2 <- glm(y~Food+Decor+Service+Price+log(Price),family = binomial(),data = MichelinNY)
loessfit1 <- loess(y ~ Food,degree = 1,span = 2/3)
loessfit2 <- loess(m2$fitted.values ~ Food,degree = 1,span = 2/3)
xx <- seq(15,28.2,0.05)
summary(m2)

## 
## Call:
## glm(formula = y ~ Food + Decor + Service + Price + log(Price), 
##     family = binomial(), data = MichelinNY)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8391  -0.6200  -0.2182   0.6217   2.2567  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -43.70965    8.93276  -4.893 9.92e-07 ***
## Food          0.57208    0.16513   3.464 0.000531 ***
## Decor         0.04518    0.09221   0.490 0.624156    
## Service      -0.30898    0.14071  -2.196 0.028103 *  
## Price        -0.10484    0.03387  -3.095 0.001967 ** 
## log(Price)   10.85850    2.82399   3.845 0.000121 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.79  on 163  degrees of freedom
## Residual deviance: 136.43  on 158  degrees of freedom
## AIC: 148.43
## 
## Number of Fisher Scoring iterations: 5

par(mfrow = c(1,2))
plot(Food,y,xlab = "Food Rating, x1", ylab = "Y, In Michelin Guide? (0 = No, 1 = Yes)")
lines(xx,predict(loessfit1,data.frame(Food = xx)))
#lines(lowess(Food,y,iter = 1,f = 2/3))
plot(Food,m2$fitted.values,ylab = expression(hat(Y)),xlab = "Food Rating, x1")
lines(xx,predict(loessfit2,data.frame(Food = xx)))

#Figure 8.10 on page 288
library(alr3)
mmps(m2,layout = c(2,3),key = FALSE)

#Figure 8.11 on page 289
par(mfrow = c(1,1))
plot(Decor,Service,pch = y+1,col = y+1,xlab = "Decor Rating",ylab = "Service Rating")
abline(lsfit(Decor[y  == 0],Service[y  == 0]),lty = 1,col = 1)
abline(lsfit(Decor[y  == 1],Service[y  == 1]),lty = 2,col = 2)
legend(14, 28,legend = c("No","Yes"),pch = 1:2,col = 1:2,title = "In Michelin Guide?")

#Figure 8.12 on page 290
m3 <- glm(y~Food+Decor+Service+Price+log(Price)+Service:Decor,family = binomial(),data = MichelinNY)
mmps(m3,layout = c(2,3),key = FALSE)

#Output from R on page 290
anova(m2,m3,test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: y ~ Food + Decor + Service + Price + log(Price)
## Model 2: y ~ Food + Decor + Service + Price + log(Price) + Service:Decor
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       158     136.43                       
## 2       157     129.82  1   6.6106  0.01014 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Figure 8.13 on page 291
par(mfrow = c(1,1))
hvalues <- influence(m3)$hat
stanresDeviance <- residuals(m3)/sqrt(1-hvalues)
plot(hvalues,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Leverage Values",ylim = c(-3,3),xlim = c(-0.05,0.7))
abline(v = 2*7/length(y),lty = 2)
identify(hvalues,stanresDeviance,labels = RestaurantName,cex = 0.75)

## integer(0)

#Output from R on pages 291 and 292
summary(m3)

## 
## Call:
## glm(formula = y ~ Food + Decor + Service + Price + log(Price) + 
##     Service:Decor, family = binomial(), data = MichelinNY)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5235  -0.6045  -0.1292   0.5332   2.3757  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -70.85308   15.45786  -4.584 4.57e-06 ***
## Food            0.66996    0.18276   3.666 0.000247 ***
## Decor           1.29788    0.49299   2.633 0.008471 ** 
## Service         0.91971    0.48829   1.884 0.059632 .  
## Price          -0.07456    0.04416  -1.688 0.091347 .  
## log(Price)     10.96400    3.22845   3.396 0.000684 ***
## Decor:Service  -0.06551    0.02512  -2.608 0.009119 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.79  on 163  degrees of freedom
## Residual deviance: 129.82  on 157  degrees of freedom
## AIC: 143.82
## 
## Number of Fisher Scoring iterations: 6

#Output from R on pages 292 and 293
m4 <- glm(y~Food+Decor+Service+log(Price)+Service:Decor,family = binomial(),data = MichelinNY)
anova(m4,m3,test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: y ~ Food + Decor + Service + log(Price) + Service:Decor
## Model 2: y ~ Food + Decor + Service + Price + log(Price) + Service:Decor
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       158     131.23                     
## 2       157     129.82  1   1.4086   0.2353

summary(m4)

## 
## Call:
## glm(formula = y ~ Food + Decor + Service + log(Price) + Service:Decor, 
##     family = binomial(), data = MichelinNY)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6555  -0.6270  -0.1598   0.5398   2.2935  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -63.76436   14.09848  -4.523 6.10e-06 ***
## Food            0.64274    0.17825   3.606 0.000311 ***
## Decor           1.50597    0.47883   3.145 0.001660 ** 
## Service         1.12633    0.47068   2.393 0.016711 *  
## log(Price)      7.29827    1.81062   4.031 5.56e-05 ***
## Decor:Service  -0.07613    0.02448  -3.110 0.001873 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.79  on 163  degrees of freedom
## Residual deviance: 131.23  on 158  degrees of freedom
## AIC: 143.23
## 
## Number of Fisher Scoring iterations: 6

#Figure 8.14 on page 294
mmps(m4,layout = c(2,3),key = FALSE)

#Figure 8.15 on page 295
par(mfrow = c(1,1))
hvalues <- influence(m4)$hat
stanresDeviance <- residuals(m4)/sqrt(1-hvalues)
plot(hvalues,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Leverage Values",ylim = c(-3,3),xlim = c(-0.05,0.35))
abline(v = 2*6/length(y),lty = 2)
identify(hvalues,stanresDeviance,labels = RestaurantName,cex = 0.75)

## integer(0)

#Table 8.5 on page 295
fits4 <- m4$fitted.values
round(fits4[c(14,37,69,133,135,138,160)],3)

##    14    37    69   133   135   138   160 
## 0.971 0.934 0.125 0.103 0.081 0.072 0.922

detach(MichelinNY)

#################EXERCISES

#Exercise 8.3.1
playoffs <- read_excel("MARData.xlsx", sheet  =  "playoffs")
attach(playoffs)
head(playoffs)

## # A tibble: 6 x 6
##   Team     Population AverageWins PlayoffAppearances     n Proportion
##   <chr>         <dbl>       <dbl>              <dbl> <dbl>      <dbl>
## 1 Mets          18.3         80.1                  2    10        0.2
## 2 Yankees       18.3         96.6                 10    10        1  
## 3 Angels        12.4         81.2                  2    10        0.2
## 4 Dodgers       12.4         85.8                  3    10        0.3
## 5 Cubs           9.10        77.1                  2    10        0.2
## 6 WhiteSox       9.10        81.6                  1    10        0.1

#Figure 8.16 on page 296
with(playoffs, plot(Population,PlayoffAppearances,xlab = "x, Population (in millions)", 
ylab = "Y, Play off Appearances (in 10 seasons)"))

#Output from R on page 296
m1 <- with(playoffs, lm(PlayoffAppearances~Population))
summary(m1)

## 
## Call:
## lm(formula = PlayoffAppearances ~ Population)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8409 -2.1347 -0.7179  1.5085  7.5298 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.7547     0.7566   2.319   0.0279 *
## Population    0.1684     0.1083   1.555   0.1311  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.619 on 28 degrees of freedom
## Multiple R-squared:  0.07952,    Adjusted R-squared:  0.04664 
## F-statistic: 2.419 on 1 and 28 DF,  p-value: 0.1311

detach(playoffs)

#Exercise 8.3.3
ex833 <- read_excel("MARData.xlsx", sheet  =  "HeartDisease")
attach(ex833)
head(ex833)

## # A tibble: 6 x 7
##    Case    x1    x2    x3    x4    x5 HeartDisease
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>        <dbl>
## 1     1   160  5.73     1  25.3    52            1
## 2     2   144  4.41     0  28.9    63            1
## 3     3   118  3.48     1  29.1    46            0
## 4     4   170  6.41     1  32.0    58            1
## 5     5   134  3.5      1  26.0    49            1
## 6     6   132  6.47     1  30.8    45            0

#Output from R for model (8.6) on page 299
m1 <- glm(HeartDisease~x1 + x2 + x3 + x4 + x5,
family = binomial(),data = ex833)
summary(m1)

## 
## Call:
## glm(formula = HeartDisease ~ x1 + x2 + x3 + x4 + x5, family = binomial(), 
##     data = ex833)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7271  -0.8713  -0.4727   0.9598   2.4682  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.313426   0.943928  -4.570 4.89e-06 ***
## x1           0.006435   0.005503   1.169 0.242227    
## x2           0.186163   0.056325   3.305 0.000949 ***
## x3           0.903863   0.221009   4.090 4.32e-05 ***
## x4          -0.035640   0.028833  -1.236 0.216433    
## x5           0.052780   0.009512   5.549 2.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 493.62  on 456  degrees of freedom
## AIC: 505.62
## 
## Number of Fisher Scoring iterations: 4

#Figure 8.17 on page 298
library(alr3)
par(mfrow = c(3,2))
with(ex833, mmp(m1,x1,key = FALSE))
mmp(m1,x2,key = FALSE)
mmp(m1,x4,key = FALSE)
mmp(m1,x5,key = FALSE)
mmp(m1,m1$fitted.values,xlab = "Fitted Values",key = FALSE)

#Figure 8.18 on page 299
y <- HeartDisease
par(mfrow = c(2,1))

plot(density(x1[y  == 0],bw = "SJ",kern = "gaussian"),type = "l",
main = "Gaussian kernel density estimate",xlab = "x1")
rug(x1[y  == 0])
lines(density(x1[y  == 1],bw = "SJ",kern = "gaussian"),lty = 2)
rug(x1[y  == 1])
legend(190, 0.0275,legend = c("No","Yes"),lty = 1:2,title = "Heart Disease?")
plot(density(x4[y  == 0],bw = "SJ",kern = "gaussian"),type = "l",ylim = c(0,0.1),
main = "Gaussian kernel density estimate",xlab = "x4")
rug(x4[y  == 0])
lines(density(x4[y  == 1],bw = "SJ",kern = "gaussian"),lty = 2)
rug(x4[y  == 1])
legend(40.5, 0.1,legend = c("No","Yes"),lty = 1:2,title = "Heart Disease?")

#Output from R for model (8.7) on page 300
f1x1 <- log(x1)
f2x4 <- log(x4)
m2 <- glm(HeartDisease~x1 + f1x1 + x2 + x3 + x4 + f2x4 + x5,
family = binomial(),data = ex833)
summary(m2)

## 
## Call:
## glm(formula = HeartDisease ~ x1 + f1x1 + x2 + x3 + x4 + f2x4 + 
##     x5, family = binomial(), data = ex833)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7017  -0.8352  -0.4550   0.9444   2.3065  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  75.204768  33.830217   2.223 0.026215 *  
## x1            0.096894   0.052664   1.840 0.065792 .  
## f1x1        -13.426632   7.778559  -1.726 0.084328 .  
## x2            0.201285   0.057220   3.518 0.000435 ***
## x3            0.941056   0.224274   4.196 2.72e-05 ***
## x4            0.384608   0.208016   1.849 0.064467 .  
## f2x4        -11.443233   5.706058  -2.005 0.044915 *  
## x5            0.056111   0.009675   5.800 6.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 486.74  on 454  degrees of freedom
## AIC: 502.74
## 
## Number of Fisher Scoring iterations: 4

#Figure 8.19 on page 300
par(mfrow = c(3,3))
mmp(m2,ex833$x1,key = FALSE)
mmp(m2,f1x1,key = FALSE)
mmp(m2,x2,key = FALSE)
mmp(m2,x4,key = FALSE)
mmp(m2,f2x4,key = FALSE)
mmp(m2,x5,key = FALSE)
mmp(m2,m2$fitted.values,xlab = "Fitted Values",key = FALSE)

detach(ex833)

#Exercise 8.3.6
library(alr3)
data(banknote)
attach(banknote)
head(banknote)

##   Length  Left Right Bottom  Top Diagonal Y
## 1  214.8 131.0 131.1    9.0  9.7    141.0 0
## 2  214.6 129.7 129.7    8.1  9.5    141.7 0
## 3  214.8 129.7 129.7    8.7  9.6    142.2 0
## 4  214.8 129.7 129.6    7.5 10.4    142.0 0
## 5  215.0 129.6 129.7   10.4  7.7    141.8 0
## 6  215.7 130.8 130.5    9.0 10.1    141.4 0

#Figure 8.20 on page 303
par(mfrow = c(1,1))
plot(Diagonal,Bottom,pch = Y+1,col = Y+1)
legend(141, 12.5,legend = c("Yes","No"),pch = 1:2,col = 1:2,title = "Counterfeit?")

detach(banknote)

Logistic Regression

A Student

2022-03-30

CHAPTER 8 OUTPUT