This shows the outputs from Chapter 8 using R. The textbook is “A Modern Approach to Regression with R” by Simon J. Sheather (2008). The R code provided with the book has been updated.
MichelinFood <- read_excel("MARData.xlsx", sheet = "MichelinFood")
attach(MichelinFood)
head(MichelinFood)
## # A tibble: 6 x 5
## Food InMichelin NotInMichelin mi proportion
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15 0 1 1 0
## 2 16 0 1 1 0
## 3 17 0 8 8 0
## 4 18 2 13 15 0.13
## 5 19 5 13 18 0.28
## 6 20 8 25 33 0.24
#Figure 8.1 on page 266
plot(Food,proportion,ylab = "Sample proportion",xlab = "Zagat Food Rating")
#R output on page 267
m1 <- glm(cbind(InMichelin,NotInMichelin)~Food,family = binomial)
summary(m1)
##
## Call:
## glm(formula = cbind(InMichelin, NotInMichelin) ~ Food, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4850 -0.7987 -0.1679 0.5913 1.5889
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.84154 1.86236 -5.821 5.84e-09 ***
## Food 0.50124 0.08768 5.717 1.08e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 61.427 on 13 degrees of freedom
## Residual deviance: 11.368 on 12 degrees of freedom
## AIC: 41.491
##
## Number of Fisher Scoring iterations: 4
#Figure 8.2 on page 268
x <- seq(15,28,0.05)
y <- 1/(1+exp(-1*(m1$coeff[1] + m1$coeff[2]*x)))
plot(Food,proportion,ylab = "Probability of inclusion in the Michelin Guide",xlab = "Zagat Food Rating")
lines(x,y)
#Table 8.2 on page 269
thetahat <- m1$fitted.values
odds_ratio <- m1$fitted.values/(1-m1$fitted.values)
cbind(Food,round(thetahat,3),round(odds_ratio,3))
## Food
## 1 15 0.035 0.036
## 2 16 0.056 0.060
## 3 17 0.089 0.098
## 4 18 0.140 0.162
## 5 19 0.211 0.268
## 6 20 0.306 0.442
## 7 21 0.422 0.729
## 8 22 0.546 1.204
## 9 23 0.665 1.988
## 10 24 0.766 3.281
## 11 25 0.844 5.416
## 12 26 0.899 8.941
## 13 27 0.937 14.759
## 14 28 0.961 24.364
#p-value on page 272
pchisq(m1$deviance,m1$df.residual,lower = FALSE)
## [1] 0.4976357
#Value of the difference in devinace and associated p-value on page 273
m1$null.deviance-m1$deviance
## [1] 50.05861
pchisq(m1$null.deviance-m1$deviance,1,lower = FALSE)
## [1] 1.492217e-12
#Logistic regression output on page 274
print(paste("Pearson's X^2 = ",round(sum(residuals(m1,type = "pearson")^2),3)))
## [1] "Pearson's X^2 = 11.999"
#Table 8.3 on page 276
cbind(round(residuals(m1,"response"),3),round(residuals(m1,"pearson"),3),round(residuals(m1,"deviance"),3))
## [,1] [,2] [,3]
## 1 -0.035 -0.190 -0.266
## 2 -0.056 -0.244 -0.340
## 3 -0.089 -0.886 -1.224
## 4 -0.006 -0.069 -0.070
## 5 0.067 0.693 0.670
## 6 -0.064 -0.798 -0.815
## 7 0.155 1.602 1.589
## 8 -0.213 -1.482 -1.485
## 9 0.001 0.012 0.012
## 10 0.091 0.567 0.599
## 11 0.073 0.693 0.749
## 12 -0.399 -1.878 -1.426
## 13 -0.079 -0.862 -0.748
## 14 0.039 0.405 0.567
#Figure 8.3 on page 276
hvalues <- influence(m1)$hat
stanresDeviance <- residuals(m1)/sqrt(1-hvalues)
stanresPearson <- residuals(m1,"pearson")/sqrt(1-hvalues)
par(mfrow = c(1,2))
plot(Food,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Food Rating",ylim = c(-2,2))
plot(Food,stanresPearson,ylab = "Standardized Pearson Residuals",xlab = "Food Rating",ylim = c(-2,2))
detach(MichelinFood)
MichelinNY <- read_excel("MARData.xlsx", sheet = "MichelinNY")
attach(MichelinNY)
head(MichelinNY)
## # A tibble: 6 x 6
## InMichelin RestaurantName Food Decor Service Price
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 0 14 Wall Street 19 20 19 50
## 2 0 212 17 17 16 43
## 3 0 26 Seats 23 17 21 35
## 4 1 44 19 23 16 52
## 5 0 A 23 12 19 24
## 6 0 A.O.C. 18 17 17 36
y <- InMichelin
#Figure 8.4 on page 278
par(mfrow = c(1,1))
plot(jitter(Food,amount = .15),jitter(y,amount = 0.03),xlab = "Food Rating",
ylab = "In Michelin Guide? (0 = No, 1 = Yes)")
#Figure 8.5 on page 279
boxplot(Food~y, ylab = "Food Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
#Logistic regression output on page 279
m1 <- glm(y~Food,family = binomial(),data = MichelinNY)
summary(m1)
##
## Call:
## glm(formula = y ~ Food, family = binomial(), data = MichelinNY)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3484 -0.8555 -0.4329 0.9028 1.9847
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.84154 1.86234 -5.821 5.83e-09 ***
## Food 0.50124 0.08767 5.717 1.08e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.79 on 163 degrees of freedom
## Residual deviance: 175.73 on 162 degrees of freedom
## AIC: 179.73
##
## Number of Fisher Scoring iterations: 4
#Figure 8.6 on page 281
hvalues <- influence(m1)$hat
stanresDeviance <- residuals(m1)/sqrt(1-hvalues)
#Alternatively we could use
#stanresDeviance < rstandard(m1)
stanresPearson <- residuals(m1,"pearson")/sqrt(1-hvalues)
par(mfrow = c(1,2))
plot(Food,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Food Rating",ylim = c(-4.1,4.1))
plot(Food,stanresPearson,ylab = "Standardized Pearson Residuals",xlab = "Food Rating",ylim = c(-4.1,4.1))
#Figure 8.7 on page 282
par(mfrow = c(1,1))
xx <- seq(15,28.2,0.05)
yy <- 1/(1+exp(-1*(m1$coeff[1] + m1$coeff[2]*xx)))
loessfit1 <- loess(y ~ Food,degree = 1,span = 2/3)
plot(jitter(Food,amount = .15),jitter(y,amount = 0.03),xlab = "Food Rating",
ylab = "In Michelin Guide? (0 = No, 1 = Yes)")
lines(xx,yy)
lines(xx,predict(loessfit1,data.frame(Food = xx)),lty = 2)
#Figure 8.8 on page 286
par(mfrow = c(2,2))
boxplot(Food~y, ylab = "Food Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Decor~y, ylab = "Decor Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Service~y, ylab = "Service Rating",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
boxplot(Price~y, ylab = "Price",xlab = "In Michelin Guide? (0 = No, 1 = Yes)")
#Figure 8.9 on page 288
m2 <- glm(y~Food+Decor+Service+Price+log(Price),family = binomial(),data = MichelinNY)
loessfit1 <- loess(y ~ Food,degree = 1,span = 2/3)
loessfit2 <- loess(m2$fitted.values ~ Food,degree = 1,span = 2/3)
xx <- seq(15,28.2,0.05)
summary(m2)
##
## Call:
## glm(formula = y ~ Food + Decor + Service + Price + log(Price),
## family = binomial(), data = MichelinNY)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8391 -0.6200 -0.2182 0.6217 2.2567
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -43.70965 8.93276 -4.893 9.92e-07 ***
## Food 0.57208 0.16513 3.464 0.000531 ***
## Decor 0.04518 0.09221 0.490 0.624156
## Service -0.30898 0.14071 -2.196 0.028103 *
## Price -0.10484 0.03387 -3.095 0.001967 **
## log(Price) 10.85850 2.82399 3.845 0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.79 on 163 degrees of freedom
## Residual deviance: 136.43 on 158 degrees of freedom
## AIC: 148.43
##
## Number of Fisher Scoring iterations: 5
par(mfrow = c(1,2))
plot(Food,y,xlab = "Food Rating, x1", ylab = "Y, In Michelin Guide? (0 = No, 1 = Yes)")
lines(xx,predict(loessfit1,data.frame(Food = xx)))
#lines(lowess(Food,y,iter = 1,f = 2/3))
plot(Food,m2$fitted.values,ylab = expression(hat(Y)),xlab = "Food Rating, x1")
lines(xx,predict(loessfit2,data.frame(Food = xx)))
#Figure 8.10 on page 288
library(alr3)
mmps(m2,layout = c(2,3),key = FALSE)
#Figure 8.11 on page 289
par(mfrow = c(1,1))
plot(Decor,Service,pch = y+1,col = y+1,xlab = "Decor Rating",ylab = "Service Rating")
abline(lsfit(Decor[y == 0],Service[y == 0]),lty = 1,col = 1)
abline(lsfit(Decor[y == 1],Service[y == 1]),lty = 2,col = 2)
legend(14, 28,legend = c("No","Yes"),pch = 1:2,col = 1:2,title = "In Michelin Guide?")
#Figure 8.12 on page 290
m3 <- glm(y~Food+Decor+Service+Price+log(Price)+Service:Decor,family = binomial(),data = MichelinNY)
mmps(m3,layout = c(2,3),key = FALSE)
#Output from R on page 290
anova(m2,m3,test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: y ~ Food + Decor + Service + Price + log(Price)
## Model 2: y ~ Food + Decor + Service + Price + log(Price) + Service:Decor
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 158 136.43
## 2 157 129.82 1 6.6106 0.01014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Figure 8.13 on page 291
par(mfrow = c(1,1))
hvalues <- influence(m3)$hat
stanresDeviance <- residuals(m3)/sqrt(1-hvalues)
plot(hvalues,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Leverage Values",ylim = c(-3,3),xlim = c(-0.05,0.7))
abline(v = 2*7/length(y),lty = 2)
identify(hvalues,stanresDeviance,labels = RestaurantName,cex = 0.75)
## integer(0)
#Output from R on pages 291 and 292
summary(m3)
##
## Call:
## glm(formula = y ~ Food + Decor + Service + Price + log(Price) +
## Service:Decor, family = binomial(), data = MichelinNY)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5235 -0.6045 -0.1292 0.5332 2.3757
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -70.85308 15.45786 -4.584 4.57e-06 ***
## Food 0.66996 0.18276 3.666 0.000247 ***
## Decor 1.29788 0.49299 2.633 0.008471 **
## Service 0.91971 0.48829 1.884 0.059632 .
## Price -0.07456 0.04416 -1.688 0.091347 .
## log(Price) 10.96400 3.22845 3.396 0.000684 ***
## Decor:Service -0.06551 0.02512 -2.608 0.009119 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.79 on 163 degrees of freedom
## Residual deviance: 129.82 on 157 degrees of freedom
## AIC: 143.82
##
## Number of Fisher Scoring iterations: 6
#Output from R on pages 292 and 293
m4 <- glm(y~Food+Decor+Service+log(Price)+Service:Decor,family = binomial(),data = MichelinNY)
anova(m4,m3,test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: y ~ Food + Decor + Service + log(Price) + Service:Decor
## Model 2: y ~ Food + Decor + Service + Price + log(Price) + Service:Decor
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 158 131.23
## 2 157 129.82 1 1.4086 0.2353
summary(m4)
##
## Call:
## glm(formula = y ~ Food + Decor + Service + log(Price) + Service:Decor,
## family = binomial(), data = MichelinNY)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6555 -0.6270 -0.1598 0.5398 2.2935
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.76436 14.09848 -4.523 6.10e-06 ***
## Food 0.64274 0.17825 3.606 0.000311 ***
## Decor 1.50597 0.47883 3.145 0.001660 **
## Service 1.12633 0.47068 2.393 0.016711 *
## log(Price) 7.29827 1.81062 4.031 5.56e-05 ***
## Decor:Service -0.07613 0.02448 -3.110 0.001873 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.79 on 163 degrees of freedom
## Residual deviance: 131.23 on 158 degrees of freedom
## AIC: 143.23
##
## Number of Fisher Scoring iterations: 6
#Figure 8.14 on page 294
mmps(m4,layout = c(2,3),key = FALSE)
#Figure 8.15 on page 295
par(mfrow = c(1,1))
hvalues <- influence(m4)$hat
stanresDeviance <- residuals(m4)/sqrt(1-hvalues)
plot(hvalues,stanresDeviance,ylab = "Standardized Deviance Residuals",xlab = "Leverage Values",ylim = c(-3,3),xlim = c(-0.05,0.35))
abline(v = 2*6/length(y),lty = 2)
identify(hvalues,stanresDeviance,labels = RestaurantName,cex = 0.75)
## integer(0)
#Table 8.5 on page 295
fits4 <- m4$fitted.values
round(fits4[c(14,37,69,133,135,138,160)],3)
## 14 37 69 133 135 138 160
## 0.971 0.934 0.125 0.103 0.081 0.072 0.922
detach(MichelinNY)
#################EXERCISES
#Exercise 8.3.1
playoffs <- read_excel("MARData.xlsx", sheet = "playoffs")
attach(playoffs)
head(playoffs)
## # A tibble: 6 x 6
## Team Population AverageWins PlayoffAppearances n Proportion
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mets 18.3 80.1 2 10 0.2
## 2 Yankees 18.3 96.6 10 10 1
## 3 Angels 12.4 81.2 2 10 0.2
## 4 Dodgers 12.4 85.8 3 10 0.3
## 5 Cubs 9.10 77.1 2 10 0.2
## 6 WhiteSox 9.10 81.6 1 10 0.1
#Figure 8.16 on page 296
with(playoffs, plot(Population,PlayoffAppearances,xlab = "x, Population (in millions)",
ylab = "Y, Play off Appearances (in 10 seasons)"))
#Output from R on page 296
m1 <- with(playoffs, lm(PlayoffAppearances~Population))
summary(m1)
##
## Call:
## lm(formula = PlayoffAppearances ~ Population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8409 -2.1347 -0.7179 1.5085 7.5298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7547 0.7566 2.319 0.0279 *
## Population 0.1684 0.1083 1.555 0.1311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.619 on 28 degrees of freedom
## Multiple R-squared: 0.07952, Adjusted R-squared: 0.04664
## F-statistic: 2.419 on 1 and 28 DF, p-value: 0.1311
detach(playoffs)
#Exercise 8.3.3
ex833 <- read_excel("MARData.xlsx", sheet = "HeartDisease")
attach(ex833)
head(ex833)
## # A tibble: 6 x 7
## Case x1 x2 x3 x4 x5 HeartDisease
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 160 5.73 1 25.3 52 1
## 2 2 144 4.41 0 28.9 63 1
## 3 3 118 3.48 1 29.1 46 0
## 4 4 170 6.41 1 32.0 58 1
## 5 5 134 3.5 1 26.0 49 1
## 6 6 132 6.47 1 30.8 45 0
#Output from R for model (8.6) on page 299
m1 <- glm(HeartDisease~x1 + x2 + x3 + x4 + x5,
family = binomial(),data = ex833)
summary(m1)
##
## Call:
## glm(formula = HeartDisease ~ x1 + x2 + x3 + x4 + x5, family = binomial(),
## data = ex833)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7271 -0.8713 -0.4727 0.9598 2.4682
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.313426 0.943928 -4.570 4.89e-06 ***
## x1 0.006435 0.005503 1.169 0.242227
## x2 0.186163 0.056325 3.305 0.000949 ***
## x3 0.903863 0.221009 4.090 4.32e-05 ***
## x4 -0.035640 0.028833 -1.236 0.216433
## x5 0.052780 0.009512 5.549 2.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 493.62 on 456 degrees of freedom
## AIC: 505.62
##
## Number of Fisher Scoring iterations: 4
#Figure 8.17 on page 298
library(alr3)
par(mfrow = c(3,2))
with(ex833, mmp(m1,x1,key = FALSE))
mmp(m1,x2,key = FALSE)
mmp(m1,x4,key = FALSE)
mmp(m1,x5,key = FALSE)
mmp(m1,m1$fitted.values,xlab = "Fitted Values",key = FALSE)
#Figure 8.18 on page 299
y <- HeartDisease
par(mfrow = c(2,1))
plot(density(x1[y == 0],bw = "SJ",kern = "gaussian"),type = "l",
main = "Gaussian kernel density estimate",xlab = "x1")
rug(x1[y == 0])
lines(density(x1[y == 1],bw = "SJ",kern = "gaussian"),lty = 2)
rug(x1[y == 1])
legend(190, 0.0275,legend = c("No","Yes"),lty = 1:2,title = "Heart Disease?")
plot(density(x4[y == 0],bw = "SJ",kern = "gaussian"),type = "l",ylim = c(0,0.1),
main = "Gaussian kernel density estimate",xlab = "x4")
rug(x4[y == 0])
lines(density(x4[y == 1],bw = "SJ",kern = "gaussian"),lty = 2)
rug(x4[y == 1])
legend(40.5, 0.1,legend = c("No","Yes"),lty = 1:2,title = "Heart Disease?")
#Output from R for model (8.7) on page 300
f1x1 <- log(x1)
f2x4 <- log(x4)
m2 <- glm(HeartDisease~x1 + f1x1 + x2 + x3 + x4 + f2x4 + x5,
family = binomial(),data = ex833)
summary(m2)
##
## Call:
## glm(formula = HeartDisease ~ x1 + f1x1 + x2 + x3 + x4 + f2x4 +
## x5, family = binomial(), data = ex833)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7017 -0.8352 -0.4550 0.9444 2.3065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 75.204768 33.830217 2.223 0.026215 *
## x1 0.096894 0.052664 1.840 0.065792 .
## f1x1 -13.426632 7.778559 -1.726 0.084328 .
## x2 0.201285 0.057220 3.518 0.000435 ***
## x3 0.941056 0.224274 4.196 2.72e-05 ***
## x4 0.384608 0.208016 1.849 0.064467 .
## f2x4 -11.443233 5.706058 -2.005 0.044915 *
## x5 0.056111 0.009675 5.800 6.64e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 486.74 on 454 degrees of freedom
## AIC: 502.74
##
## Number of Fisher Scoring iterations: 4
#Figure 8.19 on page 300
par(mfrow = c(3,3))
mmp(m2,ex833$x1,key = FALSE)
mmp(m2,f1x1,key = FALSE)
mmp(m2,x2,key = FALSE)
mmp(m2,x4,key = FALSE)
mmp(m2,f2x4,key = FALSE)
mmp(m2,x5,key = FALSE)
mmp(m2,m2$fitted.values,xlab = "Fitted Values",key = FALSE)
detach(ex833)
#Exercise 8.3.6
library(alr3)
data(banknote)
attach(banknote)
head(banknote)
## Length Left Right Bottom Top Diagonal Y
## 1 214.8 131.0 131.1 9.0 9.7 141.0 0
## 2 214.6 129.7 129.7 8.1 9.5 141.7 0
## 3 214.8 129.7 129.7 8.7 9.6 142.2 0
## 4 214.8 129.7 129.6 7.5 10.4 142.0 0
## 5 215.0 129.6 129.7 10.4 7.7 141.8 0
## 6 215.7 130.8 130.5 9.0 10.1 141.4 0
#Figure 8.20 on page 303
par(mfrow = c(1,1))
plot(Diagonal,Bottom,pch = Y+1,col = Y+1)
legend(141, 12.5,legend = c("Yes","No"),pch = 1:2,col = 1:2,title = "Counterfeit?")
detach(banknote)