env <- new.env()
load("../data.rda", envir = env)
df <- get("CH14TA01", envir = env)
names(df) <- c("x", "y", "fitted")
fit <- glm(y ~ x, family = binomial(link = "logit"), df)
cbind(
'Experience' = df$x,
'Task Success' = df$y,
'Fitted Value' = df$fitted,
'Residual' = resid(fit))
## Experience Task Success Fitted Value Residual
## 1 14 0 0.31026 -0.8619
## 2 29 0 0.83526 -1.8992
## 3 6 0 0.11000 -0.4828
## 4 25 1 0.72660 0.7992
## 5 18 1 0.46184 1.2430
## 6 4 0 0.08213 -0.4140
## 7 18 0 0.46184 -1.1132
## 8 12 0 0.24567 -0.7509
## 9 22 1 0.62081 0.9765
## 10 6 0 0.11000 -0.4828
## 11 30 1 0.85630 0.5570
## 12 11 0 0.21698 -0.6994
## 13 30 1 0.85630 0.5570
## 14 5 0 0.09515 -0.4472
## 15 20 1 0.54240 1.1061
## 16 13 0 0.27680 -0.8051
## 17 9 0 0.16710 -0.6047
## 18 32 1 0.89166 0.4789
## 19 24 0 0.69338 -1.5376
## 20 13 1 0.27680 1.6028
## 21 19 0 0.50213 -1.1810
## 22 4 0 0.08213 -0.4140
## 23 28 1 0.81183 0.6457
## 24 22 1 0.62081 0.9765
## 25 8 1 0.14582 1.9624
summary(fit) # Estimates and Standard Deviations
##
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.899 -0.751 -0.414 0.799 1.962
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.060 1.259 -2.43 0.015 *
## x 0.162 0.065 2.49 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 34.296 on 24 degrees of freedom
## Residual deviance: 25.425 on 23 degrees of freedom
## AIC: 29.42
##
## Number of Fisher Scoring iterations: 4
exp(coef(fit)[2]) # Estimated Odds Ratio
## x
## 1.175
exp(15 * coef(fit)[2]) # Estimated Odds Ratio for 15 Months Training (p 567)
## x
## 11.27
# Plot Logistic Regression and Loess Fit
xx <- with(df, seq(min(x), max(x), len = 200))
plot(y ~ x, df, pch = 19, col = "gray40", xlab = "Experience (X)", ylab = "Fitted Value")
lines(xx, predict(loess(y ~ x, df), data.frame(x = xx)), lty = 2, col = 'blue')
lines(xx, predict(fit, data.frame(x = xx), type = "resp"), lwd = 2)
title("Scatter Plot with Loess (blue) and Logistic Mean Response Functions")
Since we can see how the logistic curve looks previously, I made it gray so it would not stand out. Frankly, it is hard to see anyway because the probit curve (blue) lays nearly on top of it.
plot(y ~ x, df, pch = 19, col = "gray40", xlab = "Experience (X)", ylab = "Fitted Value")
title("Logistic (Gray), Probit (Blue), and \nComplementary Log-Log (Red) Fitted Models")
fit <- glm(y ~ x, data = df, family = binomial(link = "probit"))
lines(xx, predict(fit, data.frame(x = xx), type = "resp"), col = 'blue', lwd = 2, lty = 2)
fit <- glm(y ~ x, data = df, family = binomial(link = "cloglog"))
lines(xx, predict(fit, data.frame(x = xx), type = "resp"), col = 'red', lwd = 2, lty = 2)
fit <- glm(y ~ x, family = binomial(link = "logit"), df)
lines(xx, predict(fit, data.frame(x = xx), type = "resp"), col = 'gray40', lwd = 2)
df <- get("CH14TA02", envir = env)
names(df) <- c("x", "n", "y", "p")
# Recreate the data from the summary table
fit <- glm(y ~ x, family = binomial(link = 'logit'), data = data.frame(y = c(rep(1, sum(df$y)), rep(0, 1000 - sum(df$y))),
x = c(rep(df$x, df$y), rep(df$x, 200 - df$y))))
exp(coef(fit))
## (Intercept) x
## 0.1295 1.1017
plot(p ~ x, df, pch = 19, col = "gray40", ylim = c(0, 1), xlim = c(0, 40),
xlab = "Price Reduction ($)", ylab = "Proportion Redeemed")
lines(seq(0, 40, len = 200), predict(fit, data.frame(x = seq(0, 40, len = 200)), type = "resp"))
title("Proportion of Coupons Redeemed and \nFitted Logistic Response Function")
Since we cannot find the Coronary Heart Disease data set, we're going to use the Dwaine Studio data set mentioned at this point in the book. While it has a continuous response variable, we're going to recode it as a binary outcome.
df <- get("CH06FI05", envir = env)
names(df) <- c("x1", "x2", "y")
df <- transform(df, y = ifelse(y < median(y), 0, 1)) # Create dichotomous response
Here we define a modification to the lm3d function used at points throughout these tutorials. In particular, we update the predict command to include type = 'resp' to get the right sort of prediction intervals from predict.glm.
glm3d <- function(model, res, ...) {
ivs <- labels(terms(model))
x <- model.frame(model)[, ivs[1]] # 1st independent variable
y <- model.frame(model)[, ivs[2]] # 2nd independent variable
xr <- seq(min(x), max(x), len = res) # equidistant sequence from range of 1st iv
yr <- seq(min(y), max(y), len = res) # equidistant sequence from range of 2nd iv
# Create a list object of these sequences to be turned into a grid
newdata <- list(xr, yr)
names(newdata) <- ivs
newdata <- do.call('expand.grid', newdata)
zr <- matrix(predict(model, newdata, type = 'resp'), res, res)
persp(xr, yr, zr, ...)
}
fit <- glm(y ~ x1 + x2, binomial(link = 'logit'), df)
glm3d(fit, res=45, tick = "detailed", shade = 0.5, expand = 2/3, theta = 300, phi = 30,
xlab = "x1", ylab = "x2", zlab = "y")
glm3d(fit, res=45, tick = "detailed", shade = 0.5, expand = 2/3, theta = 330, phi = 0,
xlab = "x1", ylab = "x2", zlab = "y")
df <- get("CH14TA03", envir = env)
names(df) <- c('id', 'x1', 'x2', 'x3', 'x4', 'y')
fit <- glm(y ~ x1 + x2 + x3 + x4, data = df, family = binomial(link = 'logit'))
cbind(df, 'fitted' = fitted(fit))
## id x1 x2 x3 x4 y fitted
## 1 1 33 0 0 0 0 0.20896
## 2 2 35 0 0 0 0 0.21897
## 3 3 6 0 0 0 0 0.10579
## 4 4 60 0 0 0 0 0.37100
## 5 5 18 0 1 0 1 0.11079
## 6 6 26 0 1 0 0 0.13650
## 7 7 6 0 1 0 0 0.08020
## 8 8 31 1 0 0 1 0.27252
## 9 9 26 1 0 0 1 0.24404
## 10 10 37 1 0 0 0 0.30930
## 11 11 23 0 0 0 0 0.16401
## 12 12 23 0 0 0 0 0.16401
## 13 13 27 0 0 0 0 0.18099
## 14 14 9 0 0 0 1 0.11454
## 15 15 37 0 0 1 1 0.58966
## 16 16 22 0 0 1 1 0.47909
## 17 17 67 0 0 1 1 0.77818
## 18 18 8 0 0 1 0 0.37750
## 19 19 6 0 0 1 1 0.36362
## 20 20 15 0 0 1 1 0.42753
## 21 21 21 1 0 1 1 0.57331
## 22 22 32 1 0 1 1 0.65081
## 23 23 16 0 0 1 1 0.43483
## 24 24 11 1 0 1 0 0.49946
## 25 25 14 0 1 1 0 0.34820
## 26 26 9 1 0 1 0 0.48459
## 27 27 18 1 0 1 0 0.55134
## 28 28 2 0 1 0 0 0.07184
## 29 29 61 0 1 0 0 0.30929
## 30 30 20 0 1 0 0 0.11679
## 31 31 16 0 1 0 0 0.10506
## 32 32 9 1 0 0 0 0.16296
## 33 33 35 1 0 0 0 0.29674
## 34 34 4 0 0 0 0 0.10030
## 35 35 44 0 1 1 0 0.56600
## 36 36 11 0 1 1 1 0.32823
## 37 37 3 1 0 1 0 0.44025
## 38 38 6 0 1 1 0 0.29631
## 39 39 17 1 0 1 1 0.54398
## 40 40 1 0 1 1 0 0.26626
## 41 41 53 1 0 1 1 0.77684
## 42 42 13 0 0 1 1 0.41303
## 43 43 24 0 0 1 0 0.49395
## 44 44 70 0 0 1 1 0.79320
## 45 45 16 0 1 1 1 0.36183
## 46 46 12 1 0 1 0 0.50690
## 47 47 20 0 1 1 1 0.38973
## 48 48 65 0 1 1 0 0.70896
## 49 49 40 1 0 1 1 0.70279
## 50 50 38 1 0 1 1 0.69021
## 51 51 68 1 0 1 1 0.84470
## 52 52 74 0 0 1 1 0.81204
## 53 53 14 0 0 1 1 0.42027
## 54 54 27 0 0 1 1 0.51626
## 55 55 31 0 0 1 0 0.54589
## 56 56 18 0 0 1 0 0.44950
## 57 57 39 0 0 1 0 0.60398
## 58 58 50 0 0 1 0 0.67903
## 59 59 31 0 0 1 0 0.54589
## 60 60 61 0 0 1 0 0.74584
## 61 61 18 0 1 0 0 0.11079
## 62 62 5 0 1 0 0 0.07803
## 63 63 2 0 1 0 0 0.07184
## 64 64 16 0 1 0 0 0.10506
## 65 65 59 0 1 0 1 0.29673
## 66 66 22 0 1 0 0 0.12307
## 67 67 24 0 0 0 0 0.16813
## 68 68 30 0 0 0 0 0.19459
## 69 69 46 0 0 0 0 0.28001
## 70 70 28 0 0 0 0 0.18544
## 71 71 27 0 0 0 0 0.18099
## 72 72 27 0 0 0 1 0.18099
## 73 73 28 0 0 0 0 0.18544
## 74 74 52 0 0 0 1 0.31736
## 75 75 11 0 1 0 0 0.09188
## 76 76 6 1 0 0 0 0.15115
## 77 77 46 0 1 0 0 0.22275
## 78 78 20 1 0 0 1 0.21263
## 79 79 3 0 0 0 0 0.09764
## 80 80 18 1 0 0 0 0.20284
## 81 81 25 1 0 0 0 0.23860
## 82 82 6 0 1 0 0 0.08020
## 83 83 65 0 1 0 1 0.33527
## 84 84 51 0 1 0 0 0.24956
## 85 85 39 1 0 0 0 0.32215
## 86 86 8 0 0 0 0 0.11156
## 87 87 8 1 0 0 0 0.15894
## 88 88 14 0 1 0 0 0.09960
## 89 89 6 0 1 0 0 0.08020
## 90 90 6 0 1 0 0 0.08020
## 91 91 7 0 1 0 0 0.08242
## 92 92 4 0 1 0 0 0.07592
## 93 93 8 0 1 0 0 0.08470
## 94 94 9 1 0 0 0 0.16296
## 95 95 32 0 1 0 1 0.15893
## 96 96 19 0 1 0 0 0.11376
## 97 97 11 0 1 0 0 0.09188
## 98 98 35 0 1 0 0 0.17123
cbind(summary(fit)$coefficients, 'OR' = exp(coef(fit)))
## Estimate Std. Error z value Pr(>|z|) OR
## (Intercept) -2.31293 0.6426 -3.5994 0.0003189 0.09897
## x1 0.02975 0.0135 2.2033 0.0275770 1.03020
## x2 0.40879 0.5990 0.6825 0.4949543 1.50500
## x3 -0.30525 0.6041 -0.5053 0.6133615 0.73694
## x4 1.57475 0.5016 3.1393 0.0016934 4.82953
vcov(fit)
## (Intercept) x1 x2 x3 x4
## (Intercept) 0.412919 -0.0057142 -0.18357 -0.2009773 -0.163242
## x1 -0.005714 0.0001823 0.00115 0.0007319 0.000338
## x2 -0.183575 0.0011498 0.35881 0.1482218 0.012890
## x3 -0.200977 0.0007319 0.14822 0.3649711 0.062266
## x4 -0.163242 0.0003380 0.01289 0.0622665 0.251623
df <- get("APPENC11", envir = env)[2:3]
names(df) <- c('Y', 'X')
df <- transform(df, X = log(X)) # log scale X
df <- transform(df, x = scale(X, T, F), xx = scale(X, T, F)^2) # Center log X and square it
par(mfrow = c(1,2))
xr <- with(df, seq(min(X), max(X), len = 200)) # Plotting range values from log of X
fit <- glm(Y ~ X, binomial(link = 'logit'), df)
plot(Y ~ X, df, pch = 19, col = "gray")
title("(a) First-Order Fit")
lines(xr, predict(loess(Y ~ X, df), data.frame(X = xr)), lty = 2, lwd = 2, col = 'blue')
lines(xr, predict(fit, data.frame(X = xr), type = "resp"), lwd = 2)
fit <- glm(Y ~ x + xx, binomial(link = "logit"), df)
plot(Y ~ X, df, pch = 19, col = "gray")
title("(b) Second-Order Fit")
lines(xr, predict(loess(Y ~ X, df), data.frame(X = xr)), lty = 2, lwd = 2, col = 'blue')
lines(xr, predict(fit, data.frame(x = scale(xr, T, F), xx = scale(xr, T, F)^2), type = "resp"), lwd = 2)
summary(fit) # TABLE 14.5
##
## Call:
## glm(formula = Y ~ x + xx, family = binomial(link = "logit"),
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.35 -1.17 -0.43 1.06 2.89
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.300 0.124 2.42 0.015 *
## x 0.552 0.138 3.98 6.8e-05 ***
## xx -0.862 0.140 -6.14 8.5e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 661.20 on 481 degrees of freedom
## Residual deviance: 588.27 on 479 degrees of freedom
## AIC: 594.3
##
## Number of Fisher Scoring iterations: 5
df <- get("CH14TA01", envir = env)
names(df) <- c("x", "y", "fitted")
fit <- glm(y ~ x, data = df, family = binomial)
summary(fit) # the results are already supplied
##
## Call:
## glm(formula = y ~ x, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.899 -0.751 -0.414 0.799 1.962
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.060 1.259 -2.43 0.015 *
## x 0.162 0.065 2.49 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 34.296 on 24 degrees of freedom
## Residual deviance: 25.425 on 23 degrees of freedom
## AIC: 29.42
##
## Number of Fisher Scoring iterations: 4
cat('z* =', coef(fit)[2] / sqrt(diag(vcov(fit))[2]))
cat('z(0.95) =', qnorm(0.95))
cat('p-value =', 1-pnorm(coef(fit)[2] / sqrt(diag(vcov(fit))[2]))) # Input 'z*'
## z* = 2.485
## z(0.95) = 1.645
## p-value = 0.006475
confint.default(fit)[2, ] # this result is based on asymptotic normality
exp(5 * coef(fit)[2]) # Point estimate for 5 months of training
exp(5 * confint.default(fit)[2, ]) # Confidence limits for 5 months of training
(exp(5 * confint.default(fit)[2, ]) - 1) * 100 # Percentage change form
## 2.5 % 97.5 %
## 0.03413 0.28884
## x
## 2.242
## 2.5 % 97.5 %
## 1.186 4.239
## 2.5 % 97.5 %
## 18.61 323.86
Here we're going to do a quick bootstrapping example to check if bootstrapping on our n=25 data set is consistent with large sample theory. If so, then using the confint.default instead of confint above is justified. Otherwise, the interval estimates would have wider limits than generated earlier.
Note, not every glm in the bootstrapping results in convergence, giving rise to warnings that are suppressed in this output.
library(boot)
boot.beta <- function(data, indices) {coef(glm(y ~ x, binomial, data = data[indices, ]))[2]}
z <- boot(df, boot.beta, R = 1000)
boot.ci(z, type = c("perc", "bca"))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = z, type = c("perc", "bca"))
##
## Intervals :
## Level Percentile BCa
## 95% ( 0.0610, 0.4954 ) ( 0.0381, 0.3492 )
## Calculations and Intervals on Original Scale
## Some BCa intervals may be unstable
df <- get("CH14TA03", envir = env)
names(df) <- c('id', 'x1', 'x2', 'x3', 'x4', 'y')