Logistic Regression
#response: binomial
#binary (0|1, good/bad)
#independent
#same probability of "success" for all trials
#odds = probability of success / probability of failure = # of successes / # of failures
#if p is the probability of success, then the odds of success are p / (1-p)
#log odds (natural log) = log(p / (1-p)) = logit(p)
#log(p/(1-p)) = B0 + B1x + ... + Bkxk
#use maximum likelihood to estimate betas using data x1, ..., xk and y
data(orings) #cause of Challenger explosion
shuttlemod = glm(cbind(damage, 6-damage) ~ temp, family = binomial, data = orings)
summary(shuttlemod)
##
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial,
## data = orings)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9529 -0.7345 -0.4393 -0.2079 1.9565
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.66299 3.29626 3.538 0.000403 ***
## temp -0.21623 0.05318 -4.066 4.78e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.898 on 22 degrees of freedom
## Residual deviance: 16.912 on 21 degrees of freedom
## AIC: 33.675
##
## Number of Fisher Scoring iterations: 6
#every 1 unit increase in x is associated with a B1 unit increase in the log odds
#every 1 unit increase in x is associated with a multiplicative change in the odds by a factor of e^B1
plot(damage/6 ~ jitter(temp), data = orings, ylim = c(0,1), xlab = "Temperature at Launch", ylab = "Probability of O-Ring Damage")
xs = seq(25, 85, 1)
ys = ilogit(11.663 - 0.2162*xs)
lines(xs,ys)
#when regression coefficient is positive, the probability of "success" increase as your predictor increases
#Logistic regression exercises https://irsaatumn.github.io/RWorkshop18/logistic-regression-in-r.html#model-basics
#Boundry Water Blowdown
blowdown <- read.csv("http://www.cknudson.com/data/blowdown.csv")
blowdown.mod = glm(y ~ D, family = binomial, data = blowdown)
summary(blowdown.mod)
##
## Call:
## glm(formula = y ~ D, family = binomial, data = blowdown)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6309 -0.9616 -0.7211 1.1495 1.7172
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.702112 0.082798 -20.56 <2e-16 ***
## D 0.097558 0.004846 20.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5057.9 on 3665 degrees of freedom
## Residual deviance: 4555.6 on 3664 degrees of freedom
## AIC: 4559.6
##
## Number of Fisher Scoring iterations: 4
#regression equation: y = 0.097558*D - 1.702112
#For every 1 increase in diamater, the log odds of death increase by 0.097558. Thicker trees have a higher chance of dying.
0.097558*20 - 1.702112 #log odds
## [1] 0.249048
#The log odds of a 20 cm tree dying are 0.249048
exp(0.249048) #odds
## [1] 1.282804
exp(0.249048) / (1 + exp(0.249048))
## [1] 0.5619422
#The probability of a 20 cm tree dying is about 0.5619422
#because our p-value is < 0.05, we have evidence that the log odds of tree diameter have a significant linear relationship with tree death
#Beer
beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")
beer.mod = glm(Good ~ ABV, family = binomial, data = beer)
summary(beer.mod)
##
## Call:
## glm(formula = Good ~ ABV, family = binomial, data = beer)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3847 -0.8206 -0.4663 0.7230 2.1320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.7881 3.3959 -2.882 0.00395 **
## ABV 1.4662 0.5481 2.675 0.00747 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 51.564 on 43 degrees of freedom
## Residual deviance: 42.108 on 42 degrees of freedom
## AIC: 46.108
##
## Number of Fisher Scoring iterations: 5
#For every 1 increase in ABV, the log odds of the beer being classified as good increases by 1.4662
#For a beer with an ABV of 5.3
1.4662*5.3 - 9.7881 #log odds
## [1] -2.01724
exp(-2.01724) / (1 + exp(-2.01724))
## [1] 0.1174047
#The probability of a beer with an ABV of 5.3 having a score of at least 90 is 0.1174047
#State Colors
election <- read.csv("https://www.macalester.edu/~ajohns24/data/IMAdata1.csv")
election$Red <- as.numeric(election$StateColor == "red")
election.mod = glm(Red ~ per_capita_income, family = binomial, data = election)
summary(election.mod)
##
## Call:
## glm(formula = Red ~ per_capita_income, family = binomial, data = election)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.617 -1.146 -0.753 1.157 2.073
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.649e+00 1.722e-01 9.575 <2e-16 ***
## per_capita_income -7.339e-05 7.214e-06 -10.173 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4352.6 on 3142 degrees of freedom
## Residual deviance: 4237.1 on 3141 degrees of freedom
## AIC: 4241.1
##
## Number of Fisher Scoring iterations: 4