Logistic Regression

#response: binomial
  #binary (0|1, good/bad)
  #trials are independent
  #same probability of "success" for all trials

#odds = probability of success / probability of failure = # of successes / # of failures
  #if p is the probability of success, then the odds of success are p / (1-p)
#log odds (natural log) = log(p / (1-p)) = logit(p)
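#quick numeric check (made-up p, not from the notes): if p = 0.75, the odds are 3 and the log odds are log(3)
p = 0.75
p / (1 - p)      #odds: 3
log(p / (1 - p)) #log odds: ~1.0986
qlogis(p)        #base R's logit function; same value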

#log(p/(1-p)) = B0 + B1*x1 + ... + Bk*xk
  #use maximum likelihood to estimate betas using data x1, ..., xk and y
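#a minimal sketch of that idea on simulated data: pick the betas that maximize the binomial log likelihood
#(glm() below does this fitting for us via iteratively reweighted least squares / Fisher scoring)
set.seed(1)
x = rnorm(100)
y = rbinom(100, 1, plogis(0.5 + 1.2*x))
negloglik = function(b) -sum(dbinom(y, 1, plogis(b[1] + b[2]*x), log = TRUE))
optim(c(0, 0), negloglik)$par       #close to...
coef(glm(y ~ x, family = binomial)) #...what glm() estimates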

library(faraway) #for the orings data and ilogit()
data(orings) #o-ring damage data from the Challenger explosion investigation
shuttlemod = glm(cbind(damage, 6-damage) ~ temp, family = binomial, data = orings) #cbind(successes, failures): each launch carried 6 o-rings
summary(shuttlemod)
## 
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial, 
##     data = orings)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9529  -0.7345  -0.4393  -0.2079   1.9565  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 11.66299    3.29626   3.538 0.000403 ***
## temp        -0.21623    0.05318  -4.066 4.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.898  on 22  degrees of freedom
## Residual deviance: 16.912  on 21  degrees of freedom
## AIC: 33.675
## 
## Number of Fisher Scoring iterations: 6
#every 1 unit increase in x is associated with a B1 unit increase in the log odds
#every 1 unit increase in x is associated with a multiplicative change in the odds by a factor of e^B1
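#in R, exponentiating the coefficients gives these odds multipliers directly
exp(coef(shuttlemod)) #exp(-0.2162) ~ 0.81: each extra degree F multiplies the odds of damage by ~0.81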

plot(damage/6 ~ jitter(temp), data = orings, ylim = c(0,1), xlab = "Temperature at Launch", ylab = "Probability of O-Ring Damage")
xs = seq(25, 85, 1)
ys = ilogit(11.663 - 0.2162*xs)
lines(xs,ys)
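#equivalently (without retyping rounded coefficients), predict() returns the fitted probabilities
ys2 = predict(shuttlemod, newdata = data.frame(temp = xs), type = "response")
lines(xs, ys2, lty = 2)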

#when the regression coefficient is positive, the probability of "success" increases as the predictor increases
#Logistic regression exercises https://irsaatumn.github.io/RWorkshop18/logistic-regression-in-r.html#model-basics

#Boundary Waters Blowdown
blowdown <- read.csv("http://www.cknudson.com/data/blowdown.csv")
blowdown.mod = glm(y ~ D, family = binomial, data = blowdown)
summary(blowdown.mod)
## 
## Call:
## glm(formula = y ~ D, family = binomial, data = blowdown)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6309  -0.9616  -0.7211   1.1495   1.7172  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.702112   0.082798  -20.56   <2e-16 ***
## D            0.097558   0.004846   20.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5057.9  on 3665  degrees of freedom
## Residual deviance: 4555.6  on 3664  degrees of freedom
## AIC: 4559.6
## 
## Number of Fisher Scoring iterations: 4
#regression equation: log(p/(1-p)) = 0.097558*D - 1.702112, where p is the probability of death
#For every 1 cm increase in diameter, the log odds of death increase by 0.097558. Thicker trees have a higher chance of dying.
0.097558*20 - 1.702112 #log odds
## [1] 0.249048
#The log odds of a 20 cm tree dying are 0.249048
exp(0.249048) #odds
## [1] 1.282804
exp(0.249048) / (1 + exp(0.249048))
## [1] 0.5619422
#The probability of a 20 cm tree dying is about 0.5619422
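#sanity check with predict(), which uses the unrounded coefficients
predict(blowdown.mod, newdata = data.frame(D = 20))                    #log odds, ~0.249
predict(blowdown.mod, newdata = data.frame(D = 20), type = "response") #probability, ~0.562
exp(coef(blowdown.mod)["D"]) #~1.10: each extra cm of diameter multiplies the odds of death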
#because our p-value is < 0.05, we have evidence that the log odds of death have a significant linear relationship with tree diameter

#Beer
beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")
beer.mod = glm(Good ~ ABV, family = binomial, data = beer)
summary(beer.mod)
## 
## Call:
## glm(formula = Good ~ ABV, family = binomial, data = beer)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3847  -0.8206  -0.4663   0.7230   2.1320  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -9.7881     3.3959  -2.882  0.00395 **
## ABV           1.4662     0.5481   2.675  0.00747 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 51.564  on 43  degrees of freedom
## Residual deviance: 42.108  on 42  degrees of freedom
## AIC: 46.108
## 
## Number of Fisher Scoring iterations: 5
#For every 1 percentage-point increase in ABV, the log odds of the beer being classified as good increase by 1.4662
#For a beer with an ABV of 5.3:
1.4662*5.3 - 9.7881 #log odds
## [1] -2.01724
exp(-2.01724) / (1 + exp(-2.01724))
## [1] 0.1174047
#The probability of a beer with an ABV of 5.3 having a score of at least 90 is about 0.1174047
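#plogis() is base R's inverse logit, so the last step can be written as
plogis(1.4662*5.3 - 9.7881) #~0.1174
exp(coef(beer.mod)["ABV"])  #~4.33: odds multiplier per percentage point of ABV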

#State Colors
election <- read.csv("https://www.macalester.edu/~ajohns24/data/IMAdata1.csv")
election$Red <- as.numeric(election$StateColor == "red") #indicator: 1 if the state is red, 0 otherwise
election.mod = glm(Red ~ per_capita_income, family = binomial, data = election)
summary(election.mod)
## 
## Call:
## glm(formula = Red ~ per_capita_income, family = binomial, data = election)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.617  -1.146  -0.753   1.157   2.073  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.649e+00  1.722e-01   9.575   <2e-16 ***
## per_capita_income -7.339e-05  7.214e-06 -10.173   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4352.6  on 3142  degrees of freedom
## Residual deviance: 4237.1  on 3141  degrees of freedom
## AIC: 4241.1
## 
## Number of Fisher Scoring iterations: 4
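#For every $1 increase in per capita income, the log odds of the state being red decrease by 7.339e-05
#(equivalently, each extra $1000 of income multiplies the odds by exp(-0.07339) ~ 0.93)
#the negative, highly significant coefficient (p < 2e-16) says higher-income states are less likely to be red
#e.g. the fitted probability at a per capita income of $25,000 (an arbitrary illustrative value):
plogis(1.649 - 7.339e-05*25000) #~0.45
predict(election.mod, newdata = data.frame(per_capita_income = 25000), type = "response")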