An Example of a Logistic Model

library(openintro)

## Please visit openintro.org for free statistics materials

## 
## Attaching package: 'openintro'

## The following objects are masked from 'package:datasets':
## 
##     cars, trees

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Warning: package 'dplyr' was built under R version 3.4.2

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

library(broom)

Create a binary variable that marks counties in the state of Washington. Also keep only counties from Alabama or Washington to make the problem easier.

countyComplete %>% 
  filter(state %in% c("Washington","Alabama")) %>% 
  mutate(WA = state =="Washington") -> ccm

# Make sure it worked
table(ccm$WA)

## 
## FALSE  TRUE 
##    67    39

Build a Model

The model should identify counties in Washington based on characteristics in the countyComplete (ccm) dataframe.

WAMod = glm(WA ~ area + poverty + median_household_income,data=ccm,family=binomial)
summary(WAMod)

## 
## Call:
## glm(formula = WA ~ area + poverty + median_household_income, 
##     family = binomial, data = ccm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4225  -0.4632  -0.2236   0.2105   2.2696  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -4.670e+00  4.718e+00  -0.990    0.322    
## area                     3.428e-03  8.737e-04   3.923 8.74e-05 ***
## poverty                 -1.766e-01  1.427e-01  -1.237    0.216    
## median_household_income  8.484e-05  6.864e-05   1.236    0.216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139.462  on 105  degrees of freedom
## Residual deviance:  65.368  on 102  degrees of freedom
## AIC: 73.368
## 
## Number of Fisher Scoring iterations: 6

Check Performance

ProbWA = predict(WAMod,type="response")
PredWA = ProbWA > .5

# Create the confusion matrix
table(PredWA,ccm$WA)

##        
## PredWA  FALSE TRUE
##   FALSE    62    8
##   TRUE      5   31

# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate

## [1] 0.8773585

Of the 39 counties in Washington, 31 were classified correctly and 8 were incorrectly identified as Alabama counties. In the case of Alabama, 62 counties were identified correctly and 8 were classified as Washington counties.

The overall accuracy was about 88%.

Some Extras to Satisfy my Curiosity

Augment the Model

AugWA = augment(WAMod)

## Warning: Deprecated: please use `purrr::possibly()` instead

## Warning: Deprecated: please use `purrr::possibly()` instead

## Warning: Deprecated: please use `purrr::possibly()` instead

## Warning: Deprecated: please use `purrr::possibly()` instead

## Warning: Deprecated: please use `purrr::possibly()` instead

AugWA$state = ccm$state
AugWA$name = ccm$name
AugWA$Pred = PredWA
glimpse(AugWA)

## Observations: 106
## Variables: 14
## $ WA                      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ area                    <dbl> 594.44, 1589.78, 884.88, 622.58, 644.7...
## $ poverty                 <dbl> 10.6, 12.2, 25.0, 12.6, 13.4, 25.3, 25...
## $ median_household_income <dbl> 53255, 50147, 33219, 41770, 45549, 316...
## $ .fitted                 <dbl> 0.01400616, 2.87963665, -3.23291267, -...
## $ .se.fit                 <dbl> 0.5255341, 0.8829022, 0.8857175, 0.617...
## $ .resid                  <dbl> -1.1833637, -2.4225077, -0.2781527, -0...
## $ .hat                    <dbl> 0.06904316, 0.03926023, 0.02864931, 0....
## $ .sigma                  <dbl> 0.7951811, 0.7659812, 0.8040009, 0.801...
## $ .cooksd                 <dbl> 2.019688e-02, 1.893614e-01, 2.994098e-...
## $ .std.resid              <dbl> -1.2264602, -2.4715095, -0.2822249, -0...
## $ state                   <fctr> Alabama, Alabama, Alabama, Alabama, A...
## $ name                    <fctr> Autauga County, Baldwin County, Barbo...
## $ Pred                    <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE...

Check the Washington misses.

WaMiss = filter(AugWA,Pred==FALSE & state == "Washington") %>% select(name) 
WaMiss

##               name
## 1    Asotin County
## 2  Columbia County
## 3  Garfield County
## 4    Island County
## 5     Mason County
## 6   Pacific County
## 7  San Juan County
## 8 Wahkiakum County

Play with the threshold

PredWA = ProbWA > .8

# Create the confusion matrix
table(PredWA,ccm$WA)

##        
## PredWA  FALSE TRUE
##   FALSE    65   17
##   TRUE      2   22

# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate

## [1] 0.8207547

Create and Use a Function fAccrate

fAccrate = function(th){
  PredWA = ProbWA > th
  return(mean(PredWA == ccm$WA))
}
x = seq(from = .01,to=.99,by = .01)
Accrate = sapply(x,fAccrate)
Accrate

##  [1] 0.4433962 0.4811321 0.5188679 0.5471698 0.5943396 0.6132075 0.6603774
##  [8] 0.6886792 0.6981132 0.7264151 0.7735849 0.7924528 0.7924528 0.8018868
## [15] 0.8113208 0.8207547 0.8396226 0.8396226 0.8396226 0.8301887 0.8301887
## [22] 0.8396226 0.8490566 0.8584906 0.8584906 0.8490566 0.8490566 0.8679245
## [29] 0.8679245 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906
## [36] 0.8584906 0.8584906 0.8490566 0.8490566 0.8490566 0.8584906 0.8584906
## [43] 0.8584906 0.8584906 0.8773585 0.8773585 0.8773585 0.8773585 0.8679245
## [50] 0.8773585 0.8867925 0.8773585 0.8773585 0.8584906 0.8490566 0.8584906
## [57] 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8490566 0.8396226
## [64] 0.8396226 0.8396226 0.8396226 0.8490566 0.8490566 0.8490566 0.8490566
## [71] 0.8490566 0.8396226 0.8301887 0.8301887 0.8301887 0.8301887 0.8207547
## [78] 0.8207547 0.8207547 0.8207547 0.8207547 0.8113208 0.8113208 0.8113208
## [85] 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8018868
## [92] 0.8018868 0.7924528 0.7924528 0.7924528 0.7641509 0.7641509 0.7358491
## [99] 0.7075472

result = data.frame(x,Accrate)
result %>% ggplot(aes(x,Accrate)) + geom_point()

winner = which(Accrate == max(Accrate))
winner

## [1] 51

x[winner]

## [1] 0.51

Notes 11-7