library(openintro)
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following objects are masked from 'package:datasets':
##
## cars, trees
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(broom)
countyComplete %>%
filter(state %in% c("Washington","Alabama")) %>%
mutate(WA = state =="Washington") -> ccm
# Make sure it worked
table(ccm$WA)
##
## FALSE TRUE
## 67 39
The model should identify counties in Washington based on characteristics in the countyComplete (ccm) dataframe.
WAMod = glm(WA ~ area + poverty + median_household_income,data=ccm,family=binomial)
summary(WAMod)
##
## Call:
## glm(formula = WA ~ area + poverty + median_household_income,
## family = binomial, data = ccm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4225 -0.4632 -0.2236 0.2105 2.2696
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.670e+00 4.718e+00 -0.990 0.322
## area 3.428e-03 8.737e-04 3.923 8.74e-05 ***
## poverty -1.766e-01 1.427e-01 -1.237 0.216
## median_household_income 8.484e-05 6.864e-05 1.236 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139.462 on 105 degrees of freedom
## Residual deviance: 65.368 on 102 degrees of freedom
## AIC: 73.368
##
## Number of Fisher Scoring iterations: 6
ProbWA = predict(WAMod,type="response")
PredWA = ProbWA > .5
# Create the confusion matrix
table(PredWA,ccm$WA)
##
## PredWA FALSE TRUE
## FALSE 62 8
## TRUE 5 31
# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate
## [1] 0.8773585
Of the 39 counties in Washington, 31 were classified correctly and 8 were incorrectly identified as Alabama counties. In the case of Alabama, 62 counties were identified correctly and 8 were classified as Washington counties.
The overall accuracy was about 88%.
AugWA = augment(WAMod)
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
AugWA$state = ccm$state
AugWA$name = ccm$name
AugWA$Pred = PredWA
glimpse(AugWA)
## Observations: 106
## Variables: 14
## $ WA <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.7...
## $ poverty <dbl> 10.6, 12.2, 25.0, 12.6, 13.4, 25.3, 25...
## $ median_household_income <dbl> 53255, 50147, 33219, 41770, 45549, 316...
## $ .fitted <dbl> 0.01400616, 2.87963665, -3.23291267, -...
## $ .se.fit <dbl> 0.5255341, 0.8829022, 0.8857175, 0.617...
## $ .resid <dbl> -1.1833637, -2.4225077, -0.2781527, -0...
## $ .hat <dbl> 0.06904316, 0.03926023, 0.02864931, 0....
## $ .sigma <dbl> 0.7951811, 0.7659812, 0.8040009, 0.801...
## $ .cooksd <dbl> 2.019688e-02, 1.893614e-01, 2.994098e-...
## $ .std.resid <dbl> -1.2264602, -2.4715095, -0.2822249, -0...
## $ state <fctr> Alabama, Alabama, Alabama, Alabama, A...
## $ name <fctr> Autauga County, Baldwin County, Barbo...
## $ Pred <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE...
WaMiss = filter(AugWA,Pred==FALSE & state == "Washington") %>% select(name)
WaMiss
## name
## 1 Asotin County
## 2 Columbia County
## 3 Garfield County
## 4 Island County
## 5 Mason County
## 6 Pacific County
## 7 San Juan County
## 8 Wahkiakum County