library(openintro)
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following objects are masked from 'package:datasets':
##
## cars, trees
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(broom)
countyComplete %>%
filter(state %in% c("Washington","Alabama")) %>%
mutate(WA = state =="Washington") -> ccm
# Make sure it worked
table(ccm$WA)
##
## FALSE TRUE
## 67 39
The model should identify counties in Washington based on characteristics in the countyComplete (ccm) dataframe.
WAMod = glm(WA ~ area + poverty + median_household_income,data=ccm,family=binomial)
summary(WAMod)
##
## Call:
## glm(formula = WA ~ area + poverty + median_household_income,
## family = binomial, data = ccm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4225 -0.4632 -0.2236 0.2105 2.2696
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.670e+00 4.718e+00 -0.990 0.322
## area 3.428e-03 8.737e-04 3.923 8.74e-05 ***
## poverty -1.766e-01 1.427e-01 -1.237 0.216
## median_household_income 8.484e-05 6.864e-05 1.236 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139.462 on 105 degrees of freedom
## Residual deviance: 65.368 on 102 degrees of freedom
## AIC: 73.368
##
## Number of Fisher Scoring iterations: 6
ProbWA = predict(WAMod,type="response")
PredWA = ProbWA > .5
# Create the confusion matrix
table(PredWA,ccm$WA)
##
## PredWA FALSE TRUE
## FALSE 62 8
## TRUE 5 31
# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate
## [1] 0.8773585
Of the 39 counties in Washington, 31 were classified correctly and 8 were incorrectly identified as Alabama counties. In the case of Alabama, 62 counties were identified correctly and 8 were classified as Washington counties.
The overall accuracy was about 88%.
AugWA = augment(WAMod)
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
AugWA$state = ccm$state
AugWA$name = ccm$name
AugWA$Pred = PredWA
glimpse(AugWA)
## Observations: 106
## Variables: 14
## $ WA <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.7...
## $ poverty <dbl> 10.6, 12.2, 25.0, 12.6, 13.4, 25.3, 25...
## $ median_household_income <dbl> 53255, 50147, 33219, 41770, 45549, 316...
## $ .fitted <dbl> 0.01400616, 2.87963665, -3.23291267, -...
## $ .se.fit <dbl> 0.5255341, 0.8829022, 0.8857175, 0.617...
## $ .resid <dbl> -1.1833637, -2.4225077, -0.2781527, -0...
## $ .hat <dbl> 0.06904316, 0.03926023, 0.02864931, 0....
## $ .sigma <dbl> 0.7951811, 0.7659812, 0.8040009, 0.801...
## $ .cooksd <dbl> 2.019688e-02, 1.893614e-01, 2.994098e-...
## $ .std.resid <dbl> -1.2264602, -2.4715095, -0.2822249, -0...
## $ state <fctr> Alabama, Alabama, Alabama, Alabama, A...
## $ name <fctr> Autauga County, Baldwin County, Barbo...
## $ Pred <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE...
WaMiss = filter(AugWA,Pred==FALSE & state == "Washington") %>% select(name)
WaMiss
## name
## 1 Asotin County
## 2 Columbia County
## 3 Garfield County
## 4 Island County
## 5 Mason County
## 6 Pacific County
## 7 San Juan County
## 8 Wahkiakum County
PredWA = ProbWA > .8
# Create the confusion matrix
table(PredWA,ccm$WA)
##
## PredWA FALSE TRUE
## FALSE 65 17
## TRUE 2 22
# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate
## [1] 0.8207547
fAccrate = function(th){
PredWA = ProbWA > th
return(mean(PredWA == ccm$WA))
}
x = seq(from = .01,to=.99,by = .01)
Accrate = sapply(x,fAccrate)
Accrate
## [1] 0.4433962 0.4811321 0.5188679 0.5471698 0.5943396 0.6132075 0.6603774
## [8] 0.6886792 0.6981132 0.7264151 0.7735849 0.7924528 0.7924528 0.8018868
## [15] 0.8113208 0.8207547 0.8396226 0.8396226 0.8396226 0.8301887 0.8301887
## [22] 0.8396226 0.8490566 0.8584906 0.8584906 0.8490566 0.8490566 0.8679245
## [29] 0.8679245 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906
## [36] 0.8584906 0.8584906 0.8490566 0.8490566 0.8490566 0.8584906 0.8584906
## [43] 0.8584906 0.8584906 0.8773585 0.8773585 0.8773585 0.8773585 0.8679245
## [50] 0.8773585 0.8867925 0.8773585 0.8773585 0.8584906 0.8490566 0.8584906
## [57] 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8490566 0.8396226
## [64] 0.8396226 0.8396226 0.8396226 0.8490566 0.8490566 0.8490566 0.8490566
## [71] 0.8490566 0.8396226 0.8301887 0.8301887 0.8301887 0.8301887 0.8207547
## [78] 0.8207547 0.8207547 0.8207547 0.8207547 0.8113208 0.8113208 0.8113208
## [85] 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8018868
## [92] 0.8018868 0.7924528 0.7924528 0.7924528 0.7641509 0.7641509 0.7358491
## [99] 0.7075472
result = data.frame(x,Accrate)
result %>% ggplot(aes(x,Accrate)) + geom_point()
winner = which(Accrate == max(Accrate))
winner
## [1] 51
x[winner]
## [1] 0.51