library(openintro)
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following objects are masked from 'package:datasets':
##
## cars, trees
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(broom)
countyComplete %>%
filter(state %in% c("Washington","Alabama")) %>%
mutate(WA = state =="Washington") -> ccm
# Make sure it worked
table(ccm$WA)
##
## FALSE TRUE
## 67 39
The model should identify counties in Washington based on characteristics in the countyComplete (ccm) dataframe.
WAMod = glm(WA ~ area + poverty + median_household_income,data=ccm,family=binomial)
summary(WAMod)
##
## Call:
## glm(formula = WA ~ area + poverty + median_household_income,
## family = binomial, data = ccm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4225 -0.4632 -0.2236 0.2105 2.2696
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.670e+00 4.718e+00 -0.990 0.322
## area 3.428e-03 8.737e-04 3.923 8.74e-05 ***
## poverty -1.766e-01 1.427e-01 -1.237 0.216
## median_household_income 8.484e-05 6.864e-05 1.236 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139.462 on 105 degrees of freedom
## Residual deviance: 65.368 on 102 degrees of freedom
## AIC: 73.368
##
## Number of Fisher Scoring iterations: 6
ProbWA = predict(WAMod,type="response")
PredWA = ProbWA > .5
# Create the confusion matrix
table(PredWA,ccm$WA)
##
## PredWA FALSE TRUE
## FALSE 62 8
## TRUE 5 31
# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate
## [1] 0.8773585
Of the 39 counties in Washington, 31 were classified correctly and 8 were incorrectly identified as Alabama counties. In the case of Alabama, 62 counties were identified correctly and 8 were classified as Washington counties.
The overall accuracy was about 88%.
PredWA = ProbWA > .8
# Create the confusion matrix
table(PredWA,ccm$WA)
##
## PredWA FALSE TRUE
## FALSE 65 17
## TRUE 2 22
# Compute the overall accuracy rate
AccRate = mean(PredWA == ccm$WA)
AccRate
## [1] 0.8207547
fAccrate = function(th){
PredWA = ProbWA > th
return(mean(PredWA == ccm$WA))
}
x = seq(from = .01,to=.99,by = .01)
Accrate = sapply(x,fAccrate)
Accrate
## [1] 0.4433962 0.4811321 0.5188679 0.5471698 0.5943396 0.6132075 0.6603774
## [8] 0.6886792 0.6981132 0.7264151 0.7735849 0.7924528 0.7924528 0.8018868
## [15] 0.8113208 0.8207547 0.8396226 0.8396226 0.8396226 0.8301887 0.8301887
## [22] 0.8396226 0.8490566 0.8584906 0.8584906 0.8490566 0.8490566 0.8679245
## [29] 0.8679245 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906
## [36] 0.8584906 0.8584906 0.8490566 0.8490566 0.8490566 0.8584906 0.8584906
## [43] 0.8584906 0.8584906 0.8773585 0.8773585 0.8773585 0.8773585 0.8679245
## [50] 0.8773585 0.8867925 0.8773585 0.8773585 0.8584906 0.8490566 0.8584906
## [57] 0.8584906 0.8584906 0.8584906 0.8584906 0.8584906 0.8490566 0.8396226
## [64] 0.8396226 0.8396226 0.8396226 0.8490566 0.8490566 0.8490566 0.8490566
## [71] 0.8490566 0.8396226 0.8301887 0.8301887 0.8301887 0.8301887 0.8207547
## [78] 0.8207547 0.8207547 0.8207547 0.8207547 0.8113208 0.8113208 0.8113208
## [85] 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8113208 0.8018868
## [92] 0.8018868 0.7924528 0.7924528 0.7924528 0.7641509 0.7641509 0.7358491
## [99] 0.7075472
result = data.frame(x,Accrate)
result %>% ggplot(aes(x,Accrate)) + geom_point()
winner = which(Accrate == max(Accrate))
winner
## [1] 51
x[winner]
## [1] 0.51
We may have asymmetric concerns about false positives and false negatives.
If the probability that a plan is edible is .5, would you call the plan edible?
How do you compute the false positive and false negative rates?
Use the confusion matrix.
th = .5
PredWA = ProbWA > th
cmat = table(PredWA,ccm$WA)
cmat
##
## PredWA FALSE TRUE
## FALSE 62 8
## TRUE 5 31
cmat[1,1]
## [1] 62
cmat[1,2]
## [1] 8
cmat[2,1]
## [1] 5
cmat[2,2]
## [1] 31
FNR = cmat[1,2]/(cmat[1,1] + cmat[1,2])
FPR = cmat[2,1]/(cmat[2,1] + cmat[2,2])
FNR
## [1] 0.1142857
FPR
## [1] 0.1388889
We could type different threshold values into this code snippet and observe effects on the FPR and FNR. Or we could build functions and use them to show how the values of FPR and FNR respond to th.
Hint: That’s an exercise. Use my code for fAccrate as a template.