library(ggplot2)
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
table(mpg$class)
## 
##    2seater    compact    midsize    minivan     pickup subcompact 
##          5         47         41         11         33         35 
##        suv 
##         62
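
Before fitting anything, note the base rate: suvs make up 62 of the 234 rows, so a trivial classifier that always predicts "not suv" would already be right about 73.5% of the time. Any model we build should beat that.

mean(mpg$class == "suv")   # base rate of the positive class, 62/234 ~ 0.265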

We can build a classifier to identify vehicles of class suv. Everything is easier if we first create a 0/1 indicator variable, where 1 marks the positive cases.

mpg$suv = ifelse(mpg$class == "suv",1,0)  # 1 = suv, 0 = everything else
fit.suv = glm(suv~cyl+hwy,data=mpg,family=binomial)
summary(fit.suv)
## 
## Call:
## glm(formula = suv ~ cyl + hwy, family = binomial, data = mpg)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0776  -0.5230  -0.3120   0.6884   2.3302  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.71366    2.17866   3.541 0.000399 ***
## cyl         -0.17258    0.17225  -1.002 0.316397    
## hwy         -0.35813    0.06156  -5.818 5.97e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 270.59  on 233  degrees of freedom
## Residual deviance: 184.92  on 231  degrees of freedom
## AIC: 190.92
## 
## Number of Fisher Scoring iterations: 6
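
The coefficients are on the log-odds scale. Exponentiating them turns them into odds ratios, which are easier to read: for example, exp(-0.358) is about 0.70, so each additional highway mpg multiplies the odds of being an suv by roughly 0.70.

exp(coef(fit.suv))   # coefficients as odds ratios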

Let’s look at our results. With type = "response", predict() returns fitted probabilities rather than the log-odds the model works with internally.

pred.suv = predict(fit.suv,type="response")
summary(pred.suv)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0001609 0.0479600 0.1275000 0.2650000 0.4717000 0.8845000
hist(pred.suv)
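
To see where these probabilities come from: calling predict() without type = "response" returns the linear predictor on the log-odds scale, and applying the logistic function plogis() to it recovers the same probabilities. A quick sanity check:

lin.pred = predict(fit.suv)            # log-odds (link scale)
all.equal(plogis(lin.pred), pred.suv)  # should be TRUE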

We need to convert the probabilities into decisions in a separate step by choosing a threshold. The usual choice is .5. However, if one kind of misclassification has more serious consequences than the other, we may choose a different threshold; a sketch of the trade-off follows the accuracy computation below.

pred=ifelse(pred.suv>.5,1,0)  # apply the .5 threshold
res=table(mpg$suv,pred)
res
##    pred
##       0   1
##   0 146  26
##   1  31  31
accuracy = sum(diag(res))/sum(res)
accuracy
## [1] 0.7564103
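
As promised above, here is a minimal sketch of how the error rates move as the threshold varies (the grid of cutoffs is arbitrary). Sensitivity is the fraction of true suvs we catch; specificity is the fraction of non-suvs we correctly leave alone. Lowering the threshold raises sensitivity at the cost of specificity.

for (t in c(.3,.4,.5,.6,.7)) {
  pred.t = factor(ifelse(pred.suv > t, 1, 0), levels = 0:1)  # keep both columns even if one is empty
  tab = table(mpg$suv, pred.t)
  cat("threshold", t,
      "sensitivity", round(tab[2,2]/sum(tab[2,]), 3),
      "specificity", round(tab[1,1]/sum(tab[1,]), 3), "\n")
}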

Exercise. Use the iris dataset included in base R.

str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Build a classification model to identify specimens with Species = “setosa” based on one of the four numerical variables.

iris$setosa = ifelse(iris$Species == "setosa",1,0)
fit.set = glm(setosa ~ Sepal.Length,data=iris,family=binomial)
pred.res = predict(fit.set,type="response")
res = ifelse(pred.res > .5,1,0)
conf = table(iris$setosa,res)
conf
##    res
##      0  1
##   0 94  6
##   1 10 40
accuracy = sum(diag(conf))/sum(conf)
accuracy
## [1] 0.8933333
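
Sepal.Length is actually the weakest of the four predictors for this task. Petal.Length, for instance, separates setosa completely from the other two species, so a model built on it classifies every specimen correctly; the price is that glm() warns that fitted probabilities numerically 0 or 1 occurred, the classic symptom of complete separation. A sketch (fit.sep is just an illustrative name):

fit.sep = glm(setosa ~ Petal.Length, data = iris, family = binomial)  # expect a separation warning
table(iris$setosa, ifelse(predict(fit.sep, type = "response") > .5, 1, 0))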