Download Dataset

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
df <- read_csv('classification-output-data.csv')
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 11
##   pregnant glucose diastolic skinfold insulin   bmi pedigree   age class
##      <dbl>   <dbl>     <dbl>    <dbl>   <dbl> <dbl>    <dbl> <dbl> <dbl>
## 1        7     124        70       33     215  25.5    0.161    37     0
## 2        2     122        76       27     200  35.9    0.483    26     0
## 3        3     107        62       13      48  22.9    0.678    23     1
## 4        1      91        64       24       0  29.2    0.192    21     0
## 5        4      83        86       19       0  29.3    0.317    34     0
## 6        1     100        74       12      46  19.5    0.149    28     0
## # … with 2 more variables: scored.class <dbl>, scored.probability <dbl>

Create Confusion Matrix

cm <- table(df$class, df$scored.class)
cm
##    
##       0   1
##   0 119   5
##   1  30  27
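
table(df$class, df$scored.class) puts the actual class on the rows and the predicted class on the columns, so the four cells can be read off by [row, column] position. The metric functions below index cm this way; a quick sketch of the mapping (the tn/fp/fn/tp names are mine, for illustration only):

# rows = actual class, columns = predicted class
tn <- cm[1, 1]  # actual 0, predicted 0 -> 119
fp <- cm[1, 2]  # actual 0, predicted 1 ->   5
fn <- cm[2, 1]  # actual 1, predicted 0 ->  30
tp <- cm[2, 2]  # actual 1, predicted 1 ->  27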

Accuracy Function

The accuracy function below returns the share of predictions the model got right, (TP + TN) / (TP + TN + FP + FN), which comes to roughly 81% on this dataset.

accuracy_f <- function(tp, tn, fp, fn) {
  # share of all predictions that are correct
  return( (tp + tn) / (tp + tn + fp + fn) )
}

accuracy <- accuracy_f(cm[2, 2], cm[1, 1], cm[1, 2], cm[2, 1])
accuracy
## [1] 0.8066298
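
As a quick cross-check (my own sketch, not part of the assignment), accuracy is simply the fraction of rows where the predicted class equals the actual class:

# should reproduce the accuracy computed above (0.8066298)
mean(df$scored.class == df$class)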

Classification Error

classification_error_f <- function(tp, tn, fp, fn) {
  # share of all predictions that are wrong (1 - accuracy)
  return( (fp + fn) / (tp + tn + fp + fn) )
}

classification_error <- classification_error_f(cm[2, 2], cm[1, 1], cm[1, 2], cm[2, 1])

classification_error
## [1] 0.1933702
classification_error + accuracy
## [1] 1

Precision

precision_f <- function(tp, fp) {
  # share of predicted positives that really are positive
  return( tp / (tp + fp) )
}

precision <- precision_f(cm[2, 2], cm[1, 2])
precision
## [1] 0.84375

Sensitivity

sensitivity_f <- function(tp, fn) {
  # share of actual positives the model catches (recall)
  return( tp / (tp + fn) )
}

sensitivity <- sensitivity_f(cm[2, 2], cm[2, 1])
sensitivity
## [1] 0.4736842

Specificity

specificity_f <- function(tn, fp) {
  # share of actual negatives the model catches
  return( tn / (tn + fp) )
}

specificity <- specificity_f(cm[1, 1], cm[1, 2])
specificity
## [1] 0.9596774

F1 Score

f1 <- (2 * precision * sensitivity) / (precision + sensitivity)
f1
## [1] 0.6067416
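
Equivalently (a quick algebraic aside of mine), substituting p = TP/(TP+FP) and s = TP/(TP+FN) into 2ps/(p+s) simplifies the F1 score to 2TP / (2TP + FP + FN), which can be checked straight from the counts:

# F1 from raw counts: 2*27 / (2*27 + 5 + 30) = 54/89
(2 * cm[2, 2]) / (2 * cm[2, 2] + cm[1, 2] + cm[2, 1])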

F1 Score Proof

Let p = precision and s = sensitivity, and define the weights

a = p / (p + s)
b = s / (p + s)

Both p and s are ratios of non-negative counts, so 0 <= p <= 1 and 0 <= s <= 1, and the weights satisfy a >= 0, b >= 0, and a + b = 1. Rewriting the F1 score:

F1 = (2 * p * s) / (p + s) = a*s + b*p

F1 is therefore a weighted average of p and s. A weighted average of two numbers in [0, 1] must itself lie in [0, 1] (and when p + s = 0 the score is conventionally taken to be 0, which also lies in the interval), so 0 <= F1 Score <= 1.
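
A numerical sanity check of the bound (a brute-force sketch over a grid of p and s values, not part of the original write-up):

# evaluate F1 over a grid of precision/sensitivity values in (0, 1]
grid <- expand.grid(p = seq(0.01, 1, by = 0.01),
                    s = seq(0.01, 1, by = 0.01))
f1_vals <- with(grid, (2 * p * s) / (p + s))
range(f1_vals)  # both endpoints stay inside [0, 1]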

ROC Curve

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# use the predicted probabilities (not the hard 0/1 labels) so the curve sweeps all thresholds
roc <- roc(df$class, df$scored.probability, direction="<")
## Setting levels: control = 0, case = 1
plot(roc, col="yellow", lwd=3)
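
pROC also exposes the area under this curve; a minimal follow-up sketch using pROC's auc() on the roc object built above:

# area under the ROC curve
auc(roc)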

Caret

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
confusionMatrix(as.factor(df$scored.class), as.factor(df$class))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.9597          
##             Specificity : 0.4737          
##          Pos Pred Value : 0.7987          
##          Neg Pred Value : 0.8438          
##              Prevalence : 0.6851          
##          Detection Rate : 0.6575          
##    Detection Prevalence : 0.8232          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 0               
## 
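
Note that caret reports 'Positive' Class : 0, so its Sensitivity (0.9597) and Specificity (0.4737) are the mirror image of the hand-rolled functions above, which treat class 1 as positive. A hedged sketch: passing positive = "1" should line the two up:

# treat class 1 as the positive class so caret's Sensitivity and
# Specificity match sensitivity (0.4736842) and specificity (0.9596774)
confusionMatrix(as.factor(df$scored.class), as.factor(df$class), positive = "1")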
