library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
df <- read_csv('classification-output-data.csv')
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 11
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## # … with 2 more variables: scored.class <dbl>, scored.probability <dbl>
cm <- table(df$class, df$scored.class)  # rows = actual class, columns = predicted class
cm
##
## 0 1
## 0 119 5
## 1 30 27
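R stores a table in column-major order, so with the actual class on the rows and the predicted class on the columns, cm[1] = TN = 119, cm[2] = FN = 30, cm[3] = FP = 5, and cm[4] = TP = 27. A less error-prone alternative (a sketch, not used in the calls below) is to pull the cells out by their dimnames:

# Index the confusion matrix by name rather than linear position;
# rows are the actual class, columns are the predicted class
tn <- cm["0", "0"]   # 119
fp <- cm["0", "1"]   #   5
fn <- cm["1", "0"]   #  30
tp <- cm["1", "1"]   #  27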
The accuracy function below computes accuracy from the confusion-matrix counts and returns roughly 81%:
accuracy_f <- function(tp, tn, fp, fn) {
return( ( (tp+tn) / (tp+tn+fp+fn) ) )
}
accuracy <- accuracy_f(cm[4], cm[1], cm[3], cm[2])  # tp = 27, tn = 119, fp = 5, fn = 30
accuracy
## [1] 0.8066298
classification_error_f <- function(tp, tn, fp, fn) {
  return( (fp+fn) / (tp+tn+fp+fn) )
}
classification_error <- classification_error_f(cm[4], cm[1], cm[3], cm[2])
classification_error
## [1] 0.1933702
classification_error + accuracy
## [1] 1
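As a quick sanity check, accuracy can also be computed without the confusion matrix, as the share of rows where the prediction matches the label (this assumes scored.class holds the hard 0/1 predictions, as shown in head(df) above):

# Proportion of correctly classified rows; matches accuracy_f()
mean(df$class == df$scored.class)
## [1] 0.8066298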
precision_f <- function(tp, fp) {
  return( tp / (tp+fp) )
}
precision <- precision_f(cm[4], cm[3])
precision
## [1] 0.84375
sensitivity_f <- function(tp, fn) {
  return( tp / (tp+fn) )
}
sensitivity <- sensitivity_f(cm[4], cm[2])
sensitivity
## [1] 0.4736842
specificity_f <- function(tn, fp) {
  return( tn / (tn+fp) )
}
specificity <- specificity_f(cm[1], cm[3])
specificity
## [1] 0.9596774
f1 <- (2 * precision * sensitivity) / (precision + sensitivity)
f1
## [1] 0.6067416
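The same value follows directly from the confusion-matrix counts, since algebraically $F_1 = \frac{2TP}{2TP + FP + FN}$; a small cross-check:

# F1 from raw counts: 2*27 / (2*27 + 5 + 30) = 54/89
(2 * cm[4]) / (2 * cm[4] + cm[3] + cm[2])
## [1] 0.6067416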
To show that the F1 score is bounded by 0 and 1, write $F_1 = \frac{2PS}{P + S}$, where $P$ is precision and $S$ is sensitivity (assuming $P + S > 0$), and define

$$a = \frac{P}{P + S}, \qquad b = \frac{S}{P + S}.$$

Since $0 \le P \le 1$ and $0 \le S \le 1$, both $a$ and $b$ are non-negative and $a + b = 1$. Then

$$F_1 = \frac{2PS}{P + S} = Sa + Pb \le a + b = 1,$$

and $F_1 \ge 0$ because every term is non-negative, so $0 \le F_1 \le 1$.
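As an empirical spot-check of the bound (a sketch on simulated values, not the assignment data), we can draw random precision/sensitivity pairs and confirm every resulting F1 score lands inside [0, 1]:

set.seed(123)
p <- runif(1000)                  # random precision values in (0, 1)
s <- runif(1000)                  # random sensitivity values in (0, 1)
f1_sim <- (2 * p * s) / (p + s)   # harmonic-mean formula for F1
range(f1_sim)                     # both endpoints fall inside [0, 1]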
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc <- roc(df$class, df$scored.probability, direction="<")  # the ROC curve needs the predicted probabilities, not the hard 0/1 classes
## Setting levels: control = 0, case = 1
plot(roc, col="yellow", lwd=3)
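pROC can also summarize the curve by its area: calling auc() on the fitted roc object returns the AUC, and plot.roc accepts print.auc = TRUE to annotate the plot with it.

# Area under the fitted ROC curve
auc(roc)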
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
confusionMatrix(as.factor(df$scored.class), as.factor(df$class))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
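Note that caret chose 0 as the positive class, so its Sensitivity (0.9597) and Specificity (0.4737) are the mirror image of the hand-rolled class-1 metrics above, and its Neg Pred Value (0.8438) corresponds to our precision. Passing the positive argument makes class 1 the positive class so the two sets of numbers line up directly:

# Treat class 1 as the positive class so Sensitivity/Specificity/PPV
# match the hand-rolled functions above
confusionMatrix(as.factor(df$scored.class), as.factor(df$class), positive = "1")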