In this homework assignment, you will work through various classification metrics. You will be asked to write R functions that carry out the calculations by hand, and then investigate package functions that produce the equivalent results. Finally, you will create graphical output that can also be used to evaluate the output of classification models, such as binary logistic regression.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(zoo)
## Warning: package 'zoo' was built under R version 3.6.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read.csv("https://raw.githubusercontent.com/ekhahm/datascience/master/classification-output-data.csv")
head(data)
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## scored.class scored.probability
## 1 0 0.32845226
## 2 0 0.27319044
## 3 0 0.10966039
## 4 0 0.05599835
## 5 0 0.10049072
## 6 0 0.05515460
The columns of interest are class (the actual observed label), scored.class (the predicted label), and scored.probability (the model's predicted probability of the positive class). A raw confusion matrix of predicted versus actual classes:
data %>%
  select(scored.class, class) %>%
  table()
##             class
## scored.class   0   1
##            0 119  30
##            1   5  27
Here the rows are the predicted class and the columns the actual class, so the table contains 119 true negatives, 27 true positives, 30 false negatives, and 5 false positives.
Accuracy = \(\frac{TP+TN}{TP+FP+TN+FN}\)
accuracy <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)  # true positives
  TN <- sum(x$class == 0 & x$scored.class == 0)  # true negatives
  round((TP + TN)/nrow(x), 4)  # nrow(x) counts all four cells of the table
}
accuracy(data)
## [1] 0.8066
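As a check against the confusion matrix above: \((119 + 27)/181 \approx 0.8066\).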
Classification Error Rate = \(\frac{FP+FN}{TP+FP+TN+FN}\)
class_er <- function(x){
  FP <- sum(x$class == 0 & x$scored.class == 1)  # false positives
  FN <- sum(x$class == 1 & x$scored.class == 0)  # false negatives
  round((FP + FN)/nrow(x), 4)
}
class_er(data)
## [1] 0.1934
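Again from the table: \((30 + 5)/181 \approx 0.1934\). Note that accuracy and the classification error rate sum to one: \(0.8066 + 0.1934 = 1\).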
Precision = \(\frac{TP}{TP+FP}\)
precision <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TP/(TP + FP), 4)
}
precision(data)
## [1] 0.8438
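From the table: \(27/(27 + 5) = 27/32 \approx 0.8438\).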
Sensitivity = \(\frac{TP}{TP+FN}\)
sensitivity <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round(TP/(TP + FN), 4)
}
sensitivity(data)
## [1] 0.4737
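From the table: \(27/(27 + 30) = 27/57 \approx 0.4737\).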
Specificity = \(\frac{TN}{TN+FP}\)
specificity <- function(x){
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TN/(TN + FP), 4)
}
specificity(data)
## [1] 0.9597
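From the table: \(119/(119 + 5) = 119/124 \approx 0.9597\).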
F1 Score = \(\frac{2 \times Precision \times Sensitivity}{Precision + Sensitivity}\)
f1_score <- function(x){
  (2 * precision(x) * sensitivity(x)) / (precision(x) + sensitivity(x))
}
f1_score(data)
## [1] 0.6067675
Since \(0 < Precision \le 1\) and \(0 < Sensitivity \le 1\), and the F1 score is their harmonic mean, which can never exceed the larger of its two inputs, the F1 score must also lie between 0 and 1.
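As a quick numerical check, the sketch below (illustrative variable names) draws random precision/sensitivity pairs from \((0, 1)\) and confirms the resulting F1 scores never leave that interval:
# Sanity check: the harmonic mean of two values in (0, 1) stays in (0, 1)
set.seed(123)
p <- runif(10000)
s <- runif(10000)
f1 <- (2 * p * s) / (p + s)
range(f1)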
ROC <- function(x, y){
  # Sort the true labels by predicted probability, descending
  x <- x[order(y, decreasing = TRUE)]
  # Cumulative true and false positive rates as the threshold sweeps down
  TPR <- cumsum(x) / sum(x)
  FPR <- cumsum(!x) / sum(!x)
  xy <- data.frame(TPR, FPR, x)
  # Step widths and heights for the trapezoid-rule area under the curve
  FPR_df <- c(diff(xy$FPR), 0)
  TPR_df <- c(diff(xy$TPR), 0)
  AUC <- round(sum(xy$TPR * FPR_df) + sum(TPR_df * FPR_df)/2, 4)
  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)  # reference line for a no-skill classifier
  legend(.6, .4, AUC, title = "AUC")
}
ROC(data$class, data$scored.probability)
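As a cross-check on the AUC printed in the legend, pROC's auc() computes the trapezoid-rule area for the same labels and probabilities; the two values should agree closely, with any small gap attributable to how ties are handled:
# pROC's AUC estimate for the same data (prints its level/direction messages)
auc(roc(data$class, data$scored.probability))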
metrics <- c(accuracy(data), class_er(data), precision(data), sensitivity(data), specificity(data), f1_score(data))
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
metrics
## Accuracy Classification Error Rate
## 0.8066000 0.1934000
## Precision Sensitivity
## 0.8438000 0.4737000
## Specificity F1 Score
## 0.9597000 0.6067675
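Since knitr is already loaded, the same vector can also be rendered as a two-column table; a minimal sketch:
# Display the named metric vector as a table
kable(data.frame(Metric = names(metrics), Value = metrics), row.names = FALSE)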
b <- data %>%
  select(scored.class, class) %>%
  mutate(scored.class = as.factor(scored.class),
         class = as.factor(class))
cm <- confusionMatrix(b$scored.class, b$class, positive = "1")  # cm avoids masking base::c
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
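The individual statistics can also be pulled out of the confusionMatrix object, which makes it easy to confirm that caret matches the hand-written functions (up to the rounding they apply):
# Accuracy lives in $overall; per-class statistics live in $byClass
cm$overall[["Accuracy"]]
cm$byClass[c("Sensitivity", "Specificity", "Pos Pred Value")]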
par(mfrow = c(1, 2))
plot(roc(data$class, data$scored.probability), print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC(data$class, data$scored.probability)