library(tidyverse)
library(zoo)
library(knitr)
library(caret)
library(pROC)
In this homework assignment, you will work through various classification metrics. You will be asked to create functions in R to carry out the various calculations. You will also investigate some functions in packages that will let you obtain the equivalent results. Finally, you will create graphical output that also can be used to evaluate the output of classification models, such as binary logistic regression.
file <- "https://raw.githubusercontent.com/saayedalam/Data/master/classification-output-data.csv"
data <- read.csv(file)
head(data, 5)
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## scored.class scored.probability
## 1 0 0.32845226
## 2 0 0.27319044
## 3 0 0.10966039
## 4 0 0.05599835
## 5 0 0.10049072
# The rows represent the predicted class (scored.class) and the columns the actual class (class).
data %>%
  select(scored.class, class) %>%
  table()
## class
## scored.class 0 1
## 0 119 30
## 1 5 27
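All four confusion-matrix cell counts can also be pulled from the data in one pass. A minimal sketch (the helper name confusion_counts is ours, not part of the assignment); the individual metric functions below recompute these sums independently:

# Hypothetical helper: tabulate all four confusion-matrix cells at once
confusion_counts <- function(x){
  c(TP = sum(x$class == 1 & x$scored.class == 1),
    TN = sum(x$class == 0 & x$scored.class == 0),
    FP = sum(x$class == 0 & x$scored.class == 1),
    FN = sum(x$class == 1 & x$scored.class == 0))
}
confusion_counts(data)
##  TP  TN  FP  FN
##  27 119   5  30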
accuracy_predictions <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  round((TP + TN)/nrow(x), 4)
}
accuracy_predictions(data)
## [1] 0.8066
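For reference, the function implements the standard definition of accuracy in terms of the confusion-matrix cells:

$$\text{Accuracy} = \frac{TP + TN}{TP + FP + TN + FN}$$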
class_error_rate <- function(x){
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((FP + FN)/nrow(x), 4)
}
class_error_rate(data)
## [1] 0.1934
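The classification error rate is the complementary proportion of misclassified observations:

$$\text{Classification Error Rate} = \frac{FP + FN}{TP + FP + TN + FN}$$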
# Verify that the accuracy and the error rate sum to one
accuracy_predictions(data) + class_error_rate(data)
## [1] 1
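This identity holds by construction, since the four confusion-matrix cells partition the $n$ observations:

$$\text{Accuracy} + \text{Error Rate} = \frac{(TP + TN) + (FP + FN)}{n} = \frac{n}{n} = 1$$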
precision <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TP/(TP + FP), 4)
}
precision(data)
## [1] 0.8438
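Precision is the share of predicted positives that are actually positive:

$$\text{Precision} = \frac{TP}{TP + FP}$$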
# Note: defining sensitivity() here shadows caret::sensitivity() for this session
sensitivity <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round(TP/(TP + FN), 4)
}
sensitivity(data)
## [1] 0.4737
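Sensitivity (also called recall, or the true positive rate) is the share of actual positives the model identifies:

$$\text{Sensitivity} = \frac{TP}{TP + FN}$$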
# Note: defining specificity() here shadows caret::specificity() for this session
specificity <- function(x){
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TN/(TN + FP), 4)
}
specificity(data)
## [1] 0.9597
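Specificity (the true negative rate) is the share of actual negatives correctly identified:

$$\text{Specificity} = \frac{TN}{TN + FP}$$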
f1_score <- function(x){
  (2 * precision(x) * sensitivity(x)) / (precision(x) + sensitivity(x))
}
f1_score(data)
## [1] 0.6067675
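The F1 score is the harmonic mean of precision and sensitivity:

$$F_1 = \frac{2 \times \text{Precision} \times \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}}$$

Note that because f1_score() reuses the rounded outputs of precision() and sensitivity(), its last digits can differ slightly from an F1 computed on the raw counts.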
Before we move on, let’s consider a question that was asked: What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1.
Both the precision and the sensitivity used to calculate the F1 score are bounded between 0 and 1, and the F1 score is their harmonic mean, which always lies between the smaller and the larger of the two values. Therefore the F1 score must also lie between 0 and 1.
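Formally, writing $P$ for precision and $S$ for sensitivity with $0 < P, S \le 1$:

$$F_1 = \frac{2PS}{P + S} \le 1 \quad \text{since} \quad (P + S) - 2PS = P(1 - S) + S(1 - P) \ge 0,$$

and $F_1 > 0$ because both $P$ and $S$ are positive, so $0 < F_1 \le 1$.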
Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
ROC <- function(x, y){
  # Sort the true classes by predicted probability, descending
  x <- x[order(y, decreasing = TRUE)]
  # Cumulative true and false positive rates traced along the sorted data
  TPR <- cumsum(x) / sum(x)
  FPR <- cumsum(!x) / sum(!x)
  xy <- data.frame(TPR, FPR, x)
  FPR_df <- c(diff(xy$FPR), 0)
  TPR_df <- c(diff(xy$TPR), 0)
  # Trapezoidal approximation of the area under the curve
  AUC <- round(sum(xy$TPR * FPR_df) + sum(TPR_df * FPR_df)/2, 4)
  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  legend(.6, .4, AUC, title = "AUC")
  # Draw the plot as a side effect and return the curve and AUC as a list
  invisible(list(curve = xy, AUC = AUC))
}
ROC(data$class,data$scored.probability)
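The function above traces the curve by sorting the observations on their predicted probabilities. For comparison, here is a minimal sketch of the threshold-sweep approach recommended in the prompt (cutoffs from 0 to 1 in 0.01 steps); the name ROC_threshold and its internals are ours:

# Sketch of the threshold-sweep alternative: classify at each cutoff,
# then compute TPR and FPR at that cutoff
ROC_threshold <- function(class, prob){
  thresholds <- seq(0, 1, by = 0.01)
  TPR <- sapply(thresholds, function(t) sum(prob >= t & class == 1) / sum(class == 1))
  FPR <- sapply(thresholds, function(t) sum(prob >= t & class == 0) / sum(class == 0))
  # Trapezoidal AUC; the sign flips because FPR falls as the threshold rises
  AUC <- round(-sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2), 4)
  plot(FPR, TPR, type = "l", main = "ROC Curve (threshold sweep)",
       xlab = "False Positive Rate", ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  list(AUC = AUC)
}
ROC_threshold(data$class, data$scored.probability)

Both versions should produce essentially the same curve and area.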
metrics <- c(accuracy_predictions(data), class_error_rate(data), precision(data),
             sensitivity(data), specificity(data), f1_score(data))
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
kable(metrics, col.names = "Metrics")
| | Metrics |
|---|---|
| Accuracy | 0.8066000 |
| Classification Error Rate | 0.1934000 |
| Precision | 0.8438000 |
| Sensitivity | 0.4737000 |
| Specificity | 0.9597000 |
| F1 Score | 0.6067675 |
b <- data %>%
  select(scored.class, class) %>%
  mutate(scored.class = as.factor(scored.class),
         class = as.factor(class))
# Name the result cm rather than c, to avoid shadowing base::c()
cm <- confusionMatrix(b$scored.class, b$class, positive = "1")
caret_package <- c(cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"])
written_function <- c(accuracy_predictions(data), sensitivity(data), specificity(data))
d <- cbind(caret_package, written_function)
kable(d)
| | caret_package | written_function |
|---|---|---|
| Accuracy | 0.8066298 | 0.8066 |
| Sensitivity | 0.4736842 | 0.4737 |
| Specificity | 0.9596774 | 0.9597 |
# The results agree; our functions differ from caret only in rounding to four decimal places
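As an additional check, recent versions of caret can report precision/recall metrics from the same confusion matrix. A quick sketch, assuming the mode = "prec_recall" option of confusionMatrix() is available in the installed version (cm_pr is our name):

# Assumes caret's confusionMatrix() supports mode = "prec_recall",
# in which case byClass exposes Precision, Recall, and F1
cm_pr <- confusionMatrix(b$scored.class, b$class, positive = "1", mode = "prec_recall")
round(cm_pr$byClass[c("Precision", "Recall", "F1")], 4)

Up to rounding, these should agree with precision(data), sensitivity(data), and f1_score(data).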
par(mfrow = c(1, 2))
plot(roc(data$class, data$scored.probability), print.auc = TRUE)
ROC(data$class,data$scored.probability)
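Beyond the side-by-side plots, the two AUC estimates can be compared numerically. A minimal check using pROC's auc(), which takes the same response and predictor arguments as roc():

# pROC's AUC should be close to the trapezoidal AUC shown in our ROC() legend
round(as.numeric(auc(data$class, data$scored.probability)), 4)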