#Homework 2
Library output:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Overview In this homework assignment, you will work through various classification metrics. You will be asked to create functions in R to carry out the various calculations. You will also investigate some functions in packages that will let you obtain the equivalent results. Finally, you will create graphical output that also can be used to evaluate the output of classification models, such as binary logistic regression.
#Question 1 Download the classification output data set.
data <- read.csv("https://raw.githubusercontent.com/Wilchau/Data_621_Hw_2/main/classification-output-data.csv")
head(data, 10)
#Question 2 Observation: Looking through this data set, there are three key columns to focus on:

a) class: the actual class for the observation
b) scored.class: the predicted class for the observation (based on a threshold of 0.5)
c) scored.probability: the predicted probability of success for the observation
data %>%
select(scored.class, class) %>%
table()
## class
## scored.class 0 1
## 0 119 30
## 1 5 27
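In the table above, the rows are the predicted class (scored.class) and the columns are the actual class (class). If clearer labels are wanted, base table() can take named arguments, which become the dimension names of the table; a minimal sketch (not part of the original code):

table(predicted = data$scored.class, actual = data$class)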
#Question 3 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions. Accuracy = (TP+TN)/(TP+FP+TN+FN)
accuracy_predictions <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)  # true positives
  TN <- sum(x$class == 0 & x$scored.class == 0)  # true negatives
  round((TP + TN)/nrow(x), 4)
}
accuracy_predictions(data)
## [1] 0.8066
#Question 4 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions. Classification Error Rate = (FP+FN)/(TP+FP+TN+FN)
class_error_rate <- function(x){
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((FP + FN)/nrow(x), 4)
}
class_error_rate(data)
## [1] 0.1934
#Question 5 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions. Precision = TP/(TP+FP)
precision <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TP/(TP + FP), 4)
}
precision(data)
## [1] 0.8438
#Question 6 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall. Sensitivity = TP/(TP+FN)
sensitivity <- function(x){
  # note: this user-defined function takes precedence over caret::sensitivity()
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round(TP/(TP + FN), 4)
}
sensitivity(data)
## [1] 0.4737
#Question 7 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions. Specificity = TN/(TN+FP)
specificity <- function(x){
  # note: this user-defined function takes precedence over caret::specificity()
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TN/(TN + FP), 4)
}
specificity(data)
## [1] 0.9597
#Question 8 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions. F1 Score = (2*Precision*Sensitivity)/(Precision+Sensitivity)
f1_score <- function(x){
  (2*precision(x)*sensitivity(x))/(precision(x) + sensitivity(x))
}
f1_score(data)
## [1] 0.6067675
#Question 9 Before we move on, let’s consider a question that was asked: What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1. From Question 8, the F1 score for this data set is 0.6067675, which lies between 0 and 1. More generally, precision and sensitivity each lie between 0 and 1, so precision*sensitivity <= precision and precision*sensitivity <= sensitivity. Adding these gives 2*precision*sensitivity <= precision + sensitivity, and therefore F1 = (2*precision*sensitivity)/(precision + sensitivity) <= 1. Since the numerator and denominator are both non-negative, F1 >= 0 as well, so the F1 score always falls between 0 and 1.
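As a quick numerical sanity check of this bound (not a proof), F1 can be evaluated over a grid of precision and sensitivity values; f1_from below is a hypothetical helper used only for this check:

f1_from <- function(p, s) (2 * p * s) / (p + s)        # hypothetical helper, not part of the assignment
grid <- expand.grid(p = seq(0.01, 1, by = 0.01), s = seq(0.01, 1, by = 0.01))
range(f1_from(grid$p, grid$s))                          # every value on the grid falls in (0, 1]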
#Question 10 Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
ROC <- function(x, y){
  # x: actual class labels (0/1); y: predicted probabilities
  x <- x[order(y, decreasing = TRUE)]   # sort classes by decreasing probability
  TPR <- cumsum(x) / sum(x)             # cumulative true positive rate
  FPR <- cumsum(!x) / sum(!x)           # cumulative false positive rate
  xy <- data.frame(TPR, FPR, x)
  FPR_df <- c(diff(xy$FPR), 0)
  TPR_df <- c(diff(xy$TPR), 0)
  # AUC: rectangles under the step curve plus triangle corrections
  AUC <- round(sum(xy$TPR * FPR_df) + sum(TPR_df * FPR_df)/2, 4)
  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  legend(.6, .4, AUC, title = "AUC")
  invisible(list(AUC = AUC))            # return the AUC invisibly; the curve is drawn as a side effect
}
ROC(data$class,data$scored.probability)
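The ROC() function above traces the curve by sorting the observations on their predicted probabilities. The threshold sweep recommended in the question (cutoffs from 0 to 1 in steps of 0.01) could be sketched as follows; roc_by_threshold is a hypothetical name used only for illustration, and the sketch assumes 0/1 class labels and probabilities in [0, 1]:

roc_by_threshold <- function(actual, prob, cutoffs = seq(0, 1, by = 0.01)){
  pts <- t(sapply(cutoffs, function(thr){
    pred <- as.integer(prob >= thr)                     # classify at this threshold
    c(TPR = sum(pred == 1 & actual == 1) / sum(actual == 1),
      FPR = sum(pred == 1 & actual == 0) / sum(actual == 0))
  }))
  pts <- as.data.frame(pts)
  pts <- pts[order(pts$FPR, pts$TPR), ]                 # order points along the FPR axis
  auc <- sum(diff(pts$FPR) * (head(pts$TPR, -1) + tail(pts$TPR, -1)) / 2)  # trapezoidal rule
  plot(pts$FPR, pts$TPR, type = "l", main = "ROC Curve (threshold sweep)",
       xlab = "False Positive Rate", ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  list(points = pts, AUC = round(auc, 4))
}
sweep_roc <- roc_by_threshold(data$class, data$scored.probability)
sweep_roc$AUC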
#Question 11 Use your created R functions and the provided classification output data set to produce all of the classification metrics discussed above.
metrics <- c(accuracy_predictions(data), class_error_rate(data), precision(data), sensitivity(data), specificity(data), f1_score(data))
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
kable(metrics, col.names = "Metrics")
|  | Metrics |
|---|---|
| Accuracy | 0.8066000 |
| Classification Error Rate | 0.1934000 |
| Precision | 0.8438000 |
| Sensitivity | 0.4737000 |
| Specificity | 0.9597000 |
| F1 Score | 0.6067675 |
#Question 12 Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?
b <- data %>%
  select(scored.class, class) %>%
  mutate(scored.class = as.factor(scored.class),
         class = as.factor(class))
cm <- confusionMatrix(b$scored.class, b$class, positive = "1")   # caret confusion matrix, "1" = positive class
caret_package <- c(cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"])
written_function <- c(accuracy_predictions(data), sensitivity(data), specificity(data))
d <- cbind(caret_package, written_function)
kable(d)
|  | caret_package | written_function |
|---|---|---|
| Accuracy | 0.8066298 | 0.8066 |
| Sensitivity | 0.4736842 | 0.4737 |
| Specificity | 0.9596774 | 0.9597 |
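The question also points to caret's own sensitivity() and specificity() functions. Because the hand-written sensitivity() and specificity() defined earlier take precedence over the caret versions in this session, the package functions need the caret:: prefix; a minimal sketch using the factor columns in b created above:

caret::sensitivity(b$scored.class, b$class, positive = "1")    # should match the Sensitivity row above
caret::specificity(b$scored.class, b$class, negative = "0")    # should match the Specificity row above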
#Question 13 Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
par(mfrow = c(1, 2))
plot(roc(data$class, data$scored.probability), print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC(data$class,data$scored.probability)
Comparing the two plots, the ROC curve from pROC and the curve drawn by the hand-written ROC() function look very similar, and the AUC printed by pROC is close to the AUC shown in the legend of the custom plot.
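For a numeric rather than purely visual comparison, the AUC computed by pROC can be extracted directly from the roc object and set beside the value reported by ROC() above; a minimal sketch:

roc_obj <- roc(data$class, data$scored.probability)    # pROC object (prints the same messages as above)
auc(roc_obj)                                           # pROC's AUC estimate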