library("dplyr")
library("ggplot2")
library("knitr")
df <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_621/main/HW2/classification-output-data.csv")
The data set has three key columns we will use (a quick structural check of these columns follows the list):

- `class`: the actual class for the observation
- `scored.class`: the predicted class for the observation (based on a threshold of 0.5)
- `scored.probability`: the predicted probability of success for the observation
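As an optional check before computing any metrics, base R's str() confirms the types of the three columns named above:
# Peek at the three columns used throughout this report
str(df[, c("class", "scored.class", "scored.probability")])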
Use the table() function to get the raw confusion matrix for this scored dataset. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?
df %>%
  select(class, scored.class) %>%
  mutate(
    class = recode(class,
                   '0' = 'Actual Negative',
                   '1' = 'Actual Positive'),
    scored.class = recode(scored.class,
                          '0' = 'Predicted Negative',
                          '1' = 'Predicted Positive')
  ) %>%
  table()
## scored.class
## class Predicted Negative Predicted Positive
## Actual Negative 119 5
## Actual Positive 30 27
The rows represent the actual class while the columns represent the predicted class.
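To make the hand checks in the following sections explicit, the optional sketch below pulls the four cell counts out of the raw table. The names TP, TN, FP, and FN are our own labels for the cells, not part of the assignment.
# Extract the four confusion-matrix cells, treating class 1 as the positive class
conf_tab <- table(Actual = df$class, Predicted = df$scored.class)
TP <- conf_tab["1", "1"]   # actual positive, predicted positive: 27
TN <- conf_tab["0", "0"]   # actual negative, predicted negative: 119
FP <- conf_tab["0", "1"]   # actual negative, predicted positive: 5
FN <- conf_tab["1", "0"]   # actual positive, predicted negative: 30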
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.
\(Accuracy = \frac{TP + TN}{TP + FP + TN + FN}\)
calc_accuracy <- function(df, actual_var_name, pred_var_name) {
  accuracy <- sum(df[actual_var_name] == df[pred_var_name]) / nrow(df)
  return(accuracy)
}
calc_accuracy(df,"class","scored.class")
## [1] 0.8066298
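As a quick hand check, the same value falls out of the confusion-matrix counts extracted above:
# (TP + TN) / total = (27 + 119) / 181 ≈ 0.8066298
(TP + TN) / (TP + TN + FP + FN)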
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
\(\text{Classification Error Rate} = \frac{FP + FN}{TP + FP + TN + FN}\)
calc_error <- function(df, actual_var_name, pred_var_name) {
  error <- sum(df[actual_var_name] != df[pred_var_name]) / nrow(df)
  return(error)
}
calc_error(df,"class","scored.class")
## [1] 0.1933702
Verify that you get an accuracy and an error rate that sum to one.
calc_accuracy(df,"class","scored.class") + calc_error(df,"class","scored.class")
## [1] 1
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.
\(Precision = \frac{TP}{TP + FP}\)
calc_precision <- function(df, actual_var_name, pred_var_name) {
  n_tp <- sum(df[actual_var_name] == 1 & df[pred_var_name] == 1)
  n_fp <- sum(df[actual_var_name] == 0 & df[pred_var_name] == 1)
  precision <- n_tp / (n_tp + n_fp)
  return(precision)
}
calc_precision(df,"class","scored.class")
## [1] 0.84375
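A hand check against the confusion-matrix counts gives the same value:
# TP / (TP + FP) = 27 / (27 + 5) = 0.84375
TP / (TP + FP)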
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.
\[ Sensitivity = \frac{TP}{TP + FN} \]
calc_sensitivity <- function(df, actual_var_name, pred_var_name) {
  n_tp <- sum(df[actual_var_name] == 1 & df[pred_var_name] == 1)
  n_fn <- sum(df[actual_var_name] == 1 & df[pred_var_name] == 0)
  sensitivity <- n_tp / (n_tp + n_fn)
  return(sensitivity)
}
calc_sensitivity(df,"class","scored.class")
## [1] 0.4736842
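Checking against the confusion-matrix counts:
# TP / (TP + FN) = 27 / (27 + 30) ≈ 0.4736842
TP / (TP + FN)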
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.
\[ Specificity = \frac{TN}{TN + FP} \]
calc_specificity <- function(df, actual_var_name, pred_var_name) {
  n_tn <- sum(df[actual_var_name] == 0 & df[pred_var_name] == 0)
  n_fp <- sum(df[actual_var_name] == 0 & df[pred_var_name] == 1)
  specificity <- n_tn / (n_tn + n_fp)
  return(specificity)
}
calc_specificity(df,"class","scored.class")
## [1] 0.9596774
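Again the confusion-matrix counts agree:
# TN / (TN + FP) = 119 / (119 + 5) ≈ 0.9596774
TN / (TN + FP)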
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.
\[ \text{F1 Score} = \frac{2 \times Precision \times Sensitivity}{Precision + Sensitivity} \]
calc_f1 <- function(df, actual_var_name, pred_var_name) {
  # Build F1 from the precision and sensitivity functions defined above,
  # passing through the supplied column names rather than hard-coding them
  precision <- calc_precision(df, actual_var_name, pred_var_name)
  sensitivity <- calc_sensitivity(df, actual_var_name, pred_var_name)
  f1 <- 2 * precision * sensitivity / (precision + sensitivity)
  return(f1)
}
calc_f1(df,"class","scored.class")
## [1] 0.6067416
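And one last hand check, combining the precision and sensitivity values computed above:
# 2 * 0.84375 * 0.4736842 / (0.84375 + 0.4736842) ≈ 0.6067416
p <- TP / (TP + FP)
s <- TP / (TP + FN)
2 * p * s / (p + s)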
Before we move on, let’s consider a question that was asked: What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1. (Hint: If 0<a<1 and 0<b<1 then ab<a.)
Ans: Let \(α = Precision\), \(β = Sensitivity\), and \(γ = \text{F1 Score} = \frac{2×α×β}{α+β}\).

Since \(0<α<1\) and \(0<β<1\), both the numerator \(2αβ\) and the denominator \(α+β\) are positive, so \(γ>0\).

For the upper bound, note that \(αβ<α\) (because \(β<1\)) and \(αβ<β\) (because \(α<1\)). Then

\[ γ = \frac{2×α×β}{α+β} = \frac{αβ}{α+β}+\frac{αβ}{α+β} < \frac{α}{α+β}+\frac{β}{α+β} = \frac{α+β}{α+β} = 1 \]

Therefore \(0<γ<1\): the F1 score is always strictly between 0 and 1.
Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
ROC <- function(actual_class, prob){
  # Order observations from highest to lowest predicted probability
  pred_df <- data.frame(actual_class = actual_class, prob = prob)
  pred_df <- pred_df[order(pred_df$prob, decreasing = TRUE), ]
  # Cumulative true/false positive rates as the threshold sweeps downward
  tpr <- cumsum(pred_df$actual_class) / sum(pred_df$actual_class)
  fpr <- cumsum(!pred_df$actual_class) / sum(!pred_df$actual_class)
  # Append the point (fpr = 1, tpr = 0) so the polygon under the curve closes
  tpr <- c(tpr, 0)
  fpr <- c(fpr, 1)
  tpr_fpr <- data.frame(tpr = tpr, fpr = fpr)
  # Approximate the AUC as a sum of rectangles: height = tpr, width = fpr range
  rect_table <- tpr_fpr %>%
    group_by(tpr) %>%
    summarise(
      fpr_min = min(fpr),
      fpr_max = max(fpr)
    ) %>%
    arrange(tpr)
  AUC <- sum(rect_table$tpr * (rect_table$fpr_max - rect_table$fpr_min))
  roc <- ggplot() +
    geom_polygon(data = tpr_fpr, mapping = aes(x = fpr, y = tpr), fill = 'deeppink4') +
    geom_line(data = tpr_fpr, mapping = aes(x = fpr, y = tpr)) +
    geom_abline() +
    labs(title = "ROC Curve",
         x = "False Positive Rate (1 - Specificity)",
         y = "True Positive Rate (Sensitivity)") +
    theme(plot.title = element_text(hjust = 0.5))
  return(list(roc = roc, AUC = AUC))
}
ROC(df$class,df$scored.probability)$roc
noquote(paste0("The Area Under Curve (AUC) is: ", round(ROC(df$class,df$scored.probability)$AUC,2)))
## [1] The Area Under Curve (AUC) is: 0.85
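One small usage note: ROC() recomputes the whole curve on every call, so it is slightly tidier to store the returned list once and reuse both elements, for example:
# Call ROC() once and reuse the stored list
roc_out <- ROC(df$class, df$scored.probability)
roc_out$roc               # the ggplot object
round(roc_out$AUC, 2)     # the AUC, about 0.85 as reported above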
Use your created R functions and the provided classification output data set to produce all of the classification metrics discussed above.
createdfunctions <- c(calc_accuracy(df,"class","scored.class"),
                      calc_error(df,"class","scored.class"),
                      calc_precision(df,"class","scored.class"),
                      calc_sensitivity(df,"class","scored.class"),
                      calc_specificity(df,"class","scored.class"),
                      calc_f1(df,"class","scored.class"))
names(createdfunctions) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
kable(createdfunctions, col.names = "Created Functions")
|                           | Created Functions |
|:--------------------------|------------------:|
| Accuracy                  |         0.8066298 |
| Classification Error Rate |         0.1933702 |
| Precision                 |         0.8437500 |
| Sensitivity               |         0.4736842 |
| Specificity               |         0.9596774 |
| F1 Score                  |         0.6067416 |
Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?
library(caret)
## Loading required package: lattice
df2 <- data.frame(actual_class = df$class,
                  predicted_class = df$scored.class)
df2$actual_class <- as.factor(df2$actual_class)
df2$predicted_class <- as.factor(df2$predicted_class)
c_M <- confusionMatrix(df2$predicted_class, df2$actual_class,
                       positive = "1")
caret <- c(c_M$overall["Accuracy"], c_M$byClass["Sensitivity"], c_M$byClass["Specificity"])
createdfunctions2 <- c(calc_accuracy(df,"class","scored.class"),
                       calc_sensitivity(df,"class","scored.class"),
                       calc_specificity(df,"class","scored.class"))
res <- cbind(caret, createdfunctions2)
kable(res, col.names = c("Caret Package","Created Functions"))
|             | Caret Package | Created Functions |
|:------------|--------------:|------------------:|
| Accuracy    |     0.8066298 |         0.8066298 |
| Sensitivity |     0.4736842 |         0.4736842 |
| Specificity |     0.9596774 |         0.9596774 |
The accuracy, sensitivity, and specificity reported by caret match the results produced by our own functions.
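Recent versions of caret also report precision, recall, and F1 in the byClass element of the confusionMatrix object, so those can be compared as well; a small optional sketch, assuming those entries are present in c_M$byClass:
# Compare caret's precision/recall/F1 with our own functions
c_M$byClass[c("Precision", "Recall", "F1")]
c(Precision = calc_precision(df,"class","scored.class"),
  Recall = calc_sensitivity(df,"class","scored.class"),
  F1 = calc_f1(df,"class","scored.class"))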
Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
library(pROC)
rocCurve <- roc(df$class, df$scored.probability)
plot(rocCurve)
The ROC curve matches the one we plotted with our own function.
noquote(paste0("The AUC is: ", round(auc(rocCurve),2)))
## [1] The AUC is: 0.85
The AUC is exactly the same as what we calculated earlier using our own function.