data <- read.csv("https://raw.githubusercontent.com/Sangeetha-007/Classification-Metrics/main/classification-output-data.csv")
head(data)
To obtain the raw confusion matrix using the table() function for a scored dataset, you typically need two vectors: one representing the actual (true) class labels and the other representing the predicted class labels.
A confusion matrix, also known as an error matrix, is a table used in the field of machine learning and statistics to evaluate the performance of a classification model, particularly in the context of binary classification. It is a matrix that provides a comprehensive summary of the model's predictions versus the actual or true class labels.
In a binary classification problem, a confusion matrix typically consists of four values:
True Positives (TP): The number of observations correctly classified as the positive class.
True Negatives (TN): The number of observations correctly classified as the negative class.
False Positives (FP): The number of observations that were incorrectly classified as the positive class when they actually belong to the negative class. These are also known as Type I errors.
False Negatives (FN): The number of observations that were incorrectly classified as the negative class when they actually belong to the positive class. These are also known as Type II errors.
confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
print(confusion_matrix)
## Predicted
## Actual 0 1
## 0 119 5
## 1 30 27
The rows of the confusion matrix represent the actual class labels and the columns represent the predicted class labels.
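To make that orientation concrete, each of the four cells can be read off the table by its row and column name; the counts in the comments come from the matrix printed above.
confusion_matrix["0", "0"]  # TN = 119 (actual 0, predicted 0)
confusion_matrix["0", "1"]  # FP = 5   (actual 0, predicted 1)
confusion_matrix["1", "0"]  # FN = 30  (actual 1, predicted 0)
confusion_matrix["1", "1"]  # TP = 27  (actual 1, predicted 1)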
To avoid repeating the dollar-sign notation later on, I am going to create two new variables holding the actual and predicted classes.
actual<-data$class
predicted<- data$scored.class
test_dataframe <- function(ds, actual, predicted) {
  # check that a dataset was supplied
  if (missing(ds))
    return("Object not found")
  # check that the object is a data frame
  if (!is.data.frame(ds))
    return("Not a dataframe")
  # check that the data frame is not empty
  if (nrow(ds) == 0)
    return("Dataset is empty")
  # build the confusion matrix from the actual and predicted columns (passed as column names)
  confusion_matrix <- table(Actual = ds[[actual]], Predicted = ds[[predicted]])
  # rows are actual and columns are predicted, so cell [2, 2] is TP and [1, 1] is TN
  return(list(TP = confusion_matrix[2, 2], TN = confusion_matrix[1, 1],
              FN = confusion_matrix[2, 1], FP = confusion_matrix[1, 2]))
}
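As a quick check, calling the helper on the scored dataset (using the column names loaded above) should reproduce the counts from the confusion matrix printed earlier.
test_dataframe(data, "class", "scored.class")
# expected: TP = 27, TN = 119, FN = 30, FP = 5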
accuracy <-function(ds, actual, predicted){
truth_table <- test_dataframe(ds, actual, predicted)
TP<- truth_table$TP
TN<-truth_table$TN
FN<-truth_table$FN
FP<-truth_table$FP
accurate<- ((TP + TN) / (TP + FP + TN + FN))
print(accurate)
return (accurate)
}
classification_error_rate <- function(ds, actual, predicted) {
  truth_table <- test_dataframe(ds, actual, predicted)
  TP <- truth_table$TP
  TN <- truth_table$TN
  FN <- truth_table$FN
  FP <- truth_table$FP
  # misclassified observations as a share of all observations
  return((FP + FN) / (TP + FP + TN + FN))
}
# The two metrics are complements: their numerators partition the confusion matrix,
# so accuracy plus the classification error rate should equal one.
# accuracy(data, "class", "scored.class") + classification_error_rate(data, "class", "scored.class")
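Worked out by hand from the confusion matrix printed above (181 observations in total):
# accuracy = (TP + TN) / total = (27 + 119) / 181 ≈ 0.8066
# error    = (FP + FN) / total = (5 + 30) / 181   ≈ 0.1934
# 0.8066 + 0.1934 = 1, as expected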
precision <- function(ds, actual, predicted) {
truth_table <- test_dataframe(ds, actual, predicted)
TP <- truth_table[['TP']]
TN <- truth_table[['TN']]
FN <- truth_table[['FN']]
FP <- truth_table[['FP']]
return (TP / (TP + FP))
}
sensitivity <- function(ds, actual, predicted) {
truth_table <- test_dataframe(ds, actual, predicted)
TP <- truth_table$TP
TN <- truth_table$TN
FN <- truth_table$FN
FP <- truth_table$FP
return (TP / (TP + FN))
}
specificity <- function(ds, actual, predicted) {
truth_table <- test_dataframe(ds, actual, predicted)
TP <- truth_table$TP
TN <- truth_table$TN
FN <- truth_table$FN
FP <- truth_table$FP
return (TN / (TN + FP))
}
f1_score <- function(ds, actual, predicted) {
truth_table <- test_dataframe(ds, actual, predicted)
TP <- truth_table$TP
TN <- truth_table$TN
FN <- truth_table$FN
FP <- truth_table$FP
pre <- (TP / (TP + FP))
sen <- (TP / (TP + FN))
return ((2*pre*sen) / (pre+sen))
}
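Plugging the counts from the confusion matrix above into these formulas gives a quick hand check:
# precision   = 27 / (27 + 5)  ≈ 0.844
# sensitivity = 27 / (27 + 30) ≈ 0.474
# F1 = 2 * 0.844 * 0.474 / (0.844 + 0.474) ≈ 0.607  (equivalently 2*TP / (2*TP + FP + FN) = 54/89)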
generate_ROC_curve <- function(data, true_class_column, probability_column) {
library(pROC)
# Extract the true class labels and predicted probabilities
true_labels <- data[[true_class_column]]
predicted_probabilities <- data[[probability_column]]
# Create a ROC curve object
roc_curve <- roc(true_labels, predicted_probabilities)
# Calculate the AUC
auc <- auc(roc_curve)
# Plot the ROC curve
plot(roc_curve, print.auc = TRUE, print.auc.x = 0.5, print.auc.y = 0.2)
# Return a list with the ROC curve plot and AUC value
result_list <- list(
ROC_Plot = roc_curve,
AUC = auc
)
return(result_list)
}
# Example usage:
# roc_data is your dataset, "class" is the true classification column, and "scored.probability" is the probability column.
roc_result <- generate_ROC_curve(data, "class", "scored.probability")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
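The returned list bundles the roc object and the AUC, so either piece can be pulled out of roc_result afterwards:
roc_result$AUC        # area under the curve computed by pROC
# roc_result$ROC_Plot # the underlying roc object, which can be re-plotted with plot()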
# accuracy_result <- accuracy(data, "class", "scored.class")
# cat("Accuracy: ", accuracy_result, "\n")
# error_rate_result <- classification_error_rate(data, "class", "scored.class")
# cat("Classification Error Rate: ", error_rate_result, "\n")
#
# precision_result <- precision(data, "class", "scored.class")
# cat("Precision: ", precision_result, "\n")
#
# sensitivity_result <- sensitivity(data, "class", "scored.class")
# cat("Sensitivity (Recall): ", sensitivity_result, "\n")
#
# specificity_result <- specificity(data, "class", "scored.class")
# cat("Specificity: ", specificity_result, "\n")
#
# f1_score_result <- f1_score(data, "class", "scored.class")
# cat("F1 Score: ", f1_score_result, "\n")
#
# roc_result <- generate_ROC_curve(data, "class", "scored.probability")
# data$class <- as.factor(data$class)
# data$scored.class <-as.factor(data$scored.class)
# # Create a confusion matrix using caret
# confusion_matrix_caret <- confusionMatrix(data$scored.class, data$class)
#
# # Display the confusion matrix and metrics
# confusion_matrix_caret
#
# # Calculate sensitivity (recall) using caret
# sensitivity_caret <- sensitivity(data$scored.class, data$class)
# cat("Sensitivity (Caret): ", sensitivity_caret, "\n")
#
# # Calculate specificity using caret
# specificity_caret <- specificity(data$scored.class, data$class)
# cat("Specificity (Caret): ", specificity_caret, "\n")
The commented-out code above produced the expected output when I ran it interactively as a chunk, but it would not knit. The most likely reason is that the sensitivity() and specificity() functions defined earlier in this document mask caret's versions of those functions (and caret is never loaded with library() in that chunk).
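One possible workaround, sketched under the assumption that caret is installed (the obs and pred names below are just illustrative): call the caret functions with explicit :: namespacing so they are not shadowed by the functions defined above.
library(caret)
obs  <- as.factor(data$class)         # true labels as a factor
pred <- as.factor(data$scored.class)  # predicted labels as a factor
# confusionMatrix() reports most of the metrics computed above in one call;
# positive = "1" makes caret's sensitivity/specificity match the definitions used here
caret::confusionMatrix(pred, obs, positive = "1")
caret::sensitivity(pred, obs, positive = "1")
caret::specificity(pred, obs, negative = "0")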
# Assuming "data" is your dataset, "class" is the true classification column, and "scored.probability" is the probability column.
true_labels <- as.factor(actual)
predicted_probabilities <- data$scored.probability
# Create an ROC curve object
roc_curve <- roc(true_labels, predicted_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, print.auc = TRUE, print.auc.x = 0.5, print.auc.y = 0.2)
# Calculate the AUC
auc_value <- auc(roc_curve)
cat("AUC (pROC): ", auc_value, "\n")
## AUC (pROC): 0.8503113
The AUC obtained for question 13 is 0.850. That manually generated curve is slightly more jagged than the pROC curve, but the AUC values are the same for both.
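Question 13's manual curve is not reproduced in this section, so as a rough illustration only, here is a minimal sketch of that kind of calculation; the manual_roc name and the 0.01 threshold grid are my own assumptions. Sweeping a coarse grid of cutoffs is what makes the hand-built curve look more jagged, while the trapezoidal rule still recovers roughly the same area.
# Sketch: build ROC points over a grid of probability thresholds, then
# approximate the AUC with the trapezoidal rule
manual_roc <- function(labels, probs, thresholds = seq(0, 1, by = 0.01)) {
  pts <- t(sapply(thresholds, function(th) {
    pred <- as.integer(probs >= th)
    TP <- sum(pred == 1 & labels == 1)
    FP <- sum(pred == 1 & labels == 0)
    TN <- sum(pred == 0 & labels == 0)
    FN <- sum(pred == 0 & labels == 1)
    c(FPR = FP / (FP + TN), TPR = TP / (TP + FN))
  }))
  pts <- pts[order(pts[, "FPR"], pts[, "TPR"]), ]
  auc <- sum(diff(pts[, "FPR"]) *
               (head(pts[, "TPR"], -1) + tail(pts[, "TPR"], -1)) / 2)
  list(points = pts, auc = auc)
}
# manual_roc(data$class, data$scored.probability)$auc  # roughly 0.85, in line with pROC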
Sources:
* https://www.dataquest.io/blog/write-functions-in-r/