library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
#install.packages('caret')
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
#install.packages("pROC")
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
df = read.csv("https://raw.githubusercontent.com/enidroman/DATA-621-Business-Analytics-and-Data-Mining/main/classification-output-data.csv")
df1 = df[,c('class', 'scored.class','scored.probability')]
head(df1)
## class scored.class scored.probability
## 1 0 0 0.32845226
## 2 0 0 0.27319044
## 3 1 0 0.10966039
## 4 0 0 0.05599835
## 5 0 0 0.10049072
## 6 0 0 0.05515460
cm <- table(predicted = df1$scored.class, actual = df1$class)
cm
## actual
## predicted 0 1
## 0 119 30
## 1 5 27
The table is a 2x2 confusion matrix. The rows represent the predicted class and the columns represent the actual class. Treating class 1 as the positive class (the convention used by the custom functions below): the 27 cases predicted 1 and actually 1 are the TP (True Positives), the 5 cases predicted 1 but actually 0 are the FP (False Positives), the 30 cases predicted 0 but actually 1 are the FN (False Negatives), and the 119 cases predicted 0 and actually 0 are the TN (True Negatives).
Note that caret's confusionMatrix, used later, treats class 0 as the positive class by default, which matters when comparing outputs.
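Since the custom functions below index the confusion matrix by position, it can help to first pull the four counts out of the cm table by name. A minimal sketch (class 1 taken as positive; not part of the original output):
# Extract the four cells from the cm table by row (predicted) and column (actual) name
TP <- cm["1", "1"]  # predicted 1, actual 1
FP <- cm["1", "0"]  # predicted 1, actual 0
FN <- cm["0", "1"]  # predicted 0, actual 1
TN <- cm["0", "0"]  # predicted 0, actual 0
c(TP = TP, FP = FP, FN = FN, TN = TN)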
calculate_accuracy <- function(confusion_matrix) {
  TP <- confusion_matrix[2, 2] # True Positives
  TN <- confusion_matrix[1, 1] # True Negatives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  accuracy <- (TP + TN) / (TP + FP + TN + FN)
  return(accuracy)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate accuracy
accuracy <- calculate_accuracy(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.806629834254144"
calculate_classification_error_rate <- function(confusion_matrix) {
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  error_rate <- (FP + FN) / sum(confusion_matrix)
  return(error_rate)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate error rate
error_rate <- calculate_classification_error_rate(confusion_matrix)
print(paste("Classification Error Rate:", error_rate))
## [1] "Classification Error Rate: 0.193370165745856"
# Calculate accuracy
accuracy <- 1 - error_rate
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.806629834254144"
calculate_precision <- function(confusion_matrix) {
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  precision <- TP / (TP + FP)
  return(precision)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate precision
precision <- calculate_precision(confusion_matrix)
print(paste("Precision:", precision))
## [1] "Precision: 0.473684210526316"
calculate_sensitivity <- function(confusion_matrix) {
  TP <- confusion_matrix[2, 2] # True Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  sensitivity <- TP / (TP + FN)
  return(sensitivity)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate sensitivity (recall)
sensitivity <- calculate_sensitivity(confusion_matrix)
print(paste("Sensitivity (Recall):", sensitivity))
## [1] "Sensitivity (Recall): 0.84375"
calculate_specificity <- function(confusion_matrix) {
  TN <- confusion_matrix[1, 1] # True Negatives
  FP <- confusion_matrix[1, 2] # False Positives
  specificity <- TN / (TN + FP)
  return(specificity)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate specificity
specificity <- calculate_specificity(confusion_matrix)
print(paste("Specificity:", specificity))
## [1] "Specificity: 0.798657718120805"
calculate_f1_score <- function(confusion_matrix) {
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  precision <- TP / (TP + FP)
  sensitivity <- TP / (TP + FN)
  f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
  return(f1_score)
}
# Build the confusion matrix (rows = actual class, columns = predicted class)
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate F1 score
f1_score <- calculate_f1_score(confusion_matrix)
print(paste("F1 Score:", f1_score))
## [1] "F1 Score: 0.606741573033708"
We want to show that the F1 score is always between 0 and 1. The F1 score is calculated as:
F1 Score = 2 * (Precision * Sensitivity) / (Precision + Sensitivity)
To demonstrate that the F1 score will always be between 0 and 1, we'll break it down step by step:
Precision = TP / (TP + FP) and Sensitivity = TP / (TP + FN) are each a non-negative count divided by a count at least as large, so both lie between 0 and 1.
If either Precision or Sensitivity is 0, the numerator 2 * Precision * Sensitivity is 0, so the F1 score is 0, its lower bound.
If both are positive, note that Precision + Sensitivity - 2 * Precision * Sensitivity = Precision * (1 - Sensitivity) + Sensitivity * (1 - Precision) >= 0, so the numerator never exceeds the denominator and the ratio is at most 1, with equality only when Precision = Sensitivity = 1.
Therefore, the F1 score, being the result of the F1 formula, is always between 0 and 1.
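To see the bound hold numerically as well (a quick sketch, not part of the original assignment output), the F1 formula can be evaluated over a grid of precision and sensitivity values in (0, 1]:
# Evaluate F1 over a grid of precision/sensitivity values in (0, 1] and confirm
# the result never leaves that interval
p <- seq(0.01, 1, by = 0.01)
grid <- expand.grid(precision = p, sensitivity = p)
f1 <- with(grid, 2 * precision * sensitivity / (precision + sensitivity))
range(f1)  # minimum 0.01 (both equal 0.01), maximum 1 (both equal 1)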
generate_roc_curve <- function(df1, class_column, probability_column) {
  # Extract true class labels and predicted probabilities
  true_labels <- df1[[class_column]]
  predicted_probabilities <- df1[[probability_column]]
  # Create a sequence of thresholds from 0 to 1 at 0.01 intervals
  thresholds <- seq(0, 1, by = 0.01)
  # Initialize vectors to store TPR (True Positive Rate) and FPR (False Positive Rate)
  tpr <- numeric(length(thresholds))
  fpr <- numeric(length(thresholds))
  for (i in seq_along(thresholds)) {
    threshold <- thresholds[i]
    # Create binary predictions based on the current threshold
    predictions <- ifelse(predicted_probabilities >= threshold, 1, 0)
    # Count True Positives, False Positives, True Negatives, and False Negatives
    TP <- sum(predictions == 1 & true_labels == 1)
    FP <- sum(predictions == 1 & true_labels == 0)
    TN <- sum(predictions == 0 & true_labels == 0)
    FN <- sum(predictions == 0 & true_labels == 1)
    # Calculate TPR and FPR at this threshold
    tpr[i] <- TP / (TP + FN)
    fpr[i] <- FP / (FP + TN)
  }
  # Approximate the AUC with a rectangle (Riemann) sum. Since the thresholds
  # increase, fpr and tpr decrease, so diff(fpr) is negative; negate the sum
  # to report the area as a positive number.
  auc <- -sum(diff(fpr) * tpr[-length(tpr)])
  # Plot the ROC curve
  plot(fpr, tpr, type = "l", main = "ROC Curve",
       xlab = "False Positive Rate", ylab = "True Positive Rate")
  # Return the ROC curve and AUC
  return(list(roc_curve = list(fpr = fpr, tpr = tpr), auc = auc))
}
# Apply the function to the assignment data
result <- generate_roc_curve(df, 'class', 'scored.probability')
cat('AUC:', result$auc, '\n')
## AUC: 0.8539898
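As an optional refinement (a sketch, not part of the original assignment output), the rectangle sum inside generate_roc_curve can be cross-checked with a trapezoidal rule over the same (fpr, tpr) points:
# Trapezoidal-rule cross-check: order the points by FPR, then integrate
ord <- order(result$roc_curve$fpr)
fpr_o <- result$roc_curve$fpr[ord]
tpr_o <- result$roc_curve$tpr[ord]
auc_trap <- sum(diff(fpr_o) * (head(tpr_o, -1) + tail(tpr_o, -1)) / 2)
cat('AUC (trapezoidal):', auc_trap, '\n')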
# Summarize all of the custom metrics using the same confusion matrix
confusion_matrix <- matrix(c(119, 5, 30, 27), nrow = 2, byrow = TRUE,
  dimnames = list(c("Actual 0", "Actual 1"), c("Predicted 0", "Predicted 1")))
# Calculate metrics
accuracy <- calculate_accuracy(confusion_matrix)
error_rate <- calculate_classification_error_rate(confusion_matrix)
precision <- calculate_precision(confusion_matrix)
sensitivity <- calculate_sensitivity(confusion_matrix)
specificity <- calculate_specificity(confusion_matrix)
f1_score <- calculate_f1_score(confusion_matrix)
cat('Accuracy:', accuracy, '\n')
## Accuracy: 0.8066298
cat('Error Rate:', error_rate, '\n')
## Error Rate: 0.1933702
cat('Precision:', precision, '\n')
## Precision: 0.84375
cat('Sensitivity (Recall):', sensitivity, '\n')
## Sensitivity (Recall): 0.4736842
cat('Specificity:', specificity, '\n')
## Specificity: 0.9596774
cat('F1 Score:', f1_score, '\n')
## F1 Score: 0.6067416
The caret package in R is a powerful tool for training and evaluating machine learning models, and it provides a wide range of functions for model evaluation and performance metrics. Its confusionMatrix, sensitivity, and specificity functions can be used to calculate the classification metrics computed above.
Let's apply these functions to the dataset and compare the results with the functions we created:
# Rebuild the cm table (rows = predicted class, columns = actual class),
# which is the orientation caret's confusionMatrix expects
conf_matrix <- matrix(c(119, 30, 5, 27), nrow = 2, byrow = TRUE, dimnames = list(c("0", "1"), c("0", "1")))
# Convert the matrix to a table object so confusionMatrix can consume it
conf_matrix_df <- as.table(conf_matrix)
# Compute confusion matrix using caret's confusionMatrix function
cm <- confusionMatrix(conf_matrix_df)
cm
## Confusion Matrix and Statistics
##
## 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
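The standalone sensitivity() and specificity() helpers mentioned above can be applied to the same table. A minimal sketch (assuming, as with confusionMatrix, that the table has predictions in rows and actual classes in columns; not part of the original output):
# caret's standalone helpers; with "0" declared positive they reproduce the
# Sensitivity and Specificity lines of the confusionMatrix output above
sensitivity(conf_matrix_df, positive = "0")   # 119 / (119 + 5)
specificity(conf_matrix_df, negative = "1")   # 27 / (27 + 30)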
# Create an ROC curve
roc_curve <- roc(response = df$class, predictor = df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, print.auc = TRUE, main = "ROC Curve")
# Calculate AUC
auc_value <- auc(roc_curve)
cat('AUC (pROC):', auc_value, '\n')
## AUC (pROC): 0.8503113
Comparing the results with our own functions:
The custom AUC (0.854, from the rectangle-sum approximation over a 0.01 threshold grid) is very close to the pROC AUC of 0.850; the small gap comes from the coarser grid and the simpler integration rule.
caret's confusionMatrix treats class 0 as the positive class by default, while our custom functions treat class 1 as positive. As a result, caret's Sensitivity (0.9597) and Specificity (0.4737) correspond to our specificity and sensitivity respectively, and its Neg Pred Value (0.8438) corresponds to our precision. Accuracy (0.8066) is identical under either convention; a quick way to verify the correspondence is sketched below.
The pROC package provides a convenient and widely used method for generating ROC curves and calculating AUC in R, and it offers customization options such as confidence intervals and smoothing. The custom functions remain useful for metrics that pROC does not report directly, such as precision, error rate, sensitivity, and specificity.
Both approaches give consistent results once the positive-class convention is accounted for; the choice comes down to specific needs and familiarity with the packages.
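For a direct apples-to-apples check (a sketch under the class-1-positive convention used by the custom functions, not part of the original output), confusionMatrix can be re-run with positive = "1"; its Sensitivity, Specificity, and Pos Pred Value should then match the custom sensitivity, specificity, and precision:
# Re-run caret's confusionMatrix with class "1" declared as the positive class,
# so its metrics follow the same convention as the custom functions
cm_pos1 <- confusionMatrix(conf_matrix_df, positive = "1")
cm_pos1$byClass[c("Sensitivity", "Specificity", "Pos Pred Value")]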