# Load required libraries
library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(lubridate)
library(yaml)
library(httr)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:httr':
##
## progress
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
classification_data <- read.csv('https://raw.githubusercontent.com/hawa1983/DATA-621/refs/heads/main/Homework%202/classification-output-data.csv')
head(classification_data)
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## scored.class scored.probability
## 1 0 0.32845226
## 2 0 0.27319044
## 3 0 0.10966039
## 4 0 0.05599835
## 5 0 0.10049072
## 6 0 0.05515460
The confusion matrix below is built from the predicted class (scored.class) and the actual class (class).
# Ensure that the actual and predicted class columns are factors
classification_data$class <- factor(classification_data$class)
classification_data$scored.class <- factor(classification_data$scored.class)
# Create confusion matrix using the table() function
conf_matrix <- table(Predicted = classification_data$scored.class, Actual = classification_data$class)
# Extract individual values from the confusion matrix
TN <- conf_matrix["0", "0"]
FP <- conf_matrix["1", "0"]
FN <- conf_matrix["0", "1"]
TP <- conf_matrix["1", "1"]
# Convert confusion matrix to a data frame
conf_matrix_df <- as.data.frame(conf_matrix)
# Add a new column with the values TN, FP, FN, TP
conf_matrix_df$Value <- c(TN, FP, FN, TP)
# Add a new column with concise definitions of each term
conf_matrix_df$Definition <- c(
"True Negatives (TN): Correctly predicted 'No'",
"False Positives (FP): Incorrectly predicted 'Yes'",
"False Negatives (FN): Incorrectly predicted 'No'",
"True Positives (TP): Correctly predicted 'Yes'"
)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 119 30
## 1 5 27
# Print the updated confusion matrix data frame
cat("\nConfusion Matrix with Values and Definitions:\n")
##
## Confusion Matrix with Values and Definitions:
print(conf_matrix_df)
## Predicted Actual Freq Value Definition
## 1 0 0 119 119 True Negatives (TN): Correctly predicted 'No'
## 2 1 0 5 5 False Positives (FP): Incorrectly predicted 'Yes'
## 3 0 1 30 30 False Negatives (FN): Incorrectly predicted 'No'
## 4 1 1 27 27 True Positives (TP): Correctly predicted 'Yes'
The rows of the confusion matrix represent the predicted class (scored.class in the dataset), and the columns represent the actual class (class in the dataset).
- TN = 119: the model predicted 0, and the actual class was 0.
- FN = 30: the model predicted 0, but the actual class was 1.
- FP = 5: the model predicted 1, but the actual class was 0.
- TP = 27: the model predicted 1, and the actual class was 1.
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.
custom_accuracy <- function(data) {
TP <- sum(data$class == 1 & data$scored.class == 1)
TN <- sum(data$class == 0 & data$scored.class == 0)
FP <- sum(data$class == 0 & data$scored.class == 1)
FN <- sum(data$class == 1 & data$scored.class == 0)
return((TP + TN) / (TP + FP + TN + FN))
}
accuracy <- custom_accuracy(classification_data)
cat(sprintf("accuracy: %.4f\n", accuracy))
## accuracy: 0.8066
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
error_rate <- function(data) {
TP <- sum(data$class == 1 & data$scored.class == 1)
TN <- sum(data$class == 0 & data$scored.class == 0)
FP <- sum(data$class == 0 & data$scored.class == 1)
FN <- sum(data$class == 1 & data$scored.class == 0)
return((FP + FN) / (TP + FP + TN + FN))
}
error_rate <- error_rate(classification_data)
# Compare results
cat(sprintf("Classification Error Rate: %.4f\n", error_rate))
## Classification Error Rate: 0.1934
cat(sprintf("Sum of Accuracy and Classification Error Rate: %.2f\n", accuracy + error_rate))
## Sum of Accuracy and Classification Error Rate: 1.00
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.
custom_precision <- function(data) {
TP <- sum(data$class == 1 & data$scored.class == 1)
FP <- sum(data$class == 0 & data$scored.class == 1)
return(TP / (TP + FP))
}
precision <- custom_precision(classification_data)
cat(sprintf("precision: %.4f\n", precision))
## precision: 0.8438
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.
custom_sensitivity <- function(data) {
TP <- sum(data$class == 1 & data$scored.class == 1)
FN <- sum(data$class == 1 & data$scored.class == 0)
return(TP / (TP + FN))
}
sensitivity <- custom_sensitivity(classification_data)
recall <- sensitivity
cat(sprintf("sensitivity (recall): %.4f\n", sensitivity))
## sensitivity (recall): 0.4737
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.
custom_specificity <- function(data) {
TN <- sum(data$class == 0 & data$scored.class == 0)
FP <- sum(data$class == 0 & data$scored.class == 1)
return(TN / (TN + FP))
}
specificity <- custom_specificity(classification_data)
cat(sprintf("specificity: %.4f\n", specificity))
## specificity: 0.9597
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.
# Custom F1 score function (built from the precision and sensitivity of the supplied data)
custom_f1_score <- function(data) {
precision <- custom_precision(data)
recall <- custom_sensitivity(data)
return(2 * (precision * recall) / (precision + recall))
}
f1score <- custom_f1_score(classification_data)
cat(sprintf("F1 Score: %.4f\n", f1score))
## F1 Score: 0.6067
Yes, the output below shows that the F1 score is bounded between 0 and 1.
Explanation:
Algebraically, with precision P and recall R both in [0, 1], the numerator 2PR is non-negative, and 2PR = PR + PR ≤ P + R (since PR ≤ P and PR ≤ R), so F1 = 2PR / (P + R) lies between 0 and 1 whenever P + R > 0; the degenerate case P = R = 0 is handled by returning 0.
Interpretation of the matrix below:
- The matrix evaluates the F1 score over a grid of precision and recall values; every entry lies between 0 and 1.
- f1_score(0, 1) returns 0, the lower bound.
- f1_score(1, 1) returns 1, the upper bound.
- f1_score(0.5, 0.5) returns a value strictly between 0 and 1, the typical case for non-extreme precision and recall.
This confirms that the F1 score is bounded between 0 and 1, thus answering the question.
# Updated F1 Score function to handle division by zero
f1_score <- function(precision, recall) {
if (precision == 0 & recall == 0) {
return(0) # Return 0 when both precision and recall are 0
} else {
return(2 * (precision * recall) / (precision + recall))
}
}
# Example values of Precision (P) and Recall (R)
precision_values <- seq(0, 1, by = 0.1) # Precision from 0 to 1
recall_values <- seq(0, 1, by = 0.1) # Recall from 0 to 1
# Create a matrix to store F1 scores
f1_matrix <- matrix(nrow = length(precision_values), ncol = length(recall_values))
# Calculate F1 scores for each combination of precision and recall
for (i in seq_along(precision_values)) {
for (j in seq_along(recall_values)) {
f1_matrix[i, j] <- f1_score(precision_values[i], recall_values[j])
}
}
# Display the matrix of F1 scores
print(f1_matrix)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [2,] 0 0.1000000 0.1333333 0.1500000 0.1600000 0.1666667 0.1714286
## [3,] 0 0.1333333 0.2000000 0.2400000 0.2666667 0.2857143 0.3000000
## [4,] 0 0.1500000 0.2400000 0.3000000 0.3428571 0.3750000 0.4000000
## [5,] 0 0.1600000 0.2666667 0.3428571 0.4000000 0.4444444 0.4800000
## [6,] 0 0.1666667 0.2857143 0.3750000 0.4444444 0.5000000 0.5454545
## [7,] 0 0.1714286 0.3000000 0.4000000 0.4800000 0.5454545 0.6000000
## [8,] 0 0.1750000 0.3111111 0.4200000 0.5090909 0.5833333 0.6461538
## [9,] 0 0.1777778 0.3200000 0.4363636 0.5333333 0.6153846 0.6857143
## [10,] 0 0.1800000 0.3272727 0.4500000 0.5538462 0.6428571 0.7200000
## [11,] 0 0.1818182 0.3333333 0.4615385 0.5714286 0.6666667 0.7500000
## [,8] [,9] [,10] [,11]
## [1,] 0.0000000 0.0000000 0.0000000 0.0000000
## [2,] 0.1750000 0.1777778 0.1800000 0.1818182
## [3,] 0.3111111 0.3200000 0.3272727 0.3333333
## [4,] 0.4200000 0.4363636 0.4500000 0.4615385
## [5,] 0.5090909 0.5333333 0.5538462 0.5714286
## [6,] 0.5833333 0.6153846 0.6428571 0.6666667
## [7,] 0.6461538 0.6857143 0.7200000 0.7500000
## [8,] 0.7000000 0.7466667 0.7875000 0.8235294
## [9,] 0.7466667 0.8000000 0.8470588 0.8888889
## [10,] 0.7875000 0.8470588 0.9000000 0.9473684
## [11,] 0.8235294 0.8888889 0.9473684 1.0000000
cat("\nTest for boundary cases:\n")
##
## Test for boundary cases:
# Check the boundaries of F1 score
# Test for boundary cases
cat(sprintf("f1_score(0, 1) # Should return 0 (lower bound): %.2f\n", f1_score(0, 1)))
## f1_score(0, 1) # Should return 0 (lower bound): 0.00
cat(sprintf("f1_score(0.5, 0.5) # Should return a value between 0 and 1: %.2f\n", f1_score(0.5, 0.5)))
## f1_score(0.5, 0.5) # Should return a value between 0 and 1: 0.50
cat(sprintf("f1_score(1, 1) # Should return 1 (upper bound): %.2f\n", f1_score(1, 1)))
## f1_score(1, 1) # Should return 1 (upper bound): 1.00
# Custom function for calculating TPR, FPR, and AUC
custom_roc_curve <- function(data) {
# Ensure that the probabilities are sorted in descending order
data <- data[order(-data$scored.probability), ]
# Define a sequence of thresholds from 0 to 1 with a step of 0.01
thresholds <- seq(0, 1, by = 0.01)
n_positive <- sum(data$class == 1)
n_negative <- sum(data$class == 0)
tpr <- numeric(length(thresholds)) # True Positive Rate
fpr <- numeric(length(thresholds)) # False Positive Rate
# Loop through each threshold to calculate TPR and FPR
for (i in seq_along(thresholds)) {
threshold <- thresholds[i]
# Predicted positives at the current threshold
predicted_positive <- data$scored.probability >= threshold
# Calculate TP, FP, FN, TN
TP <- sum(predicted_positive & data$class == 1)
FP <- sum(predicted_positive & data$class == 0)
FN <- n_positive - TP
TN <- n_negative - FP
# Calculate TPR and FPR
tpr[i] <- TP / (TP + FN) # Sensitivity or Recall
fpr[i] <- FP / (FP + TN) # 1 - Specificity
}
# Sort by FPR to plot the ROC curve
sorted_fpr <- c(0, sort(fpr), 1)
sorted_tpr <- c(0, sort(tpr), 1)
# Calculate AUC using the trapezoidal rule
auc_value <- sum(diff(sorted_fpr) * (sorted_tpr[-1] + sorted_tpr[-length(sorted_tpr)]) / 2)
# Plot the ROC curve
plot(sorted_fpr, sorted_tpr, type = "l", col = "blue", lwd = 2,
xlab = "False Positive Rate (1 - Specificity)", ylab = "True Positive Rate (Sensitivity)",
main = "ROC Curve")
abline(0, 1, col = "red", lty = 2) # Diagonal line for random guessing
# Return the ROC values and AUC
return(list(roc_curve = list(fpr = sorted_fpr, tpr = sorted_tpr), auc = auc_value))
}
# Call the custom ROC function
roc_curve_result <- custom_roc_curve(classification_data)
auc <- roc_curve_result$auc
# Print the AUC
print(paste("AUC:", auc))
## [1] "AUC: 0.848896434634975"
# Call the previously defined functions and calculate the metrics
# Calculate metrics
accuracy_value <- custom_accuracy(classification_data)
sensitivity_value <- custom_sensitivity(classification_data)
specificity_value <- custom_specificity(classification_data)
precision_value <- custom_precision(classification_data)
f1_value <- custom_f1_score(classification_data)
# Create a data frame for metrics
metrics_df <- data.frame(
Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", "F1 Score"),
Value = c(accuracy_value, sensitivity_value, specificity_value, precision_value, f1_value)
)
# Print the metrics data frame
print("classification metrics:")
## [1] "classification metrics:"
print(metrics_df)
## Metric Value
## 1 Accuracy 0.8066298
## 2 Sensitivity 0.4736842
## 3 Specificity 0.9596774
## 4 Precision 0.8437500
## 5 F1 Score 0.6067416
The caret functions and the custom functions for sensitivity and specificity produce the same numbers; the only difference is which class each treats as positive. In the first run, caret took the class labeled 0 as the positive class by default, which is why it reported a Sensitivity of 95.97% (sensitivity for detecting class 0).
After re-ordering the factor levels so that class 1 is the positive class, caret's sensitivity dropped to 47.37%, matching the custom function's sensitivity for class 1, and its specificity became 95.97%, again matching the custom function.
This demonstrates that caret and the custom functions agree; the interpretation of sensitivity and specificity simply depends on how the positive class is defined. The difference between the first and second runs is entirely due to that choice, and it can be avoided with consistent factor level ordering.
In conclusion, both methods are valid, and the key takeaway is the importance of explicitly defining the positive class so that sensitivity and specificity are interpreted consistently.
# caret's confusionMatrix() requires that `data` and `reference` be factors.
# Ensure both actual and predicted are factors
classification_data$class <- factor(classification_data$class)
classification_data$scored.class <- factor(classification_data$scored.class)
# Generate confusion matrix with caret using re-ordered factors
confusion_result <- confusionMatrix(classification_data$scored.class, classification_data$class)
# Extract sensitivity and specificity
caret_sensitivity <- confusion_result$byClass["Sensitivity"]
caret_specificity <- confusion_result$byClass["Specificity"]
# Use custom functions
custom_sensitivity_value <- custom_sensitivity(classification_data)
custom_specificity_value <- custom_specificity(classification_data)
# Print the confusion matrix
print("Confusion matrix:")
## [1] "Confusion matrix:"
print(confusion_result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
# Compare results
cat(sprintf("Caret Sensitivity: %.2f vs Custom Sensitivity: %.2f\n", caret_sensitivity, custom_sensitivity_value))
## Caret Sensitivity: 0.96 vs Custom Sensitivity: 0.47
cat(sprintf("Caret Specificity: %.2f vs Custom Specificity: %.2f\n", caret_specificity, custom_specificity_value))
## Caret Specificity: 0.47 vs Custom Specificity: 0.96
# Ensure that the predicted class and actual class are factors
# Set factor levels explicitly
classification_data$class <- factor(classification_data$class, levels = c(1, 0))
classification_data$scored.class <- factor(classification_data$scored.class, levels = c(1, 0))
# Generate confusion matrix with caret using re-ordered factors
confusion_result <- confusionMatrix(classification_data$scored.class, classification_data$class)
# Extract sensitivity and specificity
caret_sensitivity <- confusion_result$byClass["Sensitivity"]
caret_specificity <- confusion_result$byClass["Specificity"]
# Use custom functions
custom_sensitivity_value <- custom_sensitivity(classification_data)
custom_specificity_value <- custom_specificity(classification_data)
# Print the confusion matrix
print("Confusion matrix:")
## [1] "Confusion matrix:"
print(confusion_result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 27 5
## 0 30 119
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
# Compare results
cat(sprintf("Caret Sensitivity: %.2f vs Custom Sensitivity: %.2f\n", caret_sensitivity, custom_sensitivity_value))
## Caret Sensitivity: 0.47 vs Custom Sensitivity: 0.47
cat(sprintf("Caret Specificity: %.2f vs Custom Specificity: %.2f\n", caret_specificity, custom_specificity_value))
## Caret Specificity: 0.96 vs Custom Specificity: 0.96
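As noted above, the difference between the two caret runs comes entirely from which class is treated as positive. A minimal alternative sketch (using caret's `positive` argument rather than re-ordering the factor levels; assumes the same `classification_data`):
# Name the positive class directly instead of re-ordering the factor levels
confusion_pos1 <- confusionMatrix(classification_data$scored.class, classification_data$class, positive = "1")
# Should match the custom sensitivity (0.47) and specificity (0.96) for class 1
confusion_pos1$byClass[c("Sensitivity", "Specificity")]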
Comparing the pROC package with the custom ROC function, the ROC curves and AUC values are very similar. Specifically:
The slight difference in AUC values between pROC and the custom function (0.8503 vs. 0.8484) comes from how each builds the curve: pROC evaluates the ROC at every observed predicted probability, so its AUC is the exact area under the empirical ROC curve, whereas the custom function evaluates a fixed grid of 101 thresholds (steps of 0.01) and approximates the area with the trapezoidal rule.
The two ROC curves have essentially the same shape, showing that both approaches capture the same underlying model performance. Both curves rise steeply toward high sensitivity at low false positive rates and sit well above the diagonal red line that represents random guessing, indicating strong classification ability.
Thus, the pROC package produces results nearly identical to the custom function, while being more convenient and robust because it handles thresholding and AUC computation exactly.
# Use pROC to generate the ROC curve
roc_result <- roc(classification_data$class, classification_data$scored.probability)
## Setting levels: control = 1, case = 0
## Setting direction: controls > cases
# Calculate and display the AUC (Area Under the Curve)
pROC_auc <- auc(roc_result)
cat(sprintf("pROC AUC: %.4f\n", pROC_auc))
## pROC AUC: 0.8503
# Subset the auc from the custom ROC function
custom_auc <- roc_curve_result$auc
cat(sprintf("Custom AUC: %.4f\n", auc))
## Custom AUC: 0.8489
# Compare AUC values
cat(sprintf("pROC AUC: %.4f vs Custom AUC: %.4f\n", pROC_auc, custom_auc))
## pROC AUC: 0.8503 vs Custom AUC: 0.8489
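To see why the two values are so close, here is a minimal cross-check sketch (not part of the original analysis): the empirical AUC can also be computed as the probability that a randomly chosen positive receives a higher predicted probability than a randomly chosen negative, counting ties as one half. pROC's value should essentially agree with this rank-based calculation, while the fixed 0.01 threshold grid in the custom function only approximates it.
# Rank-based (Mann-Whitney) view of the AUC, with ties counted as 0.5
pos <- classification_data$scored.probability[classification_data$class == 1]
neg <- classification_data$scored.probability[classification_data$class == 0]
rank_auc <- mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "=="))
cat(sprintf("Rank-based AUC: %.4f\n", rank_auc))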
# Load necessary libraries
library(ggplot2)
library(pROC)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Step 1: Generate the ROC curve using pROC
roc_result <- roc(classification_data$class, classification_data$scored.probability)
## Setting levels: control = 1, case = 0
## Setting direction: controls > cases
auc_value <- auc(roc_result)
# Convert pROC result to a data frame for ggplot
roc_data <- data.frame(specificity = roc_result$specificities,
sensitivity = roc_result$sensitivities)
# Plot ROC curve with pROC
pROC_plot <- ggplot(data = roc_data, aes(x = 1 - specificity, y = sensitivity)) +
geom_line(color = "blue", size = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("pROC AUC:", round(auc_value, 4))) +
xlab("1 - Specificity") +
ylab("Sensitivity") +
theme_minimal()
# Step 2: Custom ROC curve function with FPR and TPR sorting
custom_roc_curve <- function(data) {
thresholds <- seq(0, 1, by = 0.01)
TPR <- numeric(length(thresholds)) # True Positive Rate
FPR <- numeric(length(thresholds)) # False Positive Rate
for (i in seq_along(thresholds)) {
threshold <- thresholds[i]
predicted_positive <- data$scored.probability >= threshold
TP <- sum(predicted_positive & data$class == 1)
FP <- sum(predicted_positive & data$class == 0)
FN <- sum(!predicted_positive & data$class == 1)
TN <- sum(!predicted_positive & data$class == 0)
TPR[i] <- TP / (TP + FN)
FPR[i] <- FP / (FP + TN)
}
# Ensure FPR and TPR are sorted before calculating AUC
sorted_indices <- order(FPR)
sorted_FPR <- FPR[sorted_indices]
sorted_TPR <- TPR[sorted_indices]
# Calculate AUC using the trapezoidal rule
auc_value <- sum(diff(sorted_FPR) * (sorted_TPR[-1] + sorted_TPR[-length(sorted_TPR)]) / 2)
return(list(FPR = sorted_FPR, TPR = sorted_TPR, auc = auc_value))
}
# Apply custom ROC function
custom_roc <- custom_roc_curve(classification_data)
# Convert custom ROC data to a data frame
custom_roc_data <- data.frame(FPR = custom_roc$FPR, TPR = custom_roc$TPR)
# Plot custom ROC curve
custom_plot <- ggplot(data = custom_roc_data, aes(x = FPR, y = TPR)) +
geom_line(color = "green", size = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("Custom AUC:", round(custom_roc$auc, 4))) +
xlab("1 - Specificity") +
ylab("Sensitivity") +
theme_minimal()
# Step 3: Display the two plots side by side
grid.arrange(pROC_plot, custom_plot, ncol = 2)
# Generate ROC curve using pROC
roc_result <- roc(classification_data$class, classification_data$scored.probability)
## Setting levels: control = 1, case = 0
## Setting direction: controls > cases
# Plot the ROC curve
plot(roc_result, col = "blue", lwd = 2, main = "ROC Curve with AUC")
auc_value <- auc(roc_result) # Calculate AUC
cat(sprintf("AUC: %.3f\n", auc_value))
## AUC: 0.850
Create Gain and Lift Charts based on the output of the classification model.
Steps for Generating Gain and Lift Charts:
- Sort the observations by the predicted probability (scored.probability) in descending order.
- Split the ranked observations into deciles.
- Compute the cumulative gain: the cumulative share of true positives captured at each point in the ranking.
- Compute the lift: the cumulative gain divided by the cumulative percentage of observations included.
# Custom Gain and Lift Chart function
custom_gain_lift <- function(data) {
# Sort the data by scored.probability in descending order
data <- data[order(-data$scored.probability), ]
# Create a decile (or percentile) column
data$decile <- ntile(data$scored.probability, 10) # Divide into 10 deciles
# Calculate cumulative gain
total_positives <- sum(data$class == 1) # Total positives (class = 1)
data$cumulative_positives <- cumsum(data$class == 1)
data$cumulative_gain <- data$cumulative_positives / total_positives
# Calculate lift
data$decile_percentage <- 1:nrow(data) / nrow(data) # Percentage of total observations
data$lift <- data$cumulative_gain / data$decile_percentage
return(data)
}
# Apply the Gain and Lift calculation
gain_lift_data <- custom_gain_lift(classification_data)
The Gain Chart shows the cumulative gain in identifying true positives as more of the population is sampled based on predicted probability. A steep initial rise indicates that the model captures a large portion of positives early, for example, the top 25% of the sample captures about 75% of the positives. The chart outperforms the baseline (red dashed line), demonstrating that the model is more effective than random guessing.
The Lift Chart measures the improvement in model performance over random selection. A lift of 3 at the start means the model is three times better at identifying positives than random selection in the top deciles. As more of the sample is included, the lift decreases and eventually converges toward 1, indicating no advantage over random selection for the entire population.
Both charts indicate that the model is effective at identifying true positives early, with strong gains and lift in the top portion of predictions.
# Load ggplot2 for plotting
library(ggplot2)
# Plot Gain Chart
ggplot(gain_lift_data, aes(x = decile_percentage, y = cumulative_gain)) +
geom_line(color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Gain Chart", x = "Percentage of Sample", y = "Cumulative Gain") +
theme_minimal()
# Plot Lift Chart
ggplot(gain_lift_data, aes(x = decile_percentage, y = lift)) +
geom_line(color = "green") +
geom_abline(slope = 0, intercept = 1, linetype = "dashed", color = "red") +
labs(title = "Lift Chart", x = "Percentage of Sample", y = "Lift") +
theme_minimal()
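To complement the charts, here is a minimal summary sketch (assuming `gain_lift_data` produced by `custom_gain_lift()` above) of cumulative gain and lift at the end of each 10% slice of the ranked sample:
# Summarise cumulative gain and lift at the end of each 10% slice
# (gain_lift_data is already sorted by scored.probability, descending)
decile_summary <- gain_lift_data %>%
mutate(bucket = ceiling(10 * decile_percentage)) %>% # bucket 1 = top 10% of ranked observations
group_by(bucket) %>%
summarise(cumulative_gain = last(cumulative_gain), lift = last(lift))
print(decile_summary)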
#### iii. Kolmogorov-Smirnov (K-S) Chart
The K-S Chart shows a K-S statistic of 0.58, indicating the model effectively separates positive and negative classes. The green line (Sensitivity) rises sharply, capturing positives early, while the blue line (1-Specificity) increases more slowly, meaning fewer false positives initially. The large gap between the lines reflects good model performance.
# Function to calculate and plot K-S Chart
ks_chart <- function(data) {
# Sort the data by predicted probabilities
data <- data[order(-data$scored.probability), ]
# Add an index column to represent the rank of each observation
data$index <- 1:nrow(data)
# Calculate cumulative distributions
data$cum_true_positive <- cumsum(data$class == 1) / sum(data$class == 1)
data$cum_false_positive <- cumsum(data$class == 0) / sum(data$class == 0)
# Calculate the K-S statistic (max difference)
data$ks_stat <- abs(data$cum_true_positive - data$cum_false_positive)
ks_value <- max(data$ks_stat)
ks_index <- which.max(data$ks_stat)
# Extract the row where the K-S statistic occurs (for annotation)
ks_row <- data[ks_index, ]
# Plot K-S chart using the index for the x-axis
library(ggplot2)
ks_plot <- ggplot(data, aes(x = index)) +
geom_line(aes(y = cum_true_positive, color = "True Positive Rate (Sensitivity)")) +
geom_line(aes(y = cum_false_positive, color = "False Positive Rate (1 - Specificity)")) +
geom_vline(xintercept = ks_index, linetype = "dashed", color = "red") +
geom_text(data = ks_row, aes(x = index, y = 0.5, label = paste("KS Statistic =", round(ks_value, 2))),
color = "red", vjust = -1.5) + # Only annotate at the ks_index row
labs(title = "K-S Chart", x = "Observations (sorted by probability)", y = "Cumulative Rate") +
scale_color_manual(values = c("blue", "green")) +
theme_minimal()
# Print the plot
print(ks_plot)
# Return the K-S statistic
return(ks_value)
}
# Apply the K-S chart function
ks_statistic <- ks_chart(classification_data)
# Print K-S statistic value
cat(sprintf("K-S Statistic: %.4f\n", ks_statistic))
## K-S Statistic: 0.5808
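As a quick cross-check (a sketch using base R's `ks.test()`, which may warn that an exact p-value cannot be computed if `scored.probability` contains ties), the two-sample Kolmogorov-Smirnov statistic between the score distributions of the two classes should be close to the maximum gap found above:
# Two-sample K-S statistic between the score distributions of the two classes
pos_scores <- classification_data$scored.probability[classification_data$class == 1]
neg_scores <- classification_data$scored.probability[classification_data$class == 0]
ks.test(pos_scores, neg_scores)$statistic # should be close to the 0.5808 found above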
The ROC Chart shows the model’s ability to distinguish between positive and negative classes. The AUC of 0.85 indicates a strong model performance, as it is close to 1. The curve’s sharp rise near the y-axis shows high sensitivity with a low false positive rate initially, suggesting the model performs well at identifying true positives. Overall, the model is effective at classification. The larger shaded area indicates better performance, with the maximum possible AUC being 1 (perfect classification).
# Load necessary libraries
library(pROC)
# Assuming you have classification data with 'scored.probability' and 'class' variables
# Create the ROC curve object
roc_obj <- roc(classification_data$class, classification_data$scored.probability)
## Setting levels: control = 1, case = 0
## Setting direction: controls > cases
# Plot the ROC curve
plot(roc_obj, main = "ROC Chart")
# Add AUC to the plot
auc_value <- auc(roc_obj)
legend("bottomright", legend = paste("AUC =", round(auc_value, 2)), col = "blue", lty = 1)
# Load necessary libraries
library(pROC)
# Assuming classification_data with 'scored.probability' and 'class'
roc_obj <- roc(classification_data$class, classification_data$scored.probability)
## Setting levels: control = 1, case = 0
## Setting direction: controls > cases
# Plot the ROC curve
plot(roc_obj, main = "ROC Curve with AUC Highlight", col = "blue")
# Highlight the Area Under the Curve (AUC)
auc_value <- auc(roc_obj)
polygon(c(0, roc_obj$specificities, 1), c(0, roc_obj$sensitivities, 0), col = rgb(0, 0, 1, 0.2), border = NA)
# Add AUC value in the legend
legend("bottomright", legend = paste("AUC =", round(auc_value, 2)), col = "blue", lty = 1)