#1 Download the classification output data set (attached in Blackboard to the assignment).
# Load the packages used below: dplyr for glimpse(), caret for confusionMatrix(), pROC for roc()/auc()
library(dplyr)
library(caret)
library(pROC)

data <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/DAta-621-HW2/refs/heads/main/classification-output-data.csv")
glimpse(data)
## Rows: 181
## Columns: 11
## $ pregnant <int> 7, 2, 3, 1, 4, 1, 9, 8, 1, 2, 5, 5, 13, 0, 7, 12, 0…
## $ glucose <int> 124, 122, 107, 91, 83, 100, 89, 120, 79, 123, 88, 1…
## $ diastolic <int> 70, 76, 62, 64, 86, 74, 62, 78, 60, 48, 78, 72, 60,…
## $ skinfold <int> 33, 27, 13, 24, 19, 12, 0, 0, 42, 32, 30, 43, 0, 26…
## $ insulin <int> 215, 200, 48, 0, 0, 46, 0, 0, 48, 165, 0, 75, 0, 50…
## $ bmi <dbl> 25.5, 35.9, 22.9, 29.2, 29.3, 19.5, 22.5, 25.0, 43.…
## $ pedigree <dbl> 0.161, 0.483, 0.678, 0.192, 0.317, 0.149, 0.142, 0.…
## $ age <int> 37, 26, 23, 21, 34, 28, 33, 64, 23, 26, 37, 33, 41,…
## $ class <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ scored.class <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ scored.probability <dbl> 0.32845226, 0.27319044, 0.10966039, 0.05599835, 0.1…
#2 The data set has three key columns we will use:
#   class: the actual class for the observation
#   scored.class: the predicted class for the observation (based on a threshold of 0.5)
#   scored.probability: the predicted probability of success for the observation
# Use the table() function to get the raw confusion matrix for this scored data set. Make sure you understand
# the output. In particular, do the rows represent the actual or predicted class? The columns?
confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
print(confusion_matrix)
## Predicted
## Actual 0 1
## 0 119 5
## 1 30 27
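# Because the table() call labels its first argument Actual and its second Predicted, the rows of the
# matrix are the actual classes and the columns are the predicted classes. As an optional sanity check,
# addmargins() (base R) appends the row and column totals:
addmargins(confusion_matrix)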
#3 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.
calculate_accuracy <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  # Calculate accuracy
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  return(accuracy)
}
accuracy <- calculate_accuracy(data)
print(accuracy)
## [1] 0.8066298
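# Optional sanity check: accuracy is simply the proportion of observations whose predicted class
# matches the actual class, so this base R one-liner should agree with the value above.
mean(data$class == data$scored.class)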
#4 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
calculate_error_rate <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(error_rate)
}
# Combined helper that returns both accuracy and the error rate so we can verify that they sum to 1
calculate_metrics <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(list(accuracy = accuracy, error_rate = error_rate))
}
metrics <- calculate_metrics(data)
print(metrics$accuracy)
## [1] 0.8066298
print(metrics$error_rate)
## [1] 0.1933702
# Verify the sum (all.equal() is used instead of == to avoid fragile floating-point comparisons)
if (isTRUE(all.equal(metrics$accuracy + metrics$error_rate, 1))) {
  print("Accuracy and error rate sum to 1.")
} else {
  print("There is an error in the calculations.")
}
## [1] "Accuracy and error rate sum to 1."
#5 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.
calculate_precision <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2]
  FP <- confusion_matrix[1, 2]
  precision <- TP / (TP + FP)
  return(precision)
}
precision <- calculate_precision(data)
print(precision)
## [1] 0.84375
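# Cross-check against the confusion matrix in #2: with TP = 27 and FP = 5,
# precision = TP / (TP + FP) = 27 / 32 = 0.84375, matching the value above.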
#6 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.
calculate_sensitivity <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2]
  FN <- confusion_matrix[2, 1]
  sensitivity <- TP / (TP + FN)
  return(sensitivity)
}
sensitivity <- calculate_sensitivity(data)
print(sensitivity)
## [1] 0.4736842
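# Cross-check against the confusion matrix: with TP = 27 and FN = 30,
# sensitivity = TP / (TP + FN) = 27 / 57, approximately 0.4737, matching the value above.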
#7 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.
calculate_specificity <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  FP <- confusion_matrix[1, 2] # False Positives
  specificity <- TN / (TN + FP)
  return(specificity)
}
specificity <- calculate_specificity(data)
print(specificity)
## [1] 0.9596774
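# Cross-check against the confusion matrix: with TN = 119 and FP = 5,
# specificity = TN / (TN + FP) = 119 / 124, approximately 0.9597, matching the value above.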
#8 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.
calculate_f1_score <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  # Calculate Precision
  precision <- TP / (TP + FP)
  # Calculate Sensitivity (Recall)
  sensitivity <- TP / (TP + FN)
  # Calculate F1 Score
  f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
  return(f1_score)
}
f1_score <- calculate_f1_score(data)
print(f1_score)
## [1] 0.6067416
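# Cross-check: F1 is the harmonic mean of precision and sensitivity,
# 2 * 0.84375 * 0.4736842 / (0.84375 + 0.4736842), approximately 0.6067, matching the value above.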
#9 What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1.
# (Hint: If 0 < a < 1 and 0 < b < 1 then ab < a.)
# Helper that computes F1 directly from precision and sensitivity values
# (named f1_from_pr so it does not mask the data-based calculate_f1_score() defined above)
f1_from_pr <- function(precision, sensitivity) {
  if (precision + sensitivity == 0) {
    return(0)
  }
  return(2 * precision * sensitivity / (precision + sensitivity))
}
precision_values <- seq(0, 1, by = 0.1)
sensitivity_values <- seq(0, 1, by = 0.1)
f1_scores <- outer(precision_values, sensitivity_values, Vectorize(f1_from_pr))
f1_table <- expand.grid(precision = precision_values, sensitivity = sensitivity_values)
f1_table$F1_score <- as.vector(f1_scores)
print(f1_table)
## precision sensitivity F1_score
## 1 0.0 0.0 0.0000000
## 2 0.1 0.0 0.0000000
## 3 0.2 0.0 0.0000000
## 4 0.3 0.0 0.0000000
## 5 0.4 0.0 0.0000000
## 6 0.5 0.0 0.0000000
## 7 0.6 0.0 0.0000000
## 8 0.7 0.0 0.0000000
## 9 0.8 0.0 0.0000000
## 10 0.9 0.0 0.0000000
## 11 1.0 0.0 0.0000000
## 12 0.0 0.1 0.0000000
## 13 0.1 0.1 0.1000000
## 14 0.2 0.1 0.1333333
## 15 0.3 0.1 0.1500000
## 16 0.4 0.1 0.1600000
## 17 0.5 0.1 0.1666667
## 18 0.6 0.1 0.1714286
## 19 0.7 0.1 0.1750000
## 20 0.8 0.1 0.1777778
## 21 0.9 0.1 0.1800000
## 22 1.0 0.1 0.1818182
## 23 0.0 0.2 0.0000000
## 24 0.1 0.2 0.1333333
## 25 0.2 0.2 0.2000000
## 26 0.3 0.2 0.2400000
## 27 0.4 0.2 0.2666667
## 28 0.5 0.2 0.2857143
## 29 0.6 0.2 0.3000000
## 30 0.7 0.2 0.3111111
## 31 0.8 0.2 0.3200000
## 32 0.9 0.2 0.3272727
## 33 1.0 0.2 0.3333333
## 34 0.0 0.3 0.0000000
## 35 0.1 0.3 0.1500000
## 36 0.2 0.3 0.2400000
## 37 0.3 0.3 0.3000000
## 38 0.4 0.3 0.3428571
## 39 0.5 0.3 0.3750000
## 40 0.6 0.3 0.4000000
## 41 0.7 0.3 0.4200000
## 42 0.8 0.3 0.4363636
## 43 0.9 0.3 0.4500000
## 44 1.0 0.3 0.4615385
## 45 0.0 0.4 0.0000000
## 46 0.1 0.4 0.1600000
## 47 0.2 0.4 0.2666667
## 48 0.3 0.4 0.3428571
## 49 0.4 0.4 0.4000000
## 50 0.5 0.4 0.4444444
## 51 0.6 0.4 0.4800000
## 52 0.7 0.4 0.5090909
## 53 0.8 0.4 0.5333333
## 54 0.9 0.4 0.5538462
## 55 1.0 0.4 0.5714286
## 56 0.0 0.5 0.0000000
## 57 0.1 0.5 0.1666667
## 58 0.2 0.5 0.2857143
## 59 0.3 0.5 0.3750000
## 60 0.4 0.5 0.4444444
## 61 0.5 0.5 0.5000000
## 62 0.6 0.5 0.5454545
## 63 0.7 0.5 0.5833333
## 64 0.8 0.5 0.6153846
## 65 0.9 0.5 0.6428571
## 66 1.0 0.5 0.6666667
## 67 0.0 0.6 0.0000000
## 68 0.1 0.6 0.1714286
## 69 0.2 0.6 0.3000000
## 70 0.3 0.6 0.4000000
## 71 0.4 0.6 0.4800000
## 72 0.5 0.6 0.5454545
## 73 0.6 0.6 0.6000000
## 74 0.7 0.6 0.6461538
## 75 0.8 0.6 0.6857143
## 76 0.9 0.6 0.7200000
## 77 1.0 0.6 0.7500000
## 78 0.0 0.7 0.0000000
## 79 0.1 0.7 0.1750000
## 80 0.2 0.7 0.3111111
## 81 0.3 0.7 0.4200000
## 82 0.4 0.7 0.5090909
## 83 0.5 0.7 0.5833333
## 84 0.6 0.7 0.6461538
## 85 0.7 0.7 0.7000000
## 86 0.8 0.7 0.7466667
## 87 0.9 0.7 0.7875000
## 88 1.0 0.7 0.8235294
## 89 0.0 0.8 0.0000000
## 90 0.1 0.8 0.1777778
## 91 0.2 0.8 0.3200000
## 92 0.3 0.8 0.4363636
## 93 0.4 0.8 0.5333333
## 94 0.5 0.8 0.6153846
## 95 0.6 0.8 0.6857143
## 96 0.7 0.8 0.7466667
## 97 0.8 0.8 0.8000000
## 98 0.9 0.8 0.8470588
## 99 1.0 0.8 0.8888889
## 100 0.0 0.9 0.0000000
## 101 0.1 0.9 0.1800000
## 102 0.2 0.9 0.3272727
## 103 0.3 0.9 0.4500000
## 104 0.4 0.9 0.5538462
## 105 0.5 0.9 0.6428571
## 106 0.6 0.9 0.7200000
## 107 0.7 0.9 0.7875000
## 108 0.8 0.9 0.8470588
## 109 0.9 0.9 0.9000000
## 110 1.0 0.9 0.9473684
## 111 0.0 1.0 0.0000000
## 112 0.1 1.0 0.1818182
## 113 0.2 1.0 0.3333333
## 114 0.3 1.0 0.4615385
## 115 0.4 1.0 0.5714286
## 116 0.5 1.0 0.6666667
## 117 0.6 1.0 0.7500000
## 118 0.7 1.0 0.8235294
## 119 0.8 1.0 0.8888889
## 120 0.9 1.0 0.9473684
## 121 1.0 1.0 1.0000000
min_f1 <- min(f1_table$F1_score)
max_f1 <- max(f1_table$F1_score)
cat("Minimum F1 Score:", min_f1, "\n")
## Minimum F1 Score: 0
cat("Maximum F1 Score:", max_f1, "\n")
## Maximum F1 Score: 1
if (min_f1 >= 0 && max_f1 <= 1) {
  cat("The F1 score is always between 0 and 1.\n")
} else {
  cat("There is an error in the F1 score calculation.\n")
}
## The F1 score is always between 0 and 1.
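# Algebraic argument (following the hint): for 0 < precision < 1 and 0 < sensitivity < 1, the numerator
# 2 * precision * sensitivity is positive, so F1 > 0. For the upper bound, the hint gives
# precision * sensitivity < precision and precision * sensitivity < sensitivity; adding these,
# 2 * precision * sensitivity < precision + sensitivity, so F1 < 1. The boundary values 0 and 1
# appear in the table only because the grid above includes the endpoints 0 and 1 themselves.
# Quick numeric confirmation over the grid already computed:
all(f1_table$F1_score >= 0 & f1_table$F1_score <= 1)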
#10 Write a function that generates an ROC curve from a data set with a true classification column (class in
# our example) and a probability column (scored.probability in our example). Your function should return a
# list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve
# (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
generate_roc_curve <- function(data, actual_col, probability_col) {
  actual <- data[[actual_col]]
  predicted_probabilities <- data[[probability_col]]
  roc_curve <- roc(actual, predicted_probabilities, plot = TRUE,
                   print.auc = TRUE, legacy.axes = TRUE,
                   col = "blue", lwd = 2, main = "ROC Curve")
  auc_value <- auc(roc_curve)
  return(list(roc_plot = roc_curve, auc_value = auc_value))
}
result <- generate_roc_curve(data, actual_col = "class", probability_col = "scored.probability")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cat("AUC value:", result$auc_value, "\n")
## AUC value: 0.8503113
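# The function above leans on pROC. A minimal, hand-rolled sketch of the threshold-sweep approach the
# prompt recommends (thresholds from 0 to 1 in 0.01 steps), using only base R and approximating the AUC
# with the trapezoidal rule; the result should be close to the pROC value above, with small differences
# possible at this grid resolution. The helper name manual_roc is introduced here for illustration.
manual_roc <- function(data, actual_col = "class", probability_col = "scored.probability") {
  actual <- data[[actual_col]]
  probs <- data[[probability_col]]
  thresholds <- seq(0, 1, by = 0.01)
  # Sensitivity (TPR) and false positive rate (1 - specificity) at each threshold
  tpr <- sapply(thresholds, function(t) sum(probs >= t & actual == 1) / sum(actual == 1))
  fpr <- sapply(thresholds, function(t) sum(probs >= t & actual == 0) / sum(actual == 0))
  # Reverse so the points run from (0, 0) to (1, 1) as the threshold decreases
  tpr_r <- rev(tpr)
  fpr_r <- rev(fpr)
  # Trapezoidal approximation of the area under the curve
  auc_manual <- sum(diff(fpr_r) * (head(tpr_r, -1) + tail(tpr_r, -1)) / 2)
  plot(fpr_r, tpr_r, type = "l", col = "blue", lwd = 2,
       xlab = "False Positive Rate", ylab = "True Positive Rate", main = "Manual ROC Curve")
  abline(0, 1, lty = 2)
  return(list(tpr = tpr_r, fpr = fpr_r, auc = auc_manual))
}
manual_result <- manual_roc(data)
cat("Manual AUC approximation:", manual_result$auc, "\n")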
#11 Use the functions created above to produce all of the classification metrics for the data set.
# Print the results
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8066298
cat("Error Rate:", metrics$error_rate, "\n")
## Error Rate: 0.1933702
cat("Precision:", precision, "\n")
## Precision: 0.84375
cat("Sensitivity (Recall):", sensitivity, "\n")
## Sensitivity (Recall): 0.4736842
cat("Specificity:", specificity, "\n")
## Specificity: 0.9596774
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.6067416
cat("AUC:",result$auc_value, "\n")
## AUC: 0.8503113
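# Optional sketch: collect the metrics computed above into one summary data frame for easier reading.
metrics_summary <- data.frame(
  Metric = c("Accuracy", "Error Rate", "Precision", "Sensitivity (Recall)",
             "Specificity", "F1 Score", "AUC"),
  Value = c(accuracy, metrics$error_rate, precision, sensitivity,
            specificity, f1_score, as.numeric(result$auc_value))
)
print(metrics_summary)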
#12 Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?
cm <- confusionMatrix(factor(data$scored.class), factor(data$class))
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
accuracy_caret <- cm$overall['Accuracy']
sensitivity_caret <- cm$byClass["Sensitivity"]
specificity_caret <- cm$byClass["Specificity"]
cat("Caret Accuracy:", accuracy_caret, "\n")
## Caret Accuracy: 0.8066298
cat("Caret Sensitivity (Recall):", sensitivity_caret, "\n")
## Caret Sensitivity (Recall): 0.9596774
cat("Caret Specificity:", specificity_caret, "\n")
## Caret Specificity: 0.4736842
cat("Custom Accuracy:", accuracy, "\n")
## Custom Accuracy: 0.8066298
cat("Custom Sensitivity (Recall):", sensitivity, "\n")
## Custom Sensitivity (Recall): 0.4736842
cat("Custom Specificity:", specificity, "\n")
## Custom Specificity: 0.9596774
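# The caret sensitivity and specificity look swapped relative to the custom functions because
# confusionMatrix() treats the first factor level ("0") as the positive class by default, while the
# custom functions treat class 1 as positive. Passing the documented positive argument should bring
# the two sets of numbers into agreement:
cm_pos1 <- confusionMatrix(factor(data$scored.class), factor(data$class), positive = "1")
cm_pos1$byClass[c("Sensitivity", "Specificity")]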
#13 Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
roc_result <- roc(data$class, data$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_result, main="ROC Curve", col="blue", lwd=2)
auc_value <- auc(roc_result)
text(0.5, 0.5, paste("AUC =", round(auc_value, 2)), adj=c(0.5, 0.5))
cat("AUC from pROC:", auc_value, "\n")
## AUC from pROC: 0.8503113
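# Comparison: the AUC reported here by pROC (0.8503) is identical to the value returned by the
# generate_roc_curve() function in #10, since that function wraps pROC's roc()/auc(), and it should
# also be close to any manual threshold-sweep approximation of the curve.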