This visualization is intended for probabilistic classification models, such as logistic regression.
# Read the scored observations, then recode the true class as a factor with
# levels 0 and 1.
data <- read.csv("SampleData.csv", header = TRUE)
data <- mutate(data, truth = factor(truth, levels = c(0, 1)))
print("The first column indicates the true class, while the second column indicates the probability score assigned by the model")
## [1] "The first column indicates the true class, while the second column indicates the probability score assigned by the model"
# Preview the first six rows of the prepared data.
head(data, n = 6L)
## truth pred_prob
## 1 1 0.9887905
## 2 1 0.9999989
## 3 1 0.7725319
## 4 1 0.5550344
## 5 1 0.9999394
## 6 1 0.8846755
# ---- Plot configuration ----
labels <- c("Positive", "Negative") # the labels in the legend
colors <- c("#233f7d", "grey") # legend colors, passed to the manual scales
breaks <- c(1, 0) # should match the encoding used in your data
my_cutoff <- 0.5 # the score cutoff that classifies an observation
cutoff_label_height <- 7 # manually adjust to fit into the graph

# ---- Confusion-matrix counts at the cutoff ----
# Class totals: every row carrying the given true class.
num_negative <- with(data, sum(truth == 0, na.rm = TRUE))
num_positive <- with(data, sum(truth == 1, na.rm = TRUE))

# A score strictly above the cutoff is classified positive; anything at or
# below it is classified negative. Each cell is counted directly instead of
# being derived by subtraction from the class totals, so a row whose score is
# missing is left out of all four cells rather than being silently counted as
# "classified negative".
true_positive <- with(
  data, sum(pred_prob > my_cutoff & truth == 1, na.rm = TRUE)
)
false_positive <- with(
  data, sum(pred_prob > my_cutoff & truth == 0, na.rm = TRUE)
)
true_negative <- with(
  data, sum(pred_prob <= my_cutoff & truth == 0, na.rm = TRUE)
)
false_negative <- with(
  data, sum(pred_prob <= my_cutoff & truth == 1, na.rm = TRUE)
)

# ---- Annotation text shown on either side of the cutoff line ----
label_right <- stringr::str_interp("Classified Positive\n True Positives: ${true_positive}\n False Positives: ${false_positive}")
label_left <- stringr::str_interp("Classified Negative\n True Negatives: ${true_negative}\n False Negatives: ${false_negative} ")

# x is where each label starts: slightly right of the cutoff for the
# "positive" label, 0.3 to the left of it for the "negative" label.
label_df <- data.frame(
  label = c(label_right, label_left),
  x = c(my_cutoff + 0.05, my_cutoff - 0.3)
)
# Density of the assigned scores, split by true class, with the decision
# cutoff drawn as a dotted vertical line and the classification counts from
# label_df written next to it.
ggplot(data) +
  # Layers, in drawing order: densities, cutoff line, count annotations.
  geom_density(
    mapping = aes(x = pred_prob, colour = truth, fill = truth),
    alpha = 0.5
  ) +
  geom_vline(xintercept = my_cutoff, linetype = "dotted", alpha = 0.5) +
  geom_text(
    mapping = aes(x = x, y = cutoff_label_height, label = label),
    data = label_df,
    hjust = 0, # left-align the text at its x position
    colour = "grey"
  ) +
  # Outline and fill share the same name, breaks, labels and colors.
  scale_colour_manual(
    name = "True Class",
    breaks = breaks,
    labels = labels,
    values = colors
  ) +
  scale_fill_manual(
    name = "True Class",
    breaks = breaks,
    labels = labels,
    values = colors
  ) +
  # Show the x range 0 to 1 with no padding around the panel.
  coord_cartesian(xlim = c(0, 1), expand = FALSE) +
  labs(
    title = "Distribution of the Scores",
    subtitle = "Assigned by <model_name>",
    x = "Score Assigned",
    y = "Relative Density"
  ) +
  theme_minimal() +
  # Drop minor grid lines, vertical major grid lines, and y-axis tick labels.
  theme(
    axis.text.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )