class_df <- read.csv("https://raw.githubusercontent.com/irene908/DATA621/main/classification-output-data.csv")
head(class_df)
# Keep only the observed class, the predicted class, and the predicted probability
class_scored <- class_df[, c('class', 'scored.class', 'scored.probability')]
table(class_scored$class, class_scored$scored.class)
##    
##       0   1
##   0 119   5
##   1  30  27
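In the table above the rows are the observed class and the columns are the predicted (scored) class, so the four cells map directly onto the counts used by the metric functions below. A minimal sketch of that mapping (conf_tab and the count names are illustrative):
# Rows = observed class, columns = predicted class
conf_tab <- table(class_scored$class, class_scored$scored.class)
tn <- conf_tab["0", "0"]   # 119 true negatives
fp <- conf_tab["0", "1"]   #   5 false positives
fn <- conf_tab["1", "0"]   #  30 false negatives
tp <- conf_tab["1", "1"]   #  27 true positives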
accuracyfn <- function(df){
  # Accuracy = (TP + TN) / total number of observations
  s <- nrow(df)
  tn <- sum(df$class == 0 & df$scored.class == 0)
  tp <- sum(df$class == 1 & df$scored.class == 1)
  return((tn + tp) / s)
}
print(accuracyfn(class_scored))
## [1] 0.8066298
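As a quick sanity check (not part of the metric functions themselves), accuracy is simply the proportion of rows where the observed and scored classes agree, so the one-liner below should return the same value:
mean(class_scored$class == class_scored$scored.class)   # should also be ~0.8066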
errorfn <- function(df){
  # Classification error rate = (FP + FN) / total number of observations
  s <- nrow(df)
  fn <- sum(df$class == 1 & df$scored.class == 0)
  fp <- sum(df$class == 0 & df$scored.class == 1)
  return((fn + fp) / s)
}
print(errorfn(class_scored))
## [1] 0.1933702
accuracyfn(class_scored)+errorfn(class_scored)
## [1] 1
precisionfn <- function(df){
  # Precision = TP / (TP + FP)
  fp <- sum(df$class == 0 & df$scored.class == 1)
  tp <- sum(df$class == 1 & df$scored.class == 1)
  return(tp / (tp + fp))
}
print(precisionfn(class_scored))
## [1] 0.84375
sensitivityfn <- function(df){
  # Sensitivity (recall) = TP / (TP + FN)
  fn <- sum(df$class == 1 & df$scored.class == 0)
  tp <- sum(df$class == 1 & df$scored.class == 1)
  return(tp / (tp + fn))
}
print(sensitivityfn(class_scored))
## [1] 0.4736842
specificityfn <- function(df){
  # Specificity = TN / (TN + FP)
  tn <- sum(df$class == 0 & df$scored.class == 0)
  fp <- sum(df$class == 0 & df$scored.class == 1)
  return(tn / (tn + fp))
}
print(specificityfn(class_scored))
## [1] 0.9596774
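These three values can also be read straight off the confusion matrix above: precision = 27 / (27 + 5), sensitivity = 27 / (27 + 30), and specificity = 119 / (119 + 5). A quick arithmetic check:
c(precision = 27 / (27 + 5), sensitivity = 27 / (27 + 30), specificity = 119 / (119 + 5))
# 0.84375, 0.4736842, 0.9596774 -- matching the function outputs above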
f1scorefn <- function(df){
  # F1 score = harmonic mean of precision and sensitivity
  precision <- precisionfn(df)
  sensitivity <- sensitivityfn(df)
  return((2 * precision * sensitivity) / (precision + sensitivity))
}
print(f1scorefn(class_scored))
## [1] 0.6067416
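The same value can be recovered directly from the precision and sensitivity reported above, since F1 is their harmonic mean:
(2 * 0.84375 * 0.4736842) / (0.84375 + 0.4736842)   # ~0.6067, matching f1scorefn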
The F1 score is defined as:
\(F1 = \frac{2 \cdot precision \cdot sensitivity}{precision + sensitivity}\)
Precision is bounded by \(0 < precision < 1\) and sensitivity is bounded by \(0 < sensitivity < 1\), so the product \(precision \cdot sensitivity\) also lies between 0 and 1.
Moreover, because \(sensitivity < 1\) the product is strictly smaller than precision, and because \(precision < 1\) it is strictly smaller than sensitivity. Splitting the F1 expression in two and applying these bounds gives:
\(\frac{2 \cdot precision \cdot sensitivity}{precision + sensitivity} = \frac{precision \cdot sensitivity}{precision + sensitivity} + \frac{precision \cdot sensitivity}{precision + sensitivity} < \frac{precision}{precision + sensitivity} + \frac{sensitivity}{precision + sensitivity} = \frac{precision + sensitivity}{precision + sensitivity} = 1\)
Since precision and sensitivity are both positive, F1 is positive as well. Therefore \(0 < F1 < 1\): the F1 score always lies between 0 and 1.
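The bound can also be checked numerically by evaluating the F1 formula over a grid of precision and sensitivity values strictly inside (0, 1); a minimal sketch of that check (the grid resolution is arbitrary):
grid <- expand.grid(p = seq(0.01, 0.99, 0.01), s = seq(0.01, 0.99, 0.01))
f1 <- with(grid, (2 * p * s) / (p + s))
range(f1)   # both endpoints fall strictly inside (0, 1)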
# rocfn relies on dplyr (if_else, arrange, %>%) and ggplot2
library(dplyr)
library(ggplot2)
rocfn <- function(df){
  sens <- c()
  speci <- c()
  # sweep the classification threshold from 0 to 1 in steps of 0.01
  for (t in seq(0, 1, 0.01)) {
    # re-classify every observation at the current threshold
    x <- data.frame(class = df[, 1],
                    scored.class = if_else(df[, 3] >= t, 1, 0),
                    scored.probability = df[, 3])
    # store sensitivity and 1 - specificity (false positive rate) for this threshold
    sens <- c(sens, sensitivityfn(x))
    speci <- c(speci, 1 - specificityfn(x))
  }
  df_roc <- data.frame(sens, speci) %>% arrange(speci)
  # AUC via the trapezoidal rule: left rectangles plus triangle corrections
  speci_df <- c(diff(df_roc$speci), 0)
  sens_df <- c(diff(df_roc$sens), 0)
  auc <- round(sum(df_roc$sens * speci_df) + sum(sens_df * speci_df) / 2, 3)
  # Plot the ROC curve; the x axis is the false positive rate (1 - specificity)
  ggplot(df_roc) +
    geom_line(aes(speci, sens)) +
    ggtitle("ROC curve - Manual") +
    xlab("1 - Specificity") +
    ylab("Sensitivity") +
    annotate(geom = "text", x = 0.7, y = 0.07, label = paste("auc:", auc)) +
    geom_abline(intercept = 0, slope = 1)
}
rocfn(class_scored)
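As a cross-check on the annotated AUC, the AUC can also be computed from its probabilistic interpretation: the probability that a randomly chosen positive case is scored higher than a randomly chosen negative case, counting ties as one half. This sketch (pos and neg are illustrative names) should give a value close to the threshold-based estimate above:
pos <- class_scored$scored.probability[class_scored$class == 1]
neg <- class_scored$scored.probability[class_scored$class == 0]
mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "=="))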
class_metrics <- c(accuracyfn(class_scored), errorfn(class_scored), f1scorefn(class_scored), precisionfn(class_scored), sensitivityfn(class_scored), specificityfn(class_scored))
names(class_metrics) <- c("Accuracy", "Error Rate", "F1 Score", "Precision", "Sensitivity", "Specificity")
print(class_metrics)
##    Accuracy  Error Rate    F1 Score   Precision Sensitivity Specificity 
##   0.8066298   0.1933702   0.6067416   0.8437500   0.4736842   0.9596774
library("caret")
## Loading required package: lattice
sensitivity(as.factor(class_scored$scored.class), as.factor(class_scored$class), positive='1')
## [1] 0.4736842
specificity(as.factor(class_scored$scored.class), as.factor(class_scored$class), negative='0')
## [1] 0.9596774
confusionMatrix(as.factor(class_scored$scored.class), as.factor(class_scored$class), positive='1')
## Confusion Matrix and Statistics
##
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
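The caret object also exposes the individual statistics, which allows a programmatic comparison with the manual functions (this assumes a caret version that supports mode = "everything"; cm is an illustrative name):
cm <- confusionMatrix(as.factor(class_scored$scored.class), as.factor(class_scored$class),
                      positive = '1', mode = "everything")
cm$byClass[c("Precision", "Recall", "F1")]   # should match precisionfn, sensitivityfn, f1scorefn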
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
plot(roc(class_scored$class, class_scored$scored.probability), print.auc = TRUE, main = 'ROC Curve - pROC Package')
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Re-plot the manual ROC curve for comparison with the pROC plot above
rocfn(class_scored)
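For a numeric comparison with the manually annotated AUC, pROC's auc() can be applied to the same ROC object (roc_obj is an illustrative name):
roc_obj <- roc(class_scored$class, class_scored$scored.probability)
auc(roc_obj)   # should be close to the manual estimate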