df <- read.csv("classification-output-data.csv")
head(df)
class <- df$class
prd_cls <- df$scored.class
table(prd_cls, class)
## class
## prd_cls 0 1
## 0 119 30
## 1 5 27
There are 30 + 27 = 57 observations whose actual class is 1; 27 of them are correctly predicted as 1 while 30 are incorrectly predicted as 0. Similarly, 119 + 5 = 124 observations have actual class 0; 119 of them are correctly predicted as 0 and 5 are incorrectly predicted as 1.
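Because the orientation of the table matters when reading off TP, TN, FP, and FN, the same table can optionally be rebuilt with named margins (a purely cosmetic variant, output omitted):

table(Predicted = prd_cls, Actual = class) # rows = predicted class, columns = actual class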
df_class <- df[, c("class", "scored.class")]
head(df_class)
\[Accuracy = \frac{TP+TN}{TP+FP+TN+FN}\] Assuming 0 as positive and 1 as negative, we can write our function as follows:
pred_acc <- function(df){
  ## Calculates the accuracy when the input dataframe contains only the original
  ## class and the predicted class. The first column must be the original class
  ## and the second column must be the predicted class.
  tab = table(df[,1], df[,2])  # Contingency table of actual vs. predicted classes
  TP = tab[1, 1]               # True positives
  TN = tab[2, 2]               # True negatives
  FP = tab[2, 1]               # False positives
  FN = tab[1, 2]               # False negatives
  acc = (TP+TN)/(TP+TN+FP+FN)  # Accuracy = (TP+TN)/(TP+TN+FP+FN)
  return(acc)                  # Return the value
}
pred_acc(df_class)
## [1] 0.8066298
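This matches a hand calculation from the confusion matrix above; with 0 as the positive class, TP = 119, TN = 27, FP = 30, and FN = 5, so

\[Accuracy = \frac{119+27}{119+30+5+27} = \frac{146}{181} \approx 0.8066\]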
The accuracy calculated by the function is 80.66%.

## 4. Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
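The classification error rate is the complement of the accuracy:

\[Classification\ Error\ Rate = \frac{FP+FN}{TP+FP+TN+FN}\]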
classErrorRate <- function(df){
  ## Calculates the classification error rate when the input dataframe contains only
  ## the original class and the predicted class. The first column must be the
  ## original class and the second column must be the predicted class.
  tab = table(df[,1], df[,2])  # Contingency table of actual vs. predicted classes
  TP = tab[1, 1]               # True positives
  TN = tab[2, 2]               # True negatives
  FP = tab[2, 1]               # False positives
  FN = tab[1, 2]               # False negatives
  err = (FP+FN)/(TP+TN+FP+FN)  # Error rate = (FP+FN)/(TP+TN+FP+FN)
  return(err)                  # Return the error rate
}
classErrorRate(df_class)
## [1] 0.1933702
Here the classification error rate is 0.1934, or 19.34%.
The sum of accuracy and the error rate:
pred_acc(df_class)+classErrorRate(df_class)
## [1] 1
The sum of the error rate and the accuracy is 1, which verifies that the two functions are consistent.
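The precision, again treating 0 as the positive class, is defined as

\[Precision = \frac{TP}{TP+FP}\]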
precision <- function(df){
  ## Calculates the precision when the dataframe df has only two columns, the
  ## original class and the predicted class. The first column must be the original
  ## class and the second column must be the predicted class.
  tab = table(df[,1], df[,2])  # Contingency table of actual vs. predicted classes
  TP = tab[1, 1]               # True positives
  FP = tab[2, 1]               # False positives
  prec = (TP)/(TP+FP)          # Precision = TP/(TP+FP)
  return(prec)                 # Return the value
}
precision(df_class)
## [1] 0.7986577
Here the precision is 79.87%.
\[sensitivity = \frac{TP}{TP+FN}\]
sensitivity <- function(df){
  ## Calculates the sensitivity when the argument is a dataframe with the original
  ## class in the first column and the predicted class in the second column.
  ## Returns NULL invisibly when only one predicted class is present, since the
  ## contingency table would then not be 2 x 2.
  if (length(unique(df[,2])) != 1){
    tab = table(df[,1], df[,2])  # Contingency table of actual vs. predicted classes
    TP = tab[1, 1]               # True positives
    FN = tab[1, 2]               # False negatives
    sens = (TP)/(TP+FN)          # Sensitivity = TP/(TP+FN)
    return(sens)                 # Return the value
  } else {
    return(invisible(NULL))
  }
}
sensitivity(df_class)
## [1] 0.9596774
The calculated sensitivity is 0.9597, i.e. 95.97%.
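The specificity is defined as

\[Specificity = \frac{TN}{TN+FP}\]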
specificity <- function(df){
  ## Calculates the specificity when the argument is a dataframe with the original
  ## class in the first column and the predicted class in the second column.
  ## Returns NULL invisibly when only one predicted class is present, since the
  ## contingency table would then not be 2 x 2.
  if (length(unique(df[,2])) != 1){
    tab = table(df[,1], df[,2])  # Contingency table of actual vs. predicted classes
    TN = tab[2, 2]               # True negatives
    FP = tab[2, 1]               # False positives
    spec = (TN)/(FP+TN)          # Specificity = TN/(FP+TN)
    return(spec)                 # Return the value
  } else {
    return(invisible(NULL))
  }
}
specificity(df_class)
## [1] 0.4736842
Here the specificity is 0.4737, i.e. 47.37%.
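The F1 score is the harmonic mean of precision and sensitivity:

\[F_1\ Score = \frac{2 \times Precision \times Sensitivity}{Precision + Sensitivity}\]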
F1Score <- function(df){
  ## Calculates the F1 score when the dataframe df has only two columns, the
  ## original class and the predicted class. The first column must be the original
  ## class and the second column must be the predicted class.
  tab = table(df[,1], df[,2])          # Contingency table of actual vs. predicted classes
  TP = tab[1, 1]                       # True positives
  FP = tab[2, 1]                       # False positives
  FN = tab[1, 2]                       # False negatives
  sens = (TP)/(TP+FN)                  # Sensitivity = TP/(TP+FN)
  prec = (TP)/(TP+FP)                  # Precision = TP/(TP+FP)
  f1score = (2*prec*sens)/(prec+sens)  # F1 score from precision and sensitivity
  return(f1score)                      # Return the value
}
Applying the F1 score function to the data:
F1Score(df_class)
## [1] 0.8717949
Here the F1 score is 0.8718.
set.seed(100)
sens <- runif(100, 0, 1)
prec <- runif (100, 0, 1)
f1score <- (2*prec*sens)/(prec+sens)
plot(f1score, main = "Plot of F1 Scores for 0<sensitivity<1 and 0<precision<1",
ylab = "F1 Scores", pch = 16, col ='darkgreen', type= 'p')
The simulated F1 scores lie between 0 and 1 for every combination of sensitivity and precision. The range of the simulated F1 scores confirms this:
range(f1score)
## [1] 0.02261333 0.94792443
Hence the F1 score always lies between 0 and 1.
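The same bound can be argued algebraically (a short aside, not part of the original simulation): for any precision \(p\) and sensitivity \(s\) with \(0 < p, s \le 1\),

\[0 < F_1 = \frac{2ps}{p+s} \le 1, \qquad \text{since} \quad p + s - 2ps = p(1-s) + s(1-p) \ge 0.\]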
roc_curve <- function(df, reference_class, probability_scores){
  library(ggplot2)
  ## Returns the ROC plot. The arguments are the dataframe df, the name of the
  ## column holding the actual classes (reference_class), and the name of the
  ## column holding the probability scores (probability_scores).
  min_threshold <- min(df[probability_scores])  # Minimum threshold from the probabilities
  threshold = seq(min_threshold, 1, (1-min_threshold)/(dim(df)[1]-1))  # Threshold grid
  sens = c()  # Sensitivity at each threshold
  spec = c()  # Specificity at each threshold
  for (i in 1:length(threshold)){
    pred_scores = ifelse(df[probability_scores] < threshold[i], 0, 1)  # Predicted classes
    df_new = data.frame(df[reference_class], pred_scores)  # Actual and predicted classes
    if (!is.null(sensitivity(df_new))){
      sens[i] = sensitivity(df_new)  # Append sensitivity
      spec[i] = specificity(df_new)  # Append specificity
    }
  }
  df_plot <- data.frame(sens, spec)  # Dataframe for ggplot
  df_plot <- na.omit(df_plot)        # Drop rows containing NA values
  g <- ggplot(df_plot, aes(x = spec, y = sens)) +
    geom_point() +  # Scatterplot of (specificity, sensitivity) pairs
    labs(
      title = "ROC plot",
      x = "Specificity",
      y = "Sensitivity"
    )
  return(g)
}
roc_curve(df, 'class','scored.probability')
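Since the comparison with pROC below also touches on the area under the curve, the following is a minimal sketch of how the AUC could be approximated with the trapezoidal rule using the same thresholding idea; the helper roc_auc and its fixed threshold grid are illustrative assumptions, not part of the original functions, and class 1 is treated here as the positive case to match pROC's default (output not shown):

roc_auc <- function(df, reference_class, probability_scores){
  ## Hypothetical helper (illustration only): approximate the AUC with the trapezoidal rule
  thresholds <- seq(0, 1, by = 0.01)  # Fixed grid of cut-off values (assumption)
  sens <- spec <- numeric(length(thresholds))
  for (i in seq_along(thresholds)){
    pred <- ifelse(df[[probability_scores]] >= thresholds[i], 1, 0)  # Classify at this cut-off
    tab <- table(factor(df[[reference_class]], levels = c(0, 1)),
                 factor(pred, levels = c(0, 1)))  # Rows = actual, columns = predicted
    sens[i] <- tab[2, 2] / sum(tab[2, ])  # True positive rate for class 1
    spec[i] <- tab[1, 1] / sum(tab[1, ])  # True negative rate for class 1
  }
  fpr <- 1 - spec
  ord <- order(fpr)  # Sort the points by false positive rate
  sum(diff(fpr[ord]) * (head(sens[ord], -1) + tail(sens[ord], -1)) / 2)  # Trapezoidal area
}
roc_auc(df, "class", "scored.probability")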
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked _by_ '.GlobalEnv':
##
## precision, sensitivity, specificity
tab <- table(df$scored.class, df$class)
confusionMatrix(tab, positive = '0')
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
caret::sensitivity(tab, positive = '0')
## [1] 0.9596774
caret::specificity(tab, positive='0')
## [1] 0.4736842
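As a quick side-by-side check (a small extra snippet, assuming the objects defined above are still in the workspace), the values from our own functions can be placed next to caret's (output omitted):

data.frame(
  metric = c("sensitivity", "specificity"),
  own    = c(sensitivity(df_class), specificity(df_class)),
  caret  = c(caret::sensitivity(tab, positive = '0'),
             caret::specificity(tab, positive = '0'))
)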
## 13. Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_obj <- roc(data = df, response = "class", predictor = "scored.probability")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ggroc(roc_obj)
The curves obtained from our roc_curve function and from the roc function in the pROC package have the same shape; the only differences are that pROC draws the specificity axis in the reverse direction and renders a line, whereas our function produces a scatter of points. The AUC appears to be almost the same in both cases.
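To quantify the AUC comparison above, pROC's auc() function can be applied to the fitted roc object (output not shown here):

auc(roc_obj)  # Area under the ROC curve reported by pROC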