preds = read.csv('classification-output-data.csv', sep=',', header = TRUE)
attach(preds)
table(class,scored.class)
##      scored.class
## class   0   1
##     0 119   5
##     1  30  27

The confusion matrix shows a sensitivity (true positive rate) of 27/57 = 0.47 and a specificity of 119/124 = 0.96, which gives a false positive rate of 1 - specificity = 0.04. Because the actual class was passed as the first argument to table(), the rows of the table correspond to the actual class and the columns to the predicted class.
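
As a quick check, these numbers can be read directly from the table above (this snippet relies on the columns made available by attach(preds)):

cm = table(class, scored.class)
cm[2,2] / (cm[2,2] + cm[2,1])   # sensitivity: TP / (TP + FN) = 27/57
cm[1,1] / (cm[1,1] + cm[1,2])   # specificity: TN / (TN + FP) = 119/124
cm[1,2] / (cm[1,1] + cm[1,2])   # false positive rate = 1 - specificity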

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.

Accuracy = (TP + TN) / (TP + FP + TN + FN)

This metric represents overall accuracy and measures the overall agreement between the model's predictions and the observed classes in the test data set.

prediction_accuracy <- function(df){
        # 2x2 table of actual class (rows) vs predicted class (columns)
        names    = c("class", "scored.class")
        columns  = df[colnames(df) %in% names]
        t        = table(columns)
        # Accuracy = (TP + TN) / (TP + FP + TN + FN)
        accuracy = (t[2,2] + t[1,1]) / (t[2,2] + t[1,2] + t[1,1] + t[2,1])
        accuracy = round(accuracy, 2)
        paste0("The prediction accuracy is: ", accuracy)
}

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
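
The classification error rate is the complement of accuracy:

Classification Error Rate = (FP + FN) / (TP + FP + TN + FN)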

The classification accuracy is 0.81 and the classification error rate is 0.19; as expected, they sum to 1.

classification_error_rate <- function(df){
        names   = c("class", "scored.class")
        columns = df[colnames(df) %in% names]
        t       = table(columns)
        # Error rate = (FP + FN) / (TP + FP + TN + FN)
        classification_error_rate = (t[1,2] + t[2,1]) / (t[2,2] + t[1,2] + t[1,1] + t[2,1])
        classification_error_rate = round(classification_error_rate, 2)
        paste0('The classification error rate is: ', classification_error_rate)
}

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.
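
Precision is the share of predicted positives that are actually positive:

Precision = TP / (TP + FP)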

precision <- function(df){
        names   = c("class", "scored.class")
        columns = df[colnames(df) %in% names]
        t       = table(columns)
        # Precision = TP / (TP + FP)
        p       = t[2,2] / (t[2,2] + t[1,2])
        p       = round(p, 2)
        paste0('The precision is: ', p)
}

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.
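
Sensitivity (recall) is the share of actual positives that the model correctly identifies:

Sensitivity = TP / (TP + FN)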

sensitivity <- function(df){
        names   = c("class", "scored.class")
        columns = df[colnames(df) %in% names]
        t       = table(columns)
        # Sensitivity (recall) = TP / (TP + FN)
        s       = t[2,2] / (t[2,2] + t[2,1])
        s       = round(s, 2)
        paste0('The sensitivity is: ', s)
}

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.
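
Specificity is the share of actual negatives that the model correctly identifies:

Specificity = TN / (TN + FP)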

specificity <- function(df){
        names   = c("class", "scored.class")
        columns = df[colnames(df) %in% names]
        t       = table(columns)
        # Specificity = TN / (TN + FP)
        s       = t[1,1] / (t[1,1] + t[1,2])
        s       = round(s, 2)
        paste0('The specificity is: ', s)
}

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.

F1 Score = (2 × Precision × Sensitivity) / (Precision + Sensitivity)

f1.score <- function(df){
        names   = c("class", "scored.class")
        columns = df[colnames(df) %in% names]
        t  = table(columns)
        p  = t[2,2] / (t[2,2] + t[1,2])   # precision
        s  = t[2,2] / (t[2,2] + t[2,1])   # sensitivity
        # F1 = (2 × precision × sensitivity) / (precision + sensitivity)
        f1 = (2*p*s) / (p + s)
        f1 = round(f1, 2)
        paste0("The F1 score is: ", f1)
}

Before we move on, let’s consider a question that was asked: What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1.

F1 will always fall between 0 and 1. Precision and sensitivity are each a count divided by a larger (or equal) count that contains it, so each lies between 0 and 1. Because the product of precision and sensitivity is no larger than either factor, the numerator 2 × Precision × Sensitivity = (Precision × Sensitivity) + (Precision × Sensitivity) can never exceed the denominator Precision + Sensitivity, so the ratio is at most 1. Both numerator and denominator are non-negative, so the ratio is at least 0. The upper bound of 1 is attained exactly when precision and sensitivity are both equal to 1.
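
As a quick numerical sanity check (a sketch, not a proof), we can evaluate the F1 formula over a grid of precision and sensitivity values in (0, 1] and confirm the result never leaves those bounds:

grid = expand.grid(p = seq(0.01, 1, by = 0.01), s = seq(0.01, 1, by = 0.01))
f1   = with(grid, (2 * p * s) / (p + s))
range(f1)   # both endpoints fall inside (0, 1]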

Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.

get.rocPlot.auc = function(df){
        library(ggplot2)
        # Sweep classification thresholds from 0 to 1 in steps of 0.01
        threshold.grid = seq(0, 1, by = 0.01)
        predicted.classes = data.frame(row.names = 1:nrow(df))
        x = numeric(length(threshold.grid))   # false positive rate at each threshold
        y = numeric(length(threshold.grid))   # true positive rate at each threshold

        # Classify every observation at every threshold
        for (i in threshold.grid) {
                yhat = as.numeric(df$scored.probability > i)
                predicted.classes = cbind(predicted.classes, yhat)
        }

        # Compute sensitivity and specificity at each threshold; the explicit factor
        # levels keep the table 2x2 even when a class is never predicted
        for (j in 1:length(threshold.grid)){
                class.factor = factor(df$class, levels = c(0,1))
                predicted.class.factor = factor(predicted.classes[,j], levels = c(0,1))
                t = table(class.factor, predicted.class.factor)
                sensitivity = t[2,2] / (t[2,2] + t[2,1])
                specificity = t[1,1] / (t[1,1] + t[1,2])
                y[j] = sensitivity
                x[j] = 1 - specificity
        }

        roc.data = data.frame(fpr = x, tpr = y)
        roc.plot = ggplot(roc.data, aes(x = fpr, y = tpr)) + geom_step()
        roc.plot = roc.plot + geom_abline(slope = 1, intercept = 0, colour = "red", lty = 2)

        # Rank-based AUC: 1 minus the fraction of discordant (positive, negative) pairs
        my_auc <- function(outcome, proba){
                N = length(proba)
                N_pos = sum(outcome)
                df = data.frame(out = outcome, prob = proba)
                df = df[order(-df$prob),]
                df$above = (1:N) - cumsum(df$out)   # negatives ranked above each observation
                return( 1 - sum( df$above * df$out ) / (N_pos * (N - N_pos)) )
        }

        auc1 = my_auc(df$class, df$scored.probability)
        results = list("Plot" = roc.plot, "Area under curve" = auc1)
        results
}
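
As an independent cross-check on the rank-based my_auc calculation, here is a minimal sketch (the trapezoid_auc helper is my own addition, not part of the assignment) that re-derives the ROC points from the same threshold sweep and integrates them with the trapezoidal rule; its result should land very close to the AUC reported below:

trapezoid_auc = function(df, thresholds = seq(0, 1, by = 0.01)){
        # True and false positive rates at each threshold
        tpr = sapply(thresholds, function(th) mean(df$scored.probability[df$class == 1] > th))
        fpr = sapply(thresholds, function(th) mean(df$scored.probability[df$class == 0] > th))
        ord = order(fpr)                      # order points left to right along the FPR axis
        fpr = fpr[ord]; tpr = tpr[ord]
        # Trapezoidal rule: sum of width times average height of adjacent points
        sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2)
}
trapezoid_auc(preds)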

Use your created R functions and the provided classification output data set to produce all of the classification metrics discussed above.

prediction_accuracy(preds)
## [1] "The prediction accuracy is: 0.81"
classification_error_rate(preds)
## [1] "The classification error rate is: 0.19"
precision(preds)
## [1] "The precision is: 0.84"
sensitivity(preds)
## [1] "The sensitiity is: 0.47"
specificity(preds)
## [1] "The specificity is: 0.96"
f1.score(preds)
## [1] "The F1 score is: 0.61"
get.rocPlot.auc(preds)
## $Plot

## 
## $`Area under curve`
## [1] 0.8503113

Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?

library(caret)
p.factor      = factor(preds$scored.class,levels = c(1,0))
actual.factor = factor(preds$class, levels = c(1,0))
confusionMatrix(p.factor, actual.factor)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   0
##          1  27   5
##          0  30 119
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 

The outputs from the confusionMatrix match up with mine. The outputs are below.

Accuracy    : 0.8066
Sensitivity : 0.4737
Specificity : 0.9597
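
The question also asks about caret's sensitivity and specificity functions. Because my own sensitivity and specificity functions defined above mask the caret versions, the caret:: prefix is needed; a quick check using the factors created above (the values should match the confusionMatrix output):

caret::sensitivity(p.factor, actual.factor)   # positive class "1"; should be 0.4737
caret::specificity(p.factor, actual.factor)   # negative class "0"; should be 0.9597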

library(pROC)
plot.roc(class,scored.probability)

## 
## Call:
## plot.roc.default(x = class, predictor = scored.probability)
## 
## Data: scored.probability in 124 controls (class 0) < 57 cases (class 1).
## Area under the curve: 0.8503

The pROC curve looks very similar to the one produced by my function, and the AUC (0.8503) matches my result.