library("caret")
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
my_data <- read.csv(file="https://raw.githubusercontent.com/olga0503/DATA-621/master/classification-output-data.csv",stringsAsFactors=T, header=T)
head(my_data)
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## scored.class scored.probability
## 1 0 0.32845226
## 2 0 0.27319044
## 3 0 0.10966039
## 4 0 0.05599835
## 5 0 0.10049072
## 6 0 0.05515460
calc_confusion_matrix <-function(actual,pred){
confusion_matrix <- table("Predicted"= pred,"Actual"= actual)
return(confusion_matrix)
}
calc_confusion_matrix(my_data$scored.class,my_data$class)
## Actual
## Predicted 0 1
## 0 119 5
## 1 30 27
#calculate true positive
calc_TP <- function(pred,actual){
TP <- calc_confusion_matrix(pred,actual)[4]
return(TP)
}
#calculate true negative
calc_TN <- function(pred,actual){
TN <- calc_confusion_matrix(pred,actual)[1]
return(TN)
}
#calculate false negative
calc_FN <- function(pred,actual){
FN <- calc_confusion_matrix(pred,actual)[2]
return(FN)
}
#calculate false positive
calc_FP <- function(pred,actual){
FP <- calc_confusion_matrix(pred,actual)[3]
return(FP)
}
calc_accuracy <- function(pred,actual){
accuracy = (calc_TP(pred,actual) + calc_TN(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual) + calc_TN(pred,actual) + calc_FN(pred,actual))
return(as.numeric(accuracy))
}
calc_accuracy(my_data$scored.class,my_data$class)
## [1] 0.8066298
calc_classification_error_rate <- function(pred,actual){
classification_error_rate = (calc_FP(pred,actual) + calc_FN(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual) + calc_TN(pred,actual) + calc_FN(pred,actual))
return(classification_error_rate)
}
calc_classification_error_rate(my_data$scored.class,my_data$class)
## [1] 0.1933702
calc_precision <- function(pred,actual){
precision = (calc_TP(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual))
return (round(precision,3))
}
calc_precision(my_data$scored.class,my_data$class)
## [1] 0.844
calc_sensitivity <- function(pred,actual){
sensitivity = (calc_TP(pred,actual))/(calc_TP(pred,actual) + calc_FN(pred,actual))
return(sensitivity)
}
calc_sensitivity(my_data$scored.class,my_data$class)
## [1] 0.4736842
calc_specificity <- function(pred,actual){
specificity <- calc_TN(pred,actual)/(calc_TN(pred,actual) + calc_FP(pred,actual))
return(specificity)
}
calc_specificity(my_data$scored.class,my_data$class)
## [1] 0.9596774
calc_F1_score <- function(pred,actual){
F1_score = (2*calc_precision(pred,actual)*calc_sensitivity(pred,actual))/(calc_precision(pred,actual) + calc_sensitivity(pred,actual))
return(F1_score)
}
calc_F1_score(my_data$scored.class,my_data$class)
## [1] 0.6068062
calc_roc_auc <- function(prob,actual){
threshold <- seq(0.01,1,0.01)
#create vectors that store x and y values
x_values <- c()
y_values <- c()
for (i in 1:length(threshold)){
new_pred <- ifelse(prob >= threshold[i], 1, 0)
df <- data.frame(new_pred, actual)
x_values <- c(x_values,1-calc_specificity(df$new_pred,df$actual))
y_values <- c(y_values,calc_sensitivity(df$new_pred,df$actual))
}
xy_df <- data.frame(x_values, y_values)
#order values by x values in ascending order
xy_df <- xy_df[order(x_values),]
#create vectors that store changes in x and y, AUC value and change in AUC
change_x_values <- c()
change_y_values <- c()
auc <- c()
auc_sum <- c()
for (i in 1:nrow(xy_df)){
change_x_values <- c(change_x_values,xy_df$x_values[i+1]-xy_df$x_values[i])
change_y_values <- c(change_y_values,xy_df$y_values[i+1]+xy_df$y_values[i])
auc <- c(auc,change_y_values[i]*change_x_values[i]/2)
}
df_auc <- data.frame(change_x_values,change_y_values,auc)
auc <- sum(df_auc$auc[1:91])
xy_df <- xy_df[complete.cases(xy_df),]
return (c(plot(y_values ~ x_values,xy_df, type="l", xlab="1-Specificity", ylab="Sensitivity", main="ROC Curve"),abline(0,1,lty=2),auc))
}
calc_roc_auc(my_data$scored.probability,my_data$class)

## [1] 0.8242784
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc.val <- roc(class~scored.probability, my_data)
plot(roc.val, main="pROC package ROC plot")

roc.val$auc
## Area under the curve: 0.8503
#install caret package
library(caret)
#Confusion Matrix
confusionMatrix(my_data$scored.class, my_data$class)$table
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27