library("caret")
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library("ggplot2")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
my_data <- read.csv(file="https://raw.githubusercontent.com/olga0503/DATA-621/master/classification-output-data.csv",stringsAsFactors=T, header=T)
head(my_data)
##   pregnant glucose diastolic skinfold insulin  bmi pedigree age class
## 1        7     124        70       33     215 25.5    0.161  37     0
## 2        2     122        76       27     200 35.9    0.483  26     0
## 3        3     107        62       13      48 22.9    0.678  23     1
## 4        1      91        64       24       0 29.2    0.192  21     0
## 5        4      83        86       19       0 29.3    0.317  34     0
## 6        1     100        74       12      46 19.5    0.149  28     0
##   scored.class scored.probability
## 1            0         0.32845226
## 2            0         0.27319044
## 3            0         0.10966039
## 4            0         0.05599835
## 5            0         0.10049072
## 6            0         0.05515460
calc_confusion_matrix <-function(actual,pred){
  
confusion_matrix <- table("Predicted"= pred,"Actual"= actual)
return(confusion_matrix)

}

calc_confusion_matrix(my_data$scored.class,my_data$class)
##          Actual
## Predicted   0   1
##         0 119   5
##         1  30  27
#calculate true positive
calc_TP <- function(pred,actual){

TP <- calc_confusion_matrix(pred,actual)[4]
return(TP)

}

#calculate true negative
calc_TN <- function(pred,actual){
  
TN <- calc_confusion_matrix(pred,actual)[1]
return(TN)

}

#calculate false negative
calc_FN <- function(pred,actual){
  
FN <- calc_confusion_matrix(pred,actual)[2]
return(FN)

}

#calculate false positive
calc_FP <- function(pred,actual){
  
FP <- calc_confusion_matrix(pred,actual)[3]
return(FP)

}
calc_accuracy <- function(pred,actual){
  
accuracy = (calc_TP(pred,actual) + calc_TN(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual) + calc_TN(pred,actual) + calc_FN(pred,actual))

return(as.numeric(accuracy))

}

calc_accuracy(my_data$scored.class,my_data$class)
## [1] 0.8066298
calc_classification_error_rate <- function(pred,actual){
  
classification_error_rate = (calc_FP(pred,actual) + calc_FN(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual) + calc_TN(pred,actual) + calc_FN(pred,actual))
return(classification_error_rate)

}

calc_classification_error_rate(my_data$scored.class,my_data$class)
## [1] 0.1933702
calc_precision <- function(pred,actual){
  
precision = (calc_TP(pred,actual))/(calc_TP(pred,actual) + calc_FP(pred,actual))
return (round(precision,3))

}

calc_precision(my_data$scored.class,my_data$class)
## [1] 0.844
calc_sensitivity <- function(pred,actual){

sensitivity = (calc_TP(pred,actual))/(calc_TP(pred,actual) + calc_FN(pred,actual))
return(sensitivity)

}

calc_sensitivity(my_data$scored.class,my_data$class)
## [1] 0.4736842
calc_specificity <- function(pred,actual){
  
specificity <- calc_TN(pred,actual)/(calc_TN(pred,actual) + calc_FP(pred,actual))
return(specificity)

}

calc_specificity(my_data$scored.class,my_data$class)
## [1] 0.9596774
calc_F1_score <- function(pred,actual){

F1_score = (2*calc_precision(pred,actual)*calc_sensitivity(pred,actual))/(calc_precision(pred,actual) + calc_sensitivity(pred,actual))
return(F1_score)

}

calc_F1_score(my_data$scored.class,my_data$class)
## [1] 0.6068062
calc_roc_auc <- function(prob,actual){

  threshold <- seq(0.01,1,0.01)
  #create vectors that store x and y values
  x_values <- c()
  y_values <- c() 
  
 for (i in 1:length(threshold)){
   
    new_pred <- ifelse(prob >= threshold[i], 1, 0)
    df <- data.frame(new_pred, actual)
    
    x_values <- c(x_values,1-calc_specificity(df$new_pred,df$actual))
    y_values <- c(y_values,calc_sensitivity(df$new_pred,df$actual))
    
 }
  
  xy_df <- data.frame(x_values, y_values)
  #order values by x values in ascending order
  xy_df <- xy_df[order(x_values),]
  
  #create vectors that store changes in x and y, AUC value and change in AUC
  change_x_values <- c()
  change_y_values <- c()
  auc <- c()
  auc_sum <- c()
   
  for (i in 1:nrow(xy_df)){

  change_x_values <- c(change_x_values,xy_df$x_values[i+1]-xy_df$x_values[i])
  change_y_values <- c(change_y_values,xy_df$y_values[i+1]+xy_df$y_values[i])
  auc <- c(auc,change_y_values[i]*change_x_values[i]/2)
  
  }
  
  df_auc <- data.frame(change_x_values,change_y_values,auc)
  
  auc <- sum(df_auc$auc[1:91])
  
  xy_df <- xy_df[complete.cases(xy_df),]
  
  return (c(plot(y_values ~ x_values,xy_df, type="l", xlab="1-Specificity", ylab="Sensitivity", main="ROC Curve"),abline(0,1,lty=2),auc))
 
}

calc_roc_auc(my_data$scored.probability,my_data$class)

## [1] 0.8242784
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc.val <- roc(class~scored.probability, my_data)
plot(roc.val, main="pROC package ROC plot") 

roc.val$auc
## Area under the curve: 0.8503
#install caret package
library(caret)

#Confusion Matrix
confusionMatrix(my_data$scored.class, my_data$class)$table
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27