HW2

library("caret")

## Warning: package 'caret' was built under R version 3.4.3

## Loading required package: lattice

## Loading required package: ggplot2

library("ggplot2")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the scored classification data set from GitHub and preview the first
# rows. TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
my_data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/classification-output-data.csv",
  stringsAsFactors = TRUE,
  header = TRUE
)
head(my_data)

##   pregnant glucose diastolic skinfold insulin  bmi pedigree age class
## 1        7     124        70       33     215 25.5    0.161  37     0
## 2        2     122        76       27     200 35.9    0.483  26     0
## 3        3     107        62       13      48 22.9    0.678  23     1
## 4        1      91        64       24       0 29.2    0.192  21     0
## 5        4      83        86       19       0 29.3    0.317  34     0
## 6        1     100        74       12      46 19.5    0.149  28     0
##   scored.class scored.probability
## 1            0         0.32845226
## 2            0         0.27319044
## 3            0         0.10966039
## 4            0         0.05599835
## 5            0         0.10049072
## 6            0         0.05515460

# Cross-tabulate predicted classes against actual classes.
#
# Arguments:
#   actual  vector of true class labels (note: this is the FIRST parameter).
#   pred    vector of predicted class labels, same length as `actual`.
#
# Returns a square `table` whose rows are labelled "Predicted" (taken from
# `pred`) and whose columns are labelled "Actual" (taken from `actual`).
#
# Both dimensions are built on the union of the labels found in either vector,
# so a class that is never predicted (or never occurs) still gets a row/column
# of zeros instead of being dropped, which would shift every cell position.
calc_confusion_matrix <- function(actual, pred) {
  if (is.factor(actual) || is.factor(pred)) {
    # keep the declared factor levels, including ones with no observations
    classes <- union(levels(as.factor(actual)), levels(as.factor(pred)))
  } else {
    classes <- sort(unique(c(actual, pred)))
  }

  table(
    "Predicted" = factor(pred, levels = classes),
    "Actual" = factor(actual, levels = classes)
  )
}

# Arguments are passed by name: the helper's first parameter is `actual`, so a
# positional call with the scored classes first labels the two axes the wrong
# way round.
calc_confusion_matrix(actual = my_data$class, pred = my_data$scored.class)

##          Actual
## Predicted   0   1
##         0 119  30
##         1   5  27

# Count true positives: observations predicted positive that are positive.
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the second
# label in sort order (1 for 0/1 data) is the positive class. If only one label
# is present it is treated as the negative class, so the count is 0.
# Matching pairs are counted directly rather than read from a fixed cell of a
# table, so the result stays correct when a class is missing from `pred` or
# `actual`. Pairs containing NA are ignored.
calc_TP <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  positive <- sort(unique(c(pred, actual)))[2]

  sum(pred == positive & actual == positive, na.rm = TRUE)
}

# Count true negatives: observations predicted negative that are negative.
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order (0 for 0/1 data) is the negative class.
# Matching pairs are counted directly rather than read from a fixed cell of a
# table, so the result stays correct when a class is missing from `pred` or
# `actual`. Pairs containing NA are ignored.
calc_TN <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  negative <- sort(unique(c(pred, actual)))[1]

  sum(pred == negative & actual == negative, na.rm = TRUE)
}

# Count false negatives: observations predicted negative that are positive.
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). If only one label is present the count is 0.
# Matching pairs are counted directly rather than read from a fixed cell of a
# table, so the result stays correct when a class is missing from `pred` or
# `actual`. Pairs containing NA are ignored.
calc_FN <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  sum(pred == negative & actual == positive, na.rm = TRUE)
}

# Count false positives: observations predicted positive that are negative.
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). If only one label is present the count is 0.
# Matching pairs are counted directly rather than read from a fixed cell of a
# table, so the result stays correct when a class is missing from `pred` or
# `actual`. Pairs containing NA are ignored.
calc_FP <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  sum(pred == positive & actual == negative, na.rm = TRUE)
}

# Accuracy: share of observations whose predicted class equals the true class,
# i.e. (TP + TN) / (TP + FP + TN + FN).
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# Returns a single number in [0, 1]. Pairs containing NA are ignored.
# The comparison is done element-wise instead of through confusion-matrix
# cells, so the result is still defined when every prediction falls in a
# single class.
calc_accuracy <- function(pred, actual) {
  stopifnot(length(pred) == length(actual))

  # as.vector() turns factors into their labels so they compare by value
  mean(as.vector(pred) == as.vector(actual), na.rm = TRUE)
}

# Accuracy for the data set: `scored.class` is passed as the predictions and
# `class` as the actual labels.
calc_accuracy(my_data$scored.class,my_data$class)

## [1] 0.8066298

# Classification error rate: share of observations whose predicted class
# differs from the true class, i.e. (FP + FN) / (TP + FP + TN + FN).
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# Returns a single number in [0, 1]. Pairs containing NA are ignored.
# The comparison is done element-wise instead of through confusion-matrix
# cells, so the result is still defined when every prediction falls in a
# single class.
calc_classification_error_rate <- function(pred, actual) {
  stopifnot(length(pred) == length(actual))

  # as.vector() turns factors into their labels so they compare by value
  mean(as.vector(pred) != as.vector(actual), na.rm = TRUE)
}

# Error rate for the data set: `scored.class` is passed as the predictions and
# `class` as the actual labels.
calc_classification_error_rate(my_data$scored.class,my_data$class)

## [1] 0.1933702

# Precision: TP / (TP + FP), the share of positive predictions that are right.
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#   digits  number of decimal places to round to (default 3, the historical
#           behaviour); pass NULL to get the unrounded value.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). Pairs containing NA are ignored.
# Returns NaN when nothing is predicted positive (0 / 0).
calc_precision <- function(pred, actual, digits = 3) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  true_pos <- sum(pred == positive & actual == positive, na.rm = TRUE)
  false_pos <- sum(pred == positive & actual == negative, na.rm = TRUE)
  precision <- true_pos / (true_pos + false_pos)

  if (is.null(digits)) {
    return(precision)
  }
  round(precision, digits)
}

# Precision for the data set: `scored.class` is passed as the predictions and
# `class` as the actual labels.
calc_precision(my_data$scored.class,my_data$class)

## [1] 0.844

# Sensitivity (recall, true positive rate): TP / (TP + FN).
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). Pairs containing NA are ignored.
# Counting pairs directly keeps the value defined when every prediction falls
# in a single class. Returns NaN when there are no actual positives (0 / 0).
calc_sensitivity <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  true_pos <- sum(pred == positive & actual == positive, na.rm = TRUE)
  false_neg <- sum(pred == negative & actual == positive, na.rm = TRUE)

  true_pos / (true_pos + false_neg)
}

# Sensitivity for the data set: `scored.class` is passed as the predictions
# and `class` as the actual labels.
calc_sensitivity(my_data$scored.class,my_data$class)

## [1] 0.4736842

# Specificity (true negative rate): TN / (TN + FP).
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). Pairs containing NA are ignored.
# Counting pairs directly keeps the value defined when every prediction falls
# in a single class. Returns NaN when there are no actual negatives (0 / 0).
calc_specificity <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  true_neg <- sum(pred == negative & actual == negative, na.rm = TRUE)
  false_pos <- sum(pred == positive & actual == negative, na.rm = TRUE)

  true_neg / (true_neg + false_pos)
}

# Specificity for the data set: `scored.class` is passed as the predictions
# and `class` as the actual labels.
calc_specificity(my_data$scored.class,my_data$class)

## [1] 0.9596774

# F1 score: harmonic mean of precision and sensitivity,
#   2 * precision * sensitivity / (precision + sensitivity)
# which simplifies to 2 * TP / (2 * TP + FP + FN).
#
# Arguments:
#   pred    vector of predicted class labels.
#   actual  vector of true class labels, same length as `pred`.
#
# The score is computed from the raw counts, not from a precision value that
# has already been rounded to three decimals, so no rounding error leaks into
# the result (the difference shows up in the fourth decimal place).
#
# The class labels come from the sorted union of both vectors; the first label
# in sort order is the negative class and the second the positive class (0 and
# 1 for 0/1 data). Pairs containing NA are ignored.
# Returns NaN when TP, FP and FN are all zero (0 / 0).
calc_F1_score <- function(pred, actual) {
  # as.vector() turns factors into their labels so they compare by value
  pred <- as.vector(pred)
  actual <- as.vector(actual)
  classes <- sort(unique(c(pred, actual)))
  negative <- classes[1]
  positive <- classes[2]

  true_pos <- sum(pred == positive & actual == positive, na.rm = TRUE)
  false_pos <- sum(pred == positive & actual == negative, na.rm = TRUE)
  false_neg <- sum(pred == negative & actual == positive, na.rm = TRUE)

  2 * true_pos / (2 * true_pos + false_pos + false_neg)
}

# F1 score for the data set: `scored.class` is passed as the predictions and
# `class` as the actual labels.
calc_F1_score(my_data$scored.class,my_data$class)

## [1] 0.6068062

# Plot the ROC curve for a set of scored probabilities and return its AUC.
#
# Arguments:
#   prob        numeric vector of predicted probabilities of the positive class.
#   actual      vector of true class labels (exactly two classes), same length
#               as `prob`. The larger label in sort order (1 for 0/1 data) is
#               the positive class.
#   thresholds  cut-offs at which the curve is evaluated; an observation is
#               classified positive when prob >= threshold. Defaults to
#               0, 0.01, ..., 1.
#
# Returns the area under the curve (trapezoidal rule) as a single number and,
# as a side effect, draws the curve together with the 45-degree reference line.
# Stops with an error if `actual` does not contain exactly two classes.
# Observations with a missing probability or label are dropped.
calc_roc_auc <- function(prob, actual, thresholds = seq(0, 1, by = 0.01)) {
  stopifnot(length(prob) == length(actual))

  # as.vector() turns a factor into its labels so they compare by value
  actual <- as.vector(actual)
  keep <- !is.na(prob) & !is.na(actual)
  prob <- prob[keep]
  actual <- actual[keep]

  classes <- sort(unique(actual))
  if (length(classes) != 2) {
    stop("`actual` must contain exactly two classes", call. = FALSE)
  }
  is_positive <- actual == classes[2]
  n_positive <- sum(is_positive)
  n_negative <- sum(!is_positive)

  # Rates are counted directly, so thresholds at which every observation lands
  # in one class give 0 or 1 instead of a missing value.
  sensitivity <- vapply(
    thresholds,
    function(cutoff) sum(prob >= cutoff & is_positive) / n_positive,
    numeric(1)
  )
  false_pos_rate <- vapply(
    thresholds,
    function(cutoff) sum(prob >= cutoff & !is_positive) / n_negative,
    numeric(1)
  )

  # Anchor the curve at (0, 0) and (1, 1) so the area always spans the whole
  # x axis, then walk it from left to right.
  x_values <- c(0, false_pos_rate, 1)
  y_values <- c(0, sensitivity, 1)
  left_to_right <- order(x_values, y_values)
  x_values <- x_values[left_to_right]
  y_values <- y_values[left_to_right]

  # Trapezoidal rule: width of each step times the mean of its two heights.
  n_points <- length(x_values)
  auc <- sum(diff(x_values) * (y_values[-1] + y_values[-n_points]) / 2)

  plot(
    x_values, y_values,
    type = "l",
    xlab = "1-Specificity",
    ylab = "Sensitivity",
    main = "ROC Curve"
  )
  abline(0, 1, lty = 2)

  auc
}

# Draw the ROC curve and print the AUC, passing `scored.probability` as the
# probabilities and `class` as the actual labels.
calc_roc_auc(my_data$scored.probability,my_data$class)

## [1] 0.8242784

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Reference ROC curve from the pROC package, built from the formula
# class ~ scored.probability evaluated in my_data.
roc.val <- roc(class~scored.probability, my_data)
plot(roc.val, main="pROC package ROC plot")

# AUC as reported by pROC, for comparison with the hand-written function
roc.val$auc

## Area under the curve: 0.8503

# Load caret (already attached at the top of the script; repeated here so this
# section can be run on its own). library() loads the package, it does not
# install it.
library(caret)

# Confusion matrix from caret, for comparison with the hand-written function.
# `data` (predictions) and `reference` (truth) are passed as factors with the
# same levels, which current caret releases require.
confusionMatrix(
  data = factor(my_data$scored.class, levels = c(0, 1)),
  reference = factor(my_data$class, levels = c(0, 1))
)$table

##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27

HW2

Olya Fomicheva

3/18/2018