library(readr)
library(dplyr)
library(kableExtra)
library(plotly)
library(DescTools)

df <- read_csv("https://raw.githubusercontent.com/agersowitz/DATA-621/main/classification.txt")
df <- data.frame(df)
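
The rest of the analysis assumes the data contain the columns class, scored.class, and scored.probability; a quick look at the first few rows confirms that before anything else is computed.

# Sanity check: the code below relies on the columns
# class, scored.class, and scored.probability being present.
head(df)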


table(df$class,df$scored.class)%>%
  kbl(caption = "Confusion Matrix") %>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  kable_styling(latex_options = "HOLD_position")
Confusion Matrix (rows = actual class, columns = scored.class)

|   | 0   | 1  |
|---|-----|----|
| 0 | 119 | 5  |
| 1 | 30  | 27 |

The confusion matrix above, with the actual class on the rows and the predicted (scored) class on the columns, shows the counts of true negatives (119), false positives (5), false negatives (30), and true positives (27).
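
As a quick sanity check, these four cells can be pulled straight from the table object by name; the sketch below assumes the layout shown above (rows = actual class, columns = scored.class).

# Hedged sketch: extract the confusion-matrix cells by name.
# Assumes rows are the actual class and columns are the scored class,
# as produced by table(df$class, df$scored.class) above.
cm <- table(df$class, df$scored.class)
TN <- cm["0", "0"]   # actual 0, predicted 0
FP <- cm["0", "1"]   # actual 0, predicted 1
FN <- cm["1", "0"]   # actual 1, predicted 0
TP <- cm["1", "1"]   # actual 1, predicted 1
c(TN = TN, FP = FP, FN = FN, TP = TP)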

Questions 3-8 and 10 are answered by the classification_metrics function below.

classification_metrics <- function(df){
  # Confusion-matrix cells (class = actual, scored.class = predicted)
  TP <- nrow(filter(df, class == 1 & scored.class == 1))
  TN <- nrow(filter(df, class == 0 & scored.class == 0))
  FP <- nrow(filter(df, class == 0 & scored.class == 1))
  FN <- nrow(filter(df, class == 1 & scored.class == 0))

  # Classification metrics
  Acc  <- (TP + TN) / (TP + FP + TN + FN)   # accuracy
  CER  <- (FP + FN) / (TP + FP + TN + FN)   # classification error rate
  pre  <- TP / (TP + FP)                    # precision
  sens <- TP / (TP + FN)                    # sensitivity (recall)
  spec <- TN / (TN + FP)                    # specificity
  f1   <- (2 * pre * sens) / (pre + sens)   # F1 score

  # Verify that accuracy and classification error rate sum to 1
  if (Acc + CER == 1) {
    stmnt <- "\nSuccess. Accuracy and Classification Error Rate sum to 1."
  } else {
    stmnt <- "\nFailure. Accuracy and Classification Error Rate do not sum to 1."
  }

  cat("Classification Metrics",
      "\nAccuracy = ", round(Acc, 3),
      "\nClassification Error Rate = ", round(CER, 3),
      stmnt,
      "\nPrecision = ", round(pre, 3),
      "\nSensitivity = ", round(sens, 3),
      "\nSpecificity = ", round(spec, 3),
      "\nF1 Score = ", round(f1, 3), "\n\n")
}
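
Because the function prints its results with cat() and returns NULL, the metrics cannot be reused downstream directly. As a small variant (a sketch only; classification_metrics_list is an illustrative name, not part of the assignment), the same calculations can be returned as a named list:

# Hedged sketch: same calculations as classification_metrics(),
# but returned as a named list for programmatic reuse.
classification_metrics_list <- function(df) {
  TP <- nrow(filter(df, class == 1 & scored.class == 1))
  TN <- nrow(filter(df, class == 0 & scored.class == 0))
  FP <- nrow(filter(df, class == 0 & scored.class == 1))
  FN <- nrow(filter(df, class == 1 & scored.class == 0))
  pre  <- TP / (TP + FP)
  sens <- TP / (TP + FN)
  list(
    accuracy    = (TP + TN) / (TP + FP + TN + FN),
    error_rate  = (FP + FN) / (TP + FP + TN + FN),
    precision   = pre,
    sensitivity = sens,
    specificity = TN / (TN + FP),
    f1          = (2 * pre * sens) / (pre + sens)
  )
}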





roc_curve <- function(df){
  x <- c()   # 1 - specificity (false positive rate) at each threshold
  y <- c()   # sensitivity (true positive rate) at each threshold
  true_class    <- df[, "class"]
  probabilities <- df[, "scored.probability"]
  thresholds    <- seq(0, 1, 0.01)

  for (threshold in thresholds) {
    # Classify as 1 when the scored probability exceeds the threshold
    predicted_class <- ifelse(probabilities > threshold, 1, 0)

    df2 <- data.frame(true_class, predicted_class)
    TP <- nrow(filter(df2, true_class == 1 & predicted_class == 1))
    TN <- nrow(filter(df2, true_class == 0 & predicted_class == 0))
    FP <- nrow(filter(df2, true_class == 0 & predicted_class == 1))
    FN <- nrow(filter(df2, true_class == 1 & predicted_class == 0))

    specm1 <- 1 - (TN / (TN + FP))   # 1 - specificity
    sens   <- TP / (TP + FN)         # sensitivity
    x <- append(x, specm1)
    y <- append(y, sens)
  }

  dfr <- data.frame(x, y)
  auc <- AUC(x = x, y = y, from = 0, to = 1)   # trapezoidal area via DescTools

  plot_ly(dfr, x = ~x, y = ~y, type = 'scatter', mode = 'lines') %>%
    layout(title = "ROC Curve for Chosen Dataset",
           annotations = list(text = paste0("Area Under Curve = ", round(auc, 3)),
                              x = .75, y = .25, showarrow = FALSE),
           xaxis = list(showgrid = FALSE, title = "1-Specificity (false positive rate)"),
           yaxis = list(showgrid = FALSE, title = "Sensitivity (true positive rate)"))
}
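
Because the annotated area comes entirely from DescTools::AUC, a manual trapezoid-rule helper makes a handy cross-check. The sketch below uses an illustrative name (trapezoid_auc); the (x, y) vectors are the same 1-specificity / sensitivity points built inside roc_curve(), which would need to be returned from the function to run this directly.

# Hedged sketch: manual trapezoid-rule AUC for the ROC points.
trapezoid_auc <- function(x, y) {
  ord <- order(x)                 # integrate along increasing false positive rate
  x <- x[ord]; y <- y[ord]
  sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
}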


classification_metrics(df)
## Classification Metrics 
## Accuracy =  0.807 
## Classification Error Rate =  0.193 
## Success. Accuracy and Classification Error Rate sum to 1. 
## Precision =  0.844 
## Sensitivity =  0.474 
## Specificity =  0.96 
## F1 Score =  0.607
roc_curve(df)

9) Bounds of F1 Score

\[\text{Precision} = \frac{TP}{TP+FP}\] \[\text{Sensitivity} = \frac{TP}{TP+FN}\]

\[\text{F1 Score} = \frac{2 \cdot \text{Precision} \cdot \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}}\]

From the formulas above, precision and sensitivity each lie between 0 and 1: both shrink toward 0 as TP goes to 0, precision equals 1 when FP = 0, and sensitivity equals 1 when FN = 0. Plugging these extreme values into the F1 score gives its range: the score approaches 0 as precision and sensitivity approach 0, and it reaches 1 only when both equal 1.

\[\text{F1 Score} = \frac{2 \cdot \text{Precision} \cdot \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} \rightarrow 0 \quad \text{as Precision, Sensitivity} \rightarrow 0\] \[\text{F1 Score} = \frac{2 \cdot \text{Precision} \cdot \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} = \frac{2 \cdot 1 \cdot 1}{1+1} = \frac{2}{2} = 1\]
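
As a quick numerical check of these bounds (a sketch only, not part of the assignment), the F1 formula can be evaluated over a grid of precision and sensitivity values in (0, 1]:

# Hedged sketch: F1 evaluated over a grid of precision/sensitivity values;
# every value is expected to fall in (0, 1], with 1 attained only at
# precision = sensitivity = 1.
grid <- expand.grid(precision   = seq(0.01, 1, 0.01),
                    sensitivity = seq(0.01, 1, 0.01))
f1_vals <- with(grid, (2 * precision * sensitivity) / (precision + sensitivity))
range(f1_vals)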

library(caret)

confusionMatrix(data=as.factor(df$scored.class),reference=as.factor(df$class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 
sensitivity(data=as.factor(df$scored.class),reference=as.factor(df$class), positive = "1")
## [1] 0.4736842
specificity(data=as.factor(df$scored.class),reference=as.factor(df$class), negative = "0")
## [1] 0.9596774

My calculated metrics line up with the values from the built-in caret functions.
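
For a more direct side-by-side, the per-class statistics can also be pulled from the confusionMatrix object itself; I believe caret stores them in the byClass slot, so the sketch below is written under that assumption.

# Hedged sketch: assumes caret's confusionMatrix object exposes per-class
# metrics (including Precision, Recall and F1) in its byClass slot.
cm_caret <- confusionMatrix(data = as.factor(df$scored.class),
                            reference = as.factor(df$class),
                            positive = "1",
                            mode = "everything")
cm_caret$byClass[c("Precision", "Recall", "F1")]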

library(pROC)

roc1<-roc(df$class,df$scored.probability)
plot(roc1)
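
The area under this curve can also be read from the pROC object with auc(), giving a value to compare against the manual trapezoid estimate annotated on the plotly chart above.

# Area under the pROC curve, for comparison with the manual roc_curve() result.
auc(roc1)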

References: https://acutecaretesting.org/en/articles/roc-curves-what-are-they-and-how-are-they-used