df <- read_csv("classification-output-data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## pregnant = col_double(),
## glucose = col_double(),
## diastolic = col_double(),
## skinfold = col_double(),
## insulin = col_double(),
## bmi = col_double(),
## pedigree = col_double(),
## age = col_double(),
## class = col_double(),
## scored.class = col_double(),
## scored.probability = col_double()
## )
summary(df)
## pregnant glucose diastolic skinfold
## Min. : 0.000 Min. : 57.0 Min. : 38.0 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.0 1st Qu.: 0.0
## Median : 3.000 Median :112.0 Median : 70.0 Median :22.0
## Mean : 3.862 Mean :118.3 Mean : 71.7 Mean :19.8
## 3rd Qu.: 6.000 3rd Qu.:136.0 3rd Qu.: 78.0 3rd Qu.:32.0
## Max. :15.000 Max. :197.0 Max. :104.0 Max. :54.0
## insulin bmi pedigree age
## Min. : 0.00 Min. :19.40 Min. :0.0850 Min. :21.00
## 1st Qu.: 0.00 1st Qu.:26.30 1st Qu.:0.2570 1st Qu.:24.00
## Median : 0.00 Median :31.60 Median :0.3910 Median :30.00
## Mean : 63.77 Mean :31.58 Mean :0.4496 Mean :33.31
## 3rd Qu.:105.00 3rd Qu.:36.00 3rd Qu.:0.5800 3rd Qu.:41.00
## Max. :543.00 Max. :50.00 Max. :2.2880 Max. :67.00
## class scored.class scored.probability
## Min. :0.0000 Min. :0.0000 Min. :0.02323
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.11702
## Median :0.0000 Median :0.0000 Median :0.23999
## Mean :0.3149 Mean :0.1768 Mean :0.30373
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.43093
## Max. :1.0000 Max. :1.0000 Max. :0.94633
table(df$class,df$scored.class)
##
## 0 1
## 0 119 5
## 1 30 27
Formula for Accuracy is
\[Accuracy =\frac{TP+TN}{TP+FP+TN+FN}\]
pred_accuracy <- function(df){
df_tbl=table(df$class,df$scored.class)
true_negatives <- df_tbl[1,1]
false_positives <- df_tbl[1,2]
false_negatives <- df_tbl[2,1]
true_positives <- df_tbl[2,2]
accuracy = (true_positives + true_negatives)/(true_negatives+false_positives+false_negatives+true_positives)
return(accuracy)
}
pred_accuracy(df)
## [1] 0.8066298
Formula for CER is
\[CER =\frac{FP+FN}{TP+FP+TN+FN}\]
pred_CER <- function(df){
df_tbl=table(df$class,df$scored.class)
true_negatives <- df_tbl[1,1]
false_positives <- df_tbl[1,2]
false_negatives <- df_tbl[2,1]
true_positives <- df_tbl[2,2]
CER = (false_positives + false_negatives)/(true_negatives+false_positives+false_negatives+true_positives)
return(CER)
}
pred_CER(df)
## [1] 0.1933702
Formula for Precision is
\[Precisiion =\frac{TP}{TP+FP}\]
pred_precision <- function(df){
df_tbl=table(df$class,df$scored.class)
false_positives <- df_tbl[1,2]
true_positives <- df_tbl[2,2]
precision = (true_positives)/(false_positives+true_positives)
return(precision)
}
pred_precision(df)
## [1] 0.84375
Formula for Sensitivity is
\[Sensitivity =\frac{TP}{TP+FN}\]
pred_sensitivity <- function(df){
df_tbl=table(df$class,df$scored.class)
false_negatives <- df_tbl[2,1]
true_positives <- df_tbl[2,2]
sensitivity = (true_positives)/(false_negatives+true_positives)
return(sensitivity)
}
pred_sensitivity(df)
## [1] 0.4736842
Formula for Specificity is
\[Specificity =\frac{TN}{TN+FP}\]
pred_specificity <- function(df){
df_tbl=table(df$class,df$scored.class)
true_negatives <- df_tbl[1,1]
false_positives <- df_tbl[1,2]
specificity = (true_negatives)/(true_negatives+false_positives)
return(specificity)
}
pred_specificity(df)
## [1] 0.9596774
Formula for F1 Score is
\[F1 Score =\frac{2*Precision*Sensitivity}{Precision+Sensitivity}\]
pred_f_score <- function(df){
f_score = (2*pred_precision(df) * pred_sensitivity(df))/(pred_precision(df)+pred_sensitivity(df))
return(f_score)
}
pred_f_score(df)
## [1] 0.6067416
The F-Score is equal to 0.607 and It is in the range 0 < f_score < 1
roc_function<- function(d){
#Create a count
temp <- table(d[ ,'class'], d[ ,"scored.probability"])
#Calculate frequency
allPos <- sum(df$class == 1, na.rm=TRUE)
allNeg <- sum(df$class == 0, na.rm=TRUE)
#Set threshold
threshold <- seq(0,1,0.01)
#Calculating probability for threshold
x <- c()
y <- c()
for (i in 1:length(threshold)) {
TP <- sum(df$scored.probability >= threshold[i] & df$class == 1, na.rm=TRUE)
TN <- sum(df$scored.probability < threshold[i] & df$class == 0, na.rm=TRUE)
y[i] <- TP / allPos
x[i] <- 1-TN / allNeg
}
rocPlot <- plot(x,y,type = "s", xlim=c(-0.5,1.5),
main = "ROC Curve from function",
xlab = "1-Specificity",
ylab = "Sensitivity")
fPlot <- abline(0,1); fPlot
xd <- c(0, abs(diff(x)))
fAuc <- sum(xd*y); fAuc
print(paste0("Area under the curve: ", fAuc))
}
roc_function(df)
## [1] "Area under the curve: 0.843803056027165"
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.4
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
result <- data.frame(Accuracy=pred_accuracy(df),CER=pred_CER(df),Precision=pred_precision(df),Sensitivity=pred_sensitivity(df),Specificity=pred_specificity(df),F_Score=pred_f_score(df))
result %>%
kbl() %>%
kable_material_dark()
Accuracy | CER | Precision | Sensitivity | Specificity | F_Score |
---|---|---|---|---|---|
0.8066298 | 0.1933702 | 0.84375 | 0.4736842 | 0.9596774 | 0.6067416 |
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:AUC':
##
## sensitivity, specificity
confusionMatrix(data=as.factor(df$scored.class),reference=as.factor(df$class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
pred_roc_curve <- function(labels,predictions){
library(pROC)
pROC_obj <- roc(labels,predictions,
smoothed = TRUE,
# arguments for ci
ci=TRUE, ci.alpha=0.9, stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
print.auc=TRUE, show.thres=TRUE)
sens.ci <- ci.se(pROC_obj)
plot(sens.ci, type="shape", col="lightblue")
## Warning in plot.ci.se(sens.ci, type = "shape", col = "lightblue"): Low
## definition shape.
plot(sens.ci, type="bars")
}
pred_roc_curve(df$class,df$scored.probability)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:AUC':
##
## auc, roc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Warning in plot.ci.se(sens.ci, type = "shape", col = "lightblue"): Low
## definition shape.