This kernel uses and modifies code from https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/.
We input the data.
raw.data <- read.csv("creditcard.csv")We will use a function for calculation of ROC and AUC.
#calculate ROC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
calculate_roc <- function(verset, cost_of_fp, cost_of_fn, n=100) {
tp <- function(verset, threshold) {
sum(verset$predicted >= threshold & verset$Class == 1)
}
fp <- function(verset, threshold) {
sum(verset$predicted >= threshold & verset$Class == 0)
}
tn <- function(verset, threshold) {
sum(verset$predicted < threshold & verset$Class == 0)
}
fn <- function(verset, threshold) {
sum(verset$predicted < threshold & verset$Class == 1)
}
tpr <- function(verset, threshold) {
sum(verset$predicted >= threshold & verset$Class == 1) / sum(verset$Class == 1)
}
fpr <- function(verset, threshold) {
sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
}
cost <- function(verset, threshold, cost_of_fp, cost_of_fn) {
sum(verset$predicted >= threshold & verset$Class == 0) * cost_of_fp +
sum(verset$predicted < threshold & verset$Class == 1) * cost_of_fn
}
fpr <- function(verset, threshold) {
sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
}
threshold_round <- function(value, threshold)
{
return (as.integer(!(value < threshold)))
}
#calculate AUC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)
auc_ <- function(verset, threshold) {
auc(verset$Class, threshold_round(verset$predicted,threshold))
}
roc <- data.frame(threshold = seq(0,1,length.out=n), tpr=NA, fpr=NA)
roc$tp <- sapply(roc$threshold, function(th) tp(verset, th))
roc$fp <- sapply(roc$threshold, function(th) fp(verset, th))
roc$tn <- sapply(roc$threshold, function(th) tn(verset, th))
roc$fn <- sapply(roc$threshold, function(th) fn(verset, th))
roc$tpr <- sapply(roc$threshold, function(th) tpr(verset, th))
roc$fpr <- sapply(roc$threshold, function(th) fpr(verset, th))
roc$cost <- sapply(roc$threshold, function(th) cost(verset, th, cost_of_fp, cost_of_fn))
roc$auc <- sapply(roc$threshold, function(th) auc_(verset, th))
return(roc)
}The following functions is for graphical representation of ROC, AUC and cost function.
plot_roc <- function(roc, threshold, cost_of_fp, cost_of_fn) {
library(gridExtra)
norm_vec <- function(v) (v - min(v))/diff(range(v))
idx_threshold = which.min(abs(roc$threshold-threshold))
col_ramp <- colorRampPalette(c("green","orange","red","black"))(100)
col_by_cost <- col_ramp[ceiling(norm_vec(roc$cost)*99)+1]
p_roc <- ggplot(roc, aes(fpr,tpr)) +
geom_line(color=rgb(0,0,1,alpha=0.3)) +
geom_point(color=col_by_cost, size=2, alpha=0.5) +
labs(title = sprintf("ROC")) + xlab("FPR") + ylab("TPR") +
geom_hline(yintercept=roc[idx_threshold,"tpr"], alpha=0.5, linetype="dashed") +
geom_vline(xintercept=roc[idx_threshold,"fpr"], alpha=0.5, linetype="dashed")
p_auc <- ggplot(roc, aes(threshold, auc)) +
geom_line(color=rgb(0,0,1,alpha=0.3)) +
geom_point(color=col_by_cost, size=2, alpha=0.5) +
labs(title = sprintf("AUC")) +
geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
p_cost <- ggplot(roc, aes(threshold, cost)) +
geom_line(color=rgb(0,0,1,alpha=0.3)) +
geom_point(color=col_by_cost, size=2, alpha=0.5) +
labs(title = sprintf("cost function")) +
geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
sub_title <- sprintf("threshold at %.2f - cost of FP = %d, cost of FN = %d", threshold, cost_of_fp, cost_of_fn)
#
grid.arrange(p_roc, p_auc, p_cost, ncol=2,sub=textGrob(sub_title, gp=gpar(cex=1), just="bottom"))
}The follwing function is for showing a confusion matrix.
plot_confusion_matrix <- function(verset, sSubtitle) {
tst <- data.frame(round(verset$predicted,0), verset$Class)
opts <- c("Predicted", "True")
names(tst) <- opts
cf <- plyr::count(tst)
cf[opts][cf[opts]==0] <- "Not Fraud"
cf[opts][cf[opts]==1] <- "Fraud"
ggplot(data = cf, mapping = aes(x = True, y = Predicted)) +
labs(title = "Confusion matrix", subtitle = sSubtitle) +
geom_tile(aes(fill = freq), colour = "grey") +
geom_text(aes(label = sprintf("%1.0f", freq)), vjust = 1) +
scale_fill_gradient(low = "lightblue", high = "blue") +
theme_bw() + theme(legend.position = "none")
}The function will be called to compare the predicted value with the real values.
Let’s glimpse the data:
Columns and rows:
sprintf("Rows: %d Columns: %d",nrow(raw.data), length(names(raw.data)))## [1] "Rows: 284807 Columns: 31"
See first few rows of data:
head(raw.data,10) %>%
kable( "html", escape=F, align="c") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.3598071 | -0.0727812 | 2.5363467 | 1.3781552 | -0.3383208 | 0.4623878 | 0.2395986 | 0.0986979 | 0.3637870 | 0.0907942 | -0.5515995 | -0.6178009 | -0.9913898 | -0.3111694 | 1.4681770 | -0.4704005 | 0.2079712 | 0.0257906 | 0.4039930 | 0.2514121 | -0.0183068 | 0.2778376 | -0.1104739 | 0.0669281 | 0.1285394 | -0.1891148 | 0.1335584 | -0.0210531 | 149.62 | 0 |
| 0 | 1.1918571 | 0.2661507 | 0.1664801 | 0.4481541 | 0.0600176 | -0.0823608 | -0.0788030 | 0.0851017 | -0.2554251 | -0.1669744 | 1.6127267 | 1.0652353 | 0.4890950 | -0.1437723 | 0.6355581 | 0.4639170 | -0.1148047 | -0.1833613 | -0.1457830 | -0.0690831 | -0.2257752 | -0.6386720 | 0.1012880 | -0.3398465 | 0.1671704 | 0.1258945 | -0.0089831 | 0.0147242 | 2.69 | 0 |
| 1 | -1.3583541 | -1.3401631 | 1.7732093 | 0.3797796 | -0.5031981 | 1.8004994 | 0.7914610 | 0.2476758 | -1.5146543 | 0.2076429 | 0.6245015 | 0.0660837 | 0.7172927 | -0.1659459 | 2.3458649 | -2.8900832 | 1.1099694 | -0.1213593 | -2.2618571 | 0.5249797 | 0.2479982 | 0.7716794 | 0.9094123 | -0.6892810 | -0.3276418 | -0.1390966 | -0.0553528 | -0.0597518 | 378.66 | 0 |
| 1 | -0.9662717 | -0.1852260 | 1.7929933 | -0.8632913 | -0.0103089 | 1.2472032 | 0.2376089 | 0.3774359 | -1.3870241 | -0.0549519 | -0.2264873 | 0.1782282 | 0.5077569 | -0.2879237 | -0.6314181 | -1.0596472 | -0.6840928 | 1.9657750 | -1.2326220 | -0.2080378 | -0.1083005 | 0.0052736 | -0.1903205 | -1.1755753 | 0.6473760 | -0.2219288 | 0.0627228 | 0.0614576 | 123.50 | 0 |
| 2 | -1.1582331 | 0.8777368 | 1.5487178 | 0.4030339 | -0.4071934 | 0.0959215 | 0.5929407 | -0.2705327 | 0.8177393 | 0.7530744 | -0.8228429 | 0.5381956 | 1.3458516 | -1.1196698 | 0.1751211 | -0.4514492 | -0.2370332 | -0.0381948 | 0.8034869 | 0.4085424 | -0.0094307 | 0.7982785 | -0.1374581 | 0.1412670 | -0.2060096 | 0.5022922 | 0.2194222 | 0.2151531 | 69.99 | 0 |
| 2 | -0.4259659 | 0.9605230 | 1.1411093 | -0.1682521 | 0.4209869 | -0.0297276 | 0.4762009 | 0.2603143 | -0.5686714 | -0.3714072 | 1.3412620 | 0.3598938 | -0.3580907 | -0.1371337 | 0.5176168 | 0.4017259 | -0.0581328 | 0.0686531 | -0.0331938 | 0.0849677 | -0.2082535 | -0.5598248 | -0.0263977 | -0.3714266 | -0.2327938 | 0.1059148 | 0.2538442 | 0.0810803 | 3.67 | 0 |
| 4 | 1.2296576 | 0.1410035 | 0.0453708 | 1.2026127 | 0.1918810 | 0.2727081 | -0.0051590 | 0.0812129 | 0.4649600 | -0.0992543 | -1.4169072 | -0.1538258 | -0.7510627 | 0.1673720 | 0.0501436 | -0.4435868 | 0.0028205 | -0.6119873 | -0.0455750 | -0.2196326 | -0.1677163 | -0.2707097 | -0.1541038 | -0.7800554 | 0.7501369 | -0.2572368 | 0.0345074 | 0.0051678 | 4.99 | 0 |
| 7 | -0.6442694 | 1.4179635 | 1.0743804 | -0.4921990 | 0.9489341 | 0.4281185 | 1.1206314 | -3.8078642 | 0.6153747 | 1.2493762 | -0.6194678 | 0.2914744 | 1.7579642 | -1.3238652 | 0.6861325 | -0.0761270 | -1.2221273 | -0.3582216 | 0.3245047 | -0.1567419 | 1.9434653 | -1.0154547 | 0.0575035 | -0.6497090 | -0.4152666 | -0.0516343 | -1.2069211 | -1.0853392 | 40.80 | 0 |
| 7 | -0.8942861 | 0.2861572 | -0.1131922 | -0.2715261 | 2.6695987 | 3.7218181 | 0.3701451 | 0.8510844 | -0.3920476 | -0.4104304 | -0.7051166 | -0.1104523 | -0.2862536 | 0.0743554 | -0.3287831 | -0.2100773 | -0.4997680 | 0.1187649 | 0.5703282 | 0.0527357 | -0.0734251 | -0.2680916 | -0.2042327 | 1.0115918 | 0.3732047 | -0.3841573 | 0.0117474 | 0.1424043 | 93.20 | 0 |
| 9 | -0.3382618 | 1.1195934 | 1.0443666 | -0.2221873 | 0.4993608 | -0.2467611 | 0.6515832 | 0.0695386 | -0.7367273 | -0.3668456 | 1.0176145 | 0.8363896 | 1.0068435 | -0.4435228 | 0.1502191 | 0.7394528 | -0.5409799 | 0.4766773 | 0.4517730 | 0.2037115 | -0.2469139 | -0.6337526 | -0.1207941 | -0.3850499 | -0.0697330 | 0.0941988 | 0.2462193 | 0.0830756 | 3.68 | 0 |
There are totally 31 columns in the data. One column, Class is the target value; it is a binary value, can have either 0 (not fraud) or 1 (fraud) value. Another two columns have clear meaning: Amount is the amount of the transaction; Time is the time of the transaction. The rest of the features (28), anonymized, are named from V1 to V28. The data is highly unbalanced with respect of Class variable values. There are only 0.1727486% of the rows with value Class = 1. Typically, in such cases, we can either choose to preserve the data unbalancing or use a oversampling (of the data with minority value of target variable) or undersampling (of the data with majority value of the target variable). Here we will just preserve the unbalancing of the data. In terms of validation of the result, we will see that usual metrix, using a confusion matrix or accuracy are not the most relevant and will be prefered alternative solutions using AUC.
We represent the Pearson correlation for the data.
correlations <- cor(raw.data,method="pearson")
corrplot(correlations, number.cex = .9, method = "circle", type = "full", tl.cex=0.8,tl.col = "black") We can observe that most of the data features are not correlated. This is because before publishing, most of the features were presented to a Principal Component Analysis (PCA) algorithm. The features
V1 to V28 are most probably the Principal Components resulted after propagating the real features through PCA. We do not know if the numbering of the features reflects the importance of the Principal Components. This information might be checked partially using the Variable Importance from Random Forest.
After we split the data in a training and test set, we create the RF model using the training set.
nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.7 * nrows)
#separate train and validation set
trainset = raw.data[indexT,]
verset = raw.data[-indexT,]
n <- names(trainset)
rf.form <- as.formula(paste("Class ~", paste(n[!n %in% "Class"], collapse = " + ")))
trainset.rf <- randomForest(rf.form,trainset,ntree=100,importance=T)## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
For the trained model, let’s see visualize the variable importance.
varimp <- data.frame(trainset.rf$importance)
vi1 <- ggplot(varimp, aes(x=reorder(rownames(varimp),IncNodePurity), y=IncNodePurity)) +
geom_bar(stat="identity", fill="tomato", colour="black") +
coord_flip() + theme_bw(base_size = 8) +
labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (IncNodePurity)", x="Variable", y="Variable importance (IncNodePurity)")
vi2 <- ggplot(varimp, aes(x=reorder(rownames(varimp),X.IncMSE), y=X.IncMSE)) +
geom_bar(stat="identity", fill="lightblue", colour="black") +
coord_flip() + theme_bw(base_size = 8) +
labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (%IncMSE)", x="Variable", y="Variable importance (%IncMSE)")
grid.arrange(vi1, vi2, ncol=2)Let’s use the trained model for prediction of the Fraud/Not Fraud Class for the test set.
verset$predicted <- predict(trainset.rf ,verset)For the threshold at 0.5, let’s represent the Confusion matrix.
plot_confusion_matrix(verset, "Random Forest with 100 trees")For such a problem, where the number of TP is very small in comparison with the number of TN, the Confusion Matrix is less useful, since it is important to use a metric that include evaluation of FP and FN as well. It is important to minimize as much as possible the number of FN (Predicted: Not Fraud and True: Fraud) since their cost could be very large. Tipically AUC is used for such cases.
Let’s calculate the TP, FP, TN, FN, ROC, AUC and cost for threshold with values between 0 and 1 (100 values equaly distributed) and cost 1 for TN and 10 for FN.
roc <- calculate_roc(verset, 1, 10, n = 100)
mincost <- min(roc$cost)
roc %>%
mutate(
auc = ifelse(cost == mincost,
cell_spec(sprintf("%.5f", auc), "html", color = "green", background = "lightblue", bold = T),
cell_spec(sprintf("%.5f", auc), "html", color = "black", bold = F))
) %>%
kable( "html", escape=F, align="c") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center") %>%
scroll_box(height = "600px")| threshold | tpr | fpr | tp | fp | tn | fn | cost | auc |
|---|---|---|---|---|---|---|---|---|
| 0.0000000 | 0.9139073 | 0.0259344 | 138 | 2212 | 83080 | 13 | 2342 | 0.94399 |
| 0.0101010 | 0.9006623 | 0.0086995 | 136 | 742 | 84550 | 15 | 892 | 0.94598 |
| 0.0202020 | 0.8940397 | 0.0044553 | 135 | 380 | 84912 | 16 | 540 | 0.94479 |
| 0.0303030 | 0.8874172 | 0.0028373 | 134 | 242 | 85050 | 17 | 412 | 0.94229 |
| 0.0404040 | 0.8874172 | 0.0020401 | 134 | 174 | 85118 | 17 | 344 | 0.94269 |
| 0.0505051 | 0.8874172 | 0.0014656 | 134 | 125 | 85167 | 17 | 295 | 0.94298 |
| 0.0606061 | 0.8741722 | 0.0011724 | 132 | 100 | 85192 | 19 | 290 | 0.93650 |
| 0.0707071 | 0.8741722 | 0.0009849 | 132 | 84 | 85208 | 19 | 274 | 0.93659 |
| 0.0808081 | 0.8741722 | 0.0008324 | 132 | 71 | 85221 | 19 | 261 | 0.93667 |
| 0.0909091 | 0.8675497 | 0.0007386 | 131 | 63 | 85229 | 20 | 263 | 0.93341 |
| 0.1010101 | 0.8675497 | 0.0006097 | 131 | 52 | 85240 | 20 | 252 | 0.93347 |
| 0.1111111 | 0.8675497 | 0.0005159 | 131 | 44 | 85248 | 20 | 244 | 0.93352 |
| 0.1212121 | 0.8675497 | 0.0004924 | 131 | 42 | 85250 | 20 | 242 | 0.93353 |
| 0.1313131 | 0.8609272 | 0.0004455 | 130 | 38 | 85254 | 21 | 248 | 0.93024 |
| 0.1414141 | 0.8609272 | 0.0004455 | 130 | 38 | 85254 | 21 | 248 | 0.93024 |
| 0.1515152 | 0.8609272 | 0.0004104 | 130 | 35 | 85257 | 21 | 245 | 0.93026 |
| 0.1616162 | 0.8609272 | 0.0004104 | 130 | 35 | 85257 | 21 | 245 | 0.93026 |
| 0.1717172 | 0.8609272 | 0.0003986 | 130 | 34 | 85258 | 21 | 244 | 0.93026 |
| 0.1818182 | 0.8609272 | 0.0003517 | 130 | 30 | 85262 | 21 | 240 | 0.93029 |
| 0.1919192 | 0.8609272 | 0.0003517 | 130 | 30 | 85262 | 21 | 240 | 0.93029 |
| 0.2020202 | 0.8609272 | 0.0003048 | 130 | 26 | 85266 | 21 | 236 | 0.93031 |
| 0.2121212 | 0.8609272 | 0.0002931 | 130 | 25 | 85267 | 21 | 235 | 0.93032 |
| 0.2222222 | 0.8609272 | 0.0002697 | 130 | 23 | 85269 | 21 | 233 | 0.93033 |
| 0.2323232 | 0.8609272 | 0.0002697 | 130 | 23 | 85269 | 21 | 233 | 0.93033 |
| 0.2424242 | 0.8609272 | 0.0002579 | 130 | 22 | 85270 | 21 | 232 | 0.93033 |
| 0.2525253 | 0.8609272 | 0.0002462 | 130 | 21 | 85271 | 21 | 231 | 0.93034 |
| 0.2626263 | 0.8609272 | 0.0002228 | 130 | 19 | 85273 | 21 | 229 | 0.93035 |
| 0.2727273 | 0.8609272 | 0.0001993 | 130 | 17 | 85275 | 21 | 227 | 0.93036 |
| 0.2828283 | 0.8609272 | 0.0001876 | 130 | 16 | 85276 | 21 | 226 | 0.93037 |
| 0.2929293 | 0.8609272 | 0.0001759 | 130 | 15 | 85277 | 21 | 225 | 0.93038 |
| 0.3030303 | 0.8609272 | 0.0001759 | 130 | 15 | 85277 | 21 | 225 | 0.93038 |
| 0.3131313 | 0.8609272 | 0.0001759 | 130 | 15 | 85277 | 21 | 225 | 0.93038 |
| 0.3232323 | 0.8609272 | 0.0001641 | 130 | 14 | 85278 | 21 | 224 | 0.93038 |
| 0.3333333 | 0.8543046 | 0.0001641 | 129 | 14 | 85278 | 22 | 234 | 0.92707 |
| 0.3434343 | 0.8476821 | 0.0001641 | 128 | 14 | 85278 | 23 | 244 | 0.92376 |
| 0.3535354 | 0.8410596 | 0.0001524 | 127 | 13 | 85279 | 24 | 253 | 0.92045 |
| 0.3636364 | 0.8410596 | 0.0001407 | 127 | 12 | 85280 | 24 | 252 | 0.92046 |
| 0.3737374 | 0.8410596 | 0.0001290 | 127 | 11 | 85281 | 24 | 251 | 0.92047 |
| 0.3838384 | 0.8410596 | 0.0001290 | 127 | 11 | 85281 | 24 | 251 | 0.92047 |
| 0.3939394 | 0.8410596 | 0.0001290 | 127 | 11 | 85281 | 24 | 251 | 0.92047 |
| 0.4040404 | 0.8410596 | 0.0001290 | 127 | 11 | 85281 | 24 | 251 | 0.92047 |
| 0.4141414 | 0.8410596 | 0.0001172 | 127 | 10 | 85282 | 24 | 250 | 0.92047 |
| 0.4242424 | 0.8410596 | 0.0001172 | 127 | 10 | 85282 | 24 | 250 | 0.92047 |
| 0.4343434 | 0.8410596 | 0.0001172 | 127 | 10 | 85282 | 24 | 250 | 0.92047 |
| 0.4444444 | 0.8410596 | 0.0001055 | 127 | 9 | 85283 | 24 | 249 | 0.92048 |
| 0.4545455 | 0.8410596 | 0.0000938 | 127 | 8 | 85284 | 24 | 248 | 0.92048 |
| 0.4646465 | 0.8410596 | 0.0000938 | 127 | 8 | 85284 | 24 | 248 | 0.92048 |
| 0.4747475 | 0.8410596 | 0.0000938 | 127 | 8 | 85284 | 24 | 248 | 0.92048 |
| 0.4848485 | 0.8410596 | 0.0000938 | 127 | 8 | 85284 | 24 | 248 | 0.92048 |
| 0.4949495 | 0.8344371 | 0.0000821 | 126 | 7 | 85285 | 25 | 257 | 0.91718 |
| 0.5050505 | 0.8278146 | 0.0000586 | 125 | 5 | 85287 | 26 | 265 | 0.91388 |
| 0.5151515 | 0.8278146 | 0.0000586 | 125 | 5 | 85287 | 26 | 265 | 0.91388 |
| 0.5252525 | 0.8211921 | 0.0000586 | 124 | 5 | 85287 | 27 | 275 | 0.91057 |
| 0.5353535 | 0.8079470 | 0.0000586 | 122 | 5 | 85287 | 29 | 295 | 0.90394 |
| 0.5454545 | 0.8079470 | 0.0000586 | 122 | 5 | 85287 | 29 | 295 | 0.90394 |
| 0.5555556 | 0.8079470 | 0.0000586 | 122 | 5 | 85287 | 29 | 295 | 0.90394 |
| 0.5656566 | 0.8079470 | 0.0000586 | 122 | 5 | 85287 | 29 | 295 | 0.90394 |
| 0.5757576 | 0.8079470 | 0.0000469 | 122 | 4 | 85288 | 29 | 294 | 0.90395 |
| 0.5858586 | 0.8079470 | 0.0000469 | 122 | 4 | 85288 | 29 | 294 | 0.90395 |
| 0.5959596 | 0.8013245 | 0.0000352 | 121 | 3 | 85289 | 30 | 303 | 0.90064 |
| 0.6060606 | 0.8013245 | 0.0000352 | 121 | 3 | 85289 | 30 | 303 | 0.90064 |
| 0.6161616 | 0.7947020 | 0.0000352 | 120 | 3 | 85289 | 31 | 313 | 0.89733 |
| 0.6262626 | 0.7947020 | 0.0000352 | 120 | 3 | 85289 | 31 | 313 | 0.89733 |
| 0.6363636 | 0.7880795 | 0.0000352 | 119 | 3 | 85289 | 32 | 323 | 0.89402 |
| 0.6464646 | 0.7814570 | 0.0000352 | 118 | 3 | 85289 | 33 | 333 | 0.89071 |
| 0.6565657 | 0.7814570 | 0.0000352 | 118 | 3 | 85289 | 33 | 333 | 0.89071 |
| 0.6666667 | 0.7748344 | 0.0000352 | 117 | 3 | 85289 | 34 | 343 | 0.88740 |
| 0.6767677 | 0.7615894 | 0.0000352 | 115 | 3 | 85289 | 36 | 363 | 0.88078 |
| 0.6868687 | 0.7549669 | 0.0000352 | 114 | 3 | 85289 | 37 | 373 | 0.87747 |
| 0.6969697 | 0.7549669 | 0.0000352 | 114 | 3 | 85289 | 37 | 373 | 0.87747 |
| 0.7070707 | 0.7483444 | 0.0000352 | 113 | 3 | 85289 | 38 | 383 | 0.87415 |
| 0.7171717 | 0.7350993 | 0.0000352 | 111 | 3 | 85289 | 40 | 403 | 0.86753 |
| 0.7272727 | 0.7350993 | 0.0000352 | 111 | 3 | 85289 | 40 | 403 | 0.86753 |
| 0.7373737 | 0.7284768 | 0.0000352 | 110 | 3 | 85289 | 41 | 413 | 0.86422 |
| 0.7474747 | 0.7284768 | 0.0000352 | 110 | 3 | 85289 | 41 | 413 | 0.86422 |
| 0.7575758 | 0.7284768 | 0.0000352 | 110 | 3 | 85289 | 41 | 413 | 0.86422 |
| 0.7676768 | 0.7086093 | 0.0000352 | 107 | 3 | 85289 | 44 | 443 | 0.85429 |
| 0.7777778 | 0.7086093 | 0.0000234 | 107 | 2 | 85290 | 44 | 442 | 0.85429 |
| 0.7878788 | 0.7019868 | 0.0000234 | 106 | 2 | 85290 | 45 | 452 | 0.85098 |
| 0.7979798 | 0.6821192 | 0.0000234 | 103 | 2 | 85290 | 48 | 482 | 0.84105 |
| 0.8080808 | 0.6622517 | 0.0000234 | 100 | 2 | 85290 | 51 | 512 | 0.83111 |
| 0.8181818 | 0.6291391 | 0.0000234 | 95 | 2 | 85290 | 56 | 562 | 0.81456 |
| 0.8282828 | 0.6158940 | 0.0000234 | 93 | 2 | 85290 | 58 | 582 | 0.80794 |
| 0.8383838 | 0.6026490 | 0.0000234 | 91 | 2 | 85290 | 60 | 602 | 0.80131 |
| 0.8484848 | 0.5960265 | 0.0000117 | 90 | 1 | 85291 | 61 | 611 | 0.79801 |
| 0.8585859 | 0.5827815 | 0.0000117 | 88 | 1 | 85291 | 63 | 631 | 0.79138 |
| 0.8686869 | 0.5562914 | 0.0000117 | 84 | 1 | 85291 | 67 | 671 | 0.77814 |
| 0.8787879 | 0.5364238 | 0.0000117 | 81 | 1 | 85291 | 70 | 701 | 0.76821 |
| 0.8888889 | 0.5231788 | 0.0000117 | 79 | 1 | 85291 | 72 | 721 | 0.76158 |
| 0.8989899 | 0.4900662 | 0.0000000 | 74 | 0 | 85292 | 77 | 770 | 0.74503 |
| 0.9090909 | 0.4701987 | 0.0000000 | 71 | 0 | 85292 | 80 | 800 | 0.73510 |
| 0.9191919 | 0.4635762 | 0.0000000 | 70 | 0 | 85292 | 81 | 810 | 0.73179 |
| 0.9292929 | 0.4503311 | 0.0000000 | 68 | 0 | 85292 | 83 | 830 | 0.72517 |
| 0.9393939 | 0.4039735 | 0.0000000 | 61 | 0 | 85292 | 90 | 900 | 0.70199 |
| 0.9494949 | 0.3443709 | 0.0000000 | 52 | 0 | 85292 | 99 | 990 | 0.67219 |
| 0.9595960 | 0.3178808 | 0.0000000 | 48 | 0 | 85292 | 103 | 1030 | 0.65894 |
| 0.9696970 | 0.3046358 | 0.0000000 | 46 | 0 | 85292 | 105 | 1050 | 0.65232 |
| 0.9797980 | 0.2516556 | 0.0000000 | 38 | 0 | 85292 | 113 | 1130 | 0.62583 |
| 0.9898990 | 0.1589404 | 0.0000000 | 24 | 0 | 85292 | 127 | 1270 | 0.57947 |
| 1.0000000 | 0.0728477 | 0.0000000 | 11 | 0 | 85292 | 140 | 1400 | 0.53642 |
Let’s plot the ROC, AUC and cost functions for a ref. threshold of 0.3.
threshold = 0.3
plot_roc(roc, threshold, 1, 10)The calculated accuracy is not very relevant in the conditions where there is a very large unbalance between the number of fraud and non-fraud events in the dataset. In such cases, we can see a very large accuracy. More relevant is the value of ROC-AUC (Area Under Curve for the Receiver Operator Characteristic). The value obtained (0.93) is relativelly good, considering that we did not performed any tunning, working with default RF algorithm parameters.
[1] Receiver Operating Characteristic, https://en.wikipedia.org/wiki/Receiver_operating_characteristic
[2] Area under curve, https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
[3] Computing and visualizing PCA in R, https://www.r-bloggers.com/computing-and-visualizing-pca-in-r/