Introduction

This kernel uses and modifies code from https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/.

We input the data.

raw.data <- read.csv("creditcard.csv")

Utility functions

We will use a function for calculation of ROC and AUC.

#calculate ROC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
calculate_roc <- function(verset, cost_of_fp, cost_of_fn, n=100) {
  
  tp <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 1)
  }
  
  fp <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0)
  }
  
  tn <- function(verset, threshold) {
    sum(verset$predicted < threshold & verset$Class == 0)
  }
  
  fn <- function(verset, threshold) {
    sum(verset$predicted < threshold & verset$Class == 1)
  }
  
  tpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 1) / sum(verset$Class == 1)
  }
  
  fpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
  }
  
  cost <- function(verset, threshold, cost_of_fp, cost_of_fn) {
    sum(verset$predicted >= threshold & verset$Class == 0) * cost_of_fp + 
      sum(verset$predicted < threshold & verset$Class == 1) * cost_of_fn
  }
  fpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
  }
  
  threshold_round <- function(value, threshold)
  {
    return (as.integer(!(value < threshold)))
  }
  #calculate AUC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)
  auc_ <- function(verset, threshold) {
    auc(verset$Class, threshold_round(verset$predicted,threshold))
  }
  
  roc <- data.frame(threshold = seq(0,1,length.out=n), tpr=NA, fpr=NA)
  roc$tp <- sapply(roc$threshold, function(th) tp(verset, th))
  roc$fp <- sapply(roc$threshold, function(th) fp(verset, th))
  roc$tn <- sapply(roc$threshold, function(th) tn(verset, th))
  roc$fn <- sapply(roc$threshold, function(th) fn(verset, th))
  roc$tpr <- sapply(roc$threshold, function(th) tpr(verset, th))
  roc$fpr <- sapply(roc$threshold, function(th) fpr(verset, th))
  roc$cost <- sapply(roc$threshold, function(th) cost(verset, th, cost_of_fp, cost_of_fn))
  roc$auc <-  sapply(roc$threshold, function(th) auc_(verset, th))
  
  return(roc)
}

The following functions is for graphical representation of ROC, AUC and cost function.

plot_roc <- function(roc, threshold, cost_of_fp, cost_of_fn) {
  library(gridExtra)
  
  norm_vec <- function(v) (v - min(v))/diff(range(v))
  
  idx_threshold = which.min(abs(roc$threshold-threshold))
  
  col_ramp <- colorRampPalette(c("green","orange","red","black"))(100)
  col_by_cost <- col_ramp[ceiling(norm_vec(roc$cost)*99)+1]
  p_roc <- ggplot(roc, aes(fpr,tpr)) + 
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("ROC")) + xlab("FPR") + ylab("TPR") +
    geom_hline(yintercept=roc[idx_threshold,"tpr"], alpha=0.5, linetype="dashed") +
    geom_vline(xintercept=roc[idx_threshold,"fpr"], alpha=0.5, linetype="dashed")
  
  p_auc <- ggplot(roc, aes(threshold, auc)) +
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("AUC")) +
    geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
  
  p_cost <- ggplot(roc, aes(threshold, cost)) +
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("cost function")) +
    geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
  
  sub_title <- sprintf("threshold at %.2f - cost of FP = %d, cost of FN = %d", threshold, cost_of_fp, cost_of_fn)
  # 
  grid.arrange(p_roc, p_auc, p_cost, ncol=2,sub=textGrob(sub_title, gp=gpar(cex=1), just="bottom"))
}

The follwing function is for showing a confusion matrix.

plot_confusion_matrix <- function(verset, sSubtitle) {
    tst <- data.frame(round(verset$predicted,0), verset$Class)
    opts <-  c("Predicted", "True")
    names(tst) <- opts
    cf <- plyr::count(tst)
    cf[opts][cf[opts]==0] <- "Not Fraud"
    cf[opts][cf[opts]==1] <- "Fraud"
    
    ggplot(data =  cf, mapping = aes(x = True, y = Predicted)) +
      labs(title = "Confusion matrix", subtitle = sSubtitle) +
      geom_tile(aes(fill = freq), colour = "grey") +
      geom_text(aes(label = sprintf("%1.0f", freq)), vjust = 1) +
      scale_fill_gradient(low = "lightblue", high = "blue") +
      theme_bw() + theme(legend.position = "none")
  
}

The function will be called to compare the predicted value with the real values.

Explore the data

Let’s glimpse the data:

Columns and rows:

sprintf("Rows: %d Columns: %d",nrow(raw.data), length(names(raw.data)))
## [1] "Rows: 284807 Columns: 31"

See first few rows of data:

head(raw.data,10) %>%
  kable( "html", escape=F, align="c") %>%
   kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 -1.3598071 -0.0727812 2.5363467 1.3781552 -0.3383208 0.4623878 0.2395986 0.0986979 0.3637870 0.0907942 -0.5515995 -0.6178009 -0.9913898 -0.3111694 1.4681770 -0.4704005 0.2079712 0.0257906 0.4039930 0.2514121 -0.0183068 0.2778376 -0.1104739 0.0669281 0.1285394 -0.1891148 0.1335584 -0.0210531 149.62 0
0 1.1918571 0.2661507 0.1664801 0.4481541 0.0600176 -0.0823608 -0.0788030 0.0851017 -0.2554251 -0.1669744 1.6127267 1.0652353 0.4890950 -0.1437723 0.6355581 0.4639170 -0.1148047 -0.1833613 -0.1457830 -0.0690831 -0.2257752 -0.6386720 0.1012880 -0.3398465 0.1671704 0.1258945 -0.0089831 0.0147242 2.69 0
1 -1.3583541 -1.3401631 1.7732093 0.3797796 -0.5031981 1.8004994 0.7914610 0.2476758 -1.5146543 0.2076429 0.6245015 0.0660837 0.7172927 -0.1659459 2.3458649 -2.8900832 1.1099694 -0.1213593 -2.2618571 0.5249797 0.2479982 0.7716794 0.9094123 -0.6892810 -0.3276418 -0.1390966 -0.0553528 -0.0597518 378.66 0
1 -0.9662717 -0.1852260 1.7929933 -0.8632913 -0.0103089 1.2472032 0.2376089 0.3774359 -1.3870241 -0.0549519 -0.2264873 0.1782282 0.5077569 -0.2879237 -0.6314181 -1.0596472 -0.6840928 1.9657750 -1.2326220 -0.2080378 -0.1083005 0.0052736 -0.1903205 -1.1755753 0.6473760 -0.2219288 0.0627228 0.0614576 123.50 0
2 -1.1582331 0.8777368 1.5487178 0.4030339 -0.4071934 0.0959215 0.5929407 -0.2705327 0.8177393 0.7530744 -0.8228429 0.5381956 1.3458516 -1.1196698 0.1751211 -0.4514492 -0.2370332 -0.0381948 0.8034869 0.4085424 -0.0094307 0.7982785 -0.1374581 0.1412670 -0.2060096 0.5022922 0.2194222 0.2151531 69.99 0
2 -0.4259659 0.9605230 1.1411093 -0.1682521 0.4209869 -0.0297276 0.4762009 0.2603143 -0.5686714 -0.3714072 1.3412620 0.3598938 -0.3580907 -0.1371337 0.5176168 0.4017259 -0.0581328 0.0686531 -0.0331938 0.0849677 -0.2082535 -0.5598248 -0.0263977 -0.3714266 -0.2327938 0.1059148 0.2538442 0.0810803 3.67 0
4 1.2296576 0.1410035 0.0453708 1.2026127 0.1918810 0.2727081 -0.0051590 0.0812129 0.4649600 -0.0992543 -1.4169072 -0.1538258 -0.7510627 0.1673720 0.0501436 -0.4435868 0.0028205 -0.6119873 -0.0455750 -0.2196326 -0.1677163 -0.2707097 -0.1541038 -0.7800554 0.7501369 -0.2572368 0.0345074 0.0051678 4.99 0
7 -0.6442694 1.4179635 1.0743804 -0.4921990 0.9489341 0.4281185 1.1206314 -3.8078642 0.6153747 1.2493762 -0.6194678 0.2914744 1.7579642 -1.3238652 0.6861325 -0.0761270 -1.2221273 -0.3582216 0.3245047 -0.1567419 1.9434653 -1.0154547 0.0575035 -0.6497090 -0.4152666 -0.0516343 -1.2069211 -1.0853392 40.80 0
7 -0.8942861 0.2861572 -0.1131922 -0.2715261 2.6695987 3.7218181 0.3701451 0.8510844 -0.3920476 -0.4104304 -0.7051166 -0.1104523 -0.2862536 0.0743554 -0.3287831 -0.2100773 -0.4997680 0.1187649 0.5703282 0.0527357 -0.0734251 -0.2680916 -0.2042327 1.0115918 0.3732047 -0.3841573 0.0117474 0.1424043 93.20 0
9 -0.3382618 1.1195934 1.0443666 -0.2221873 0.4993608 -0.2467611 0.6515832 0.0695386 -0.7367273 -0.3668456 1.0176145 0.8363896 1.0068435 -0.4435228 0.1502191 0.7394528 -0.5409799 0.4766773 0.4517730 0.2037115 -0.2469139 -0.6337526 -0.1207941 -0.3850499 -0.0697330 0.0941988 0.2462193 0.0830756 3.68 0

There are totally 31 columns in the data. One column, Class is the target value; it is a binary value, can have either 0 (not fraud) or 1 (fraud) value. Another two columns have clear meaning: Amount is the amount of the transaction; Time is the time of the transaction. The rest of the features (28), anonymized, are named from V1 to V28. The data is highly unbalanced with respect of Class variable values. There are only 0.1727486% of the rows with value Class = 1. Typically, in such cases, we can either choose to preserve the data unbalancing or use a oversampling (of the data with minority value of target variable) or undersampling (of the data with majority value of the target variable). Here we will just preserve the unbalancing of the data. In terms of validation of the result, we will see that usual metrix, using a confusion matrix or accuracy are not the most relevant and will be prefered alternative solutions using AUC.

Correlations

We represent the Pearson correlation for the data.

correlations <- cor(raw.data,method="pearson")
corrplot(correlations, number.cex = .9, method = "circle", type = "full", tl.cex=0.8,tl.col = "black")

We can observe that most of the data features are not correlated. This is because before publishing, most of the features were presented to a Principal Component Analysis (PCA) algorithm. The features V1 to V28 are most probably the Principal Components resulted after propagating the real features through PCA. We do not know if the numbering of the features reflects the importance of the Principal Components. This information might be checked partially using the Variable Importance from Random Forest.

Model

After we split the data in a training and test set, we create the RF model using the training set.

nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.7 * nrows)

#separate train and validation set
trainset = raw.data[indexT,]
verset =   raw.data[-indexT,]

n <- names(trainset)
rf.form <- as.formula(paste("Class ~", paste(n[!n %in% "Class"], collapse = " + ")))

trainset.rf <- randomForest(rf.form,trainset,ntree=100,importance=T)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

For the trained model, let’s see visualize the variable importance.

varimp <- data.frame(trainset.rf$importance)

  vi1 <- ggplot(varimp, aes(x=reorder(rownames(varimp),IncNodePurity), y=IncNodePurity)) +
  geom_bar(stat="identity", fill="tomato", colour="black") +
  coord_flip() + theme_bw(base_size = 8) +
  labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (IncNodePurity)", x="Variable", y="Variable importance (IncNodePurity)")
  
  vi2 <- ggplot(varimp, aes(x=reorder(rownames(varimp),X.IncMSE), y=X.IncMSE)) +
  geom_bar(stat="identity", fill="lightblue", colour="black") +
  coord_flip() + theme_bw(base_size = 8) +
  labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (%IncMSE)", x="Variable", y="Variable importance (%IncMSE)")

grid.arrange(vi1, vi2, ncol=2)

Prediction

Let’s use the trained model for prediction of the Fraud/Not Fraud Class for the test set.

verset$predicted <- predict(trainset.rf ,verset)

For the threshold at 0.5, let’s represent the Confusion matrix.

plot_confusion_matrix(verset, "Random Forest with 100 trees")

For such a problem, where the number of TP is very small in comparison with the number of TN, the Confusion Matrix is less useful, since it is important to use a metric that include evaluation of FP and FN as well. It is important to minimize as much as possible the number of FN (Predicted: Not Fraud and True: Fraud) since their cost could be very large. Tipically AUC is used for such cases.

Let’s calculate the TP, FP, TN, FN, ROC, AUC and cost for threshold with values between 0 and 1 (100 values equaly distributed) and cost 1 for TN and 10 for FN.

roc <- calculate_roc(verset, 1, 10, n = 100)

mincost <- min(roc$cost)
roc %>%
 mutate(
   auc = ifelse(cost == mincost,
                  cell_spec(sprintf("%.5f", auc), "html", color = "green", background = "lightblue", bold = T),
                  cell_spec(sprintf("%.5f", auc), "html", color = "black", bold = F))
  ) %>%
  kable( "html", escape=F, align="c") %>%
   kable_styling(bootstrap_options = "striped", full_width = F, position = "center") %>%
   scroll_box(height = "600px")
threshold tpr fpr tp fp tn fn cost auc
0.0000000 0.9139073 0.0259344 138 2212 83080 13 2342 0.94399
0.0101010 0.9006623 0.0086995 136 742 84550 15 892 0.94598
0.0202020 0.8940397 0.0044553 135 380 84912 16 540 0.94479
0.0303030 0.8874172 0.0028373 134 242 85050 17 412 0.94229
0.0404040 0.8874172 0.0020401 134 174 85118 17 344 0.94269
0.0505051 0.8874172 0.0014656 134 125 85167 17 295 0.94298
0.0606061 0.8741722 0.0011724 132 100 85192 19 290 0.93650
0.0707071 0.8741722 0.0009849 132 84 85208 19 274 0.93659
0.0808081 0.8741722 0.0008324 132 71 85221 19 261 0.93667
0.0909091 0.8675497 0.0007386 131 63 85229 20 263 0.93341
0.1010101 0.8675497 0.0006097 131 52 85240 20 252 0.93347
0.1111111 0.8675497 0.0005159 131 44 85248 20 244 0.93352
0.1212121 0.8675497 0.0004924 131 42 85250 20 242 0.93353
0.1313131 0.8609272 0.0004455 130 38 85254 21 248 0.93024
0.1414141 0.8609272 0.0004455 130 38 85254 21 248 0.93024
0.1515152 0.8609272 0.0004104 130 35 85257 21 245 0.93026
0.1616162 0.8609272 0.0004104 130 35 85257 21 245 0.93026
0.1717172 0.8609272 0.0003986 130 34 85258 21 244 0.93026
0.1818182 0.8609272 0.0003517 130 30 85262 21 240 0.93029
0.1919192 0.8609272 0.0003517 130 30 85262 21 240 0.93029
0.2020202 0.8609272 0.0003048 130 26 85266 21 236 0.93031
0.2121212 0.8609272 0.0002931 130 25 85267 21 235 0.93032
0.2222222 0.8609272 0.0002697 130 23 85269 21 233 0.93033
0.2323232 0.8609272 0.0002697 130 23 85269 21 233 0.93033
0.2424242 0.8609272 0.0002579 130 22 85270 21 232 0.93033
0.2525253 0.8609272 0.0002462 130 21 85271 21 231 0.93034
0.2626263 0.8609272 0.0002228 130 19 85273 21 229 0.93035
0.2727273 0.8609272 0.0001993 130 17 85275 21 227 0.93036
0.2828283 0.8609272 0.0001876 130 16 85276 21 226 0.93037
0.2929293 0.8609272 0.0001759 130 15 85277 21 225 0.93038
0.3030303 0.8609272 0.0001759 130 15 85277 21 225 0.93038
0.3131313 0.8609272 0.0001759 130 15 85277 21 225 0.93038
0.3232323 0.8609272 0.0001641 130 14 85278 21 224 0.93038
0.3333333 0.8543046 0.0001641 129 14 85278 22 234 0.92707
0.3434343 0.8476821 0.0001641 128 14 85278 23 244 0.92376
0.3535354 0.8410596 0.0001524 127 13 85279 24 253 0.92045
0.3636364 0.8410596 0.0001407 127 12 85280 24 252 0.92046
0.3737374 0.8410596 0.0001290 127 11 85281 24 251 0.92047
0.3838384 0.8410596 0.0001290 127 11 85281 24 251 0.92047
0.3939394 0.8410596 0.0001290 127 11 85281 24 251 0.92047
0.4040404 0.8410596 0.0001290 127 11 85281 24 251 0.92047
0.4141414 0.8410596 0.0001172 127 10 85282 24 250 0.92047
0.4242424 0.8410596 0.0001172 127 10 85282 24 250 0.92047
0.4343434 0.8410596 0.0001172 127 10 85282 24 250 0.92047
0.4444444 0.8410596 0.0001055 127 9 85283 24 249 0.92048
0.4545455 0.8410596 0.0000938 127 8 85284 24 248 0.92048
0.4646465 0.8410596 0.0000938 127 8 85284 24 248 0.92048
0.4747475 0.8410596 0.0000938 127 8 85284 24 248 0.92048
0.4848485 0.8410596 0.0000938 127 8 85284 24 248 0.92048
0.4949495 0.8344371 0.0000821 126 7 85285 25 257 0.91718
0.5050505 0.8278146 0.0000586 125 5 85287 26 265 0.91388
0.5151515 0.8278146 0.0000586 125 5 85287 26 265 0.91388
0.5252525 0.8211921 0.0000586 124 5 85287 27 275 0.91057
0.5353535 0.8079470 0.0000586 122 5 85287 29 295 0.90394
0.5454545 0.8079470 0.0000586 122 5 85287 29 295 0.90394
0.5555556 0.8079470 0.0000586 122 5 85287 29 295 0.90394
0.5656566 0.8079470 0.0000586 122 5 85287 29 295 0.90394
0.5757576 0.8079470 0.0000469 122 4 85288 29 294 0.90395
0.5858586 0.8079470 0.0000469 122 4 85288 29 294 0.90395
0.5959596 0.8013245 0.0000352 121 3 85289 30 303 0.90064
0.6060606 0.8013245 0.0000352 121 3 85289 30 303 0.90064
0.6161616 0.7947020 0.0000352 120 3 85289 31 313 0.89733
0.6262626 0.7947020 0.0000352 120 3 85289 31 313 0.89733
0.6363636 0.7880795 0.0000352 119 3 85289 32 323 0.89402
0.6464646 0.7814570 0.0000352 118 3 85289 33 333 0.89071
0.6565657 0.7814570 0.0000352 118 3 85289 33 333 0.89071
0.6666667 0.7748344 0.0000352 117 3 85289 34 343 0.88740
0.6767677 0.7615894 0.0000352 115 3 85289 36 363 0.88078
0.6868687 0.7549669 0.0000352 114 3 85289 37 373 0.87747
0.6969697 0.7549669 0.0000352 114 3 85289 37 373 0.87747
0.7070707 0.7483444 0.0000352 113 3 85289 38 383 0.87415
0.7171717 0.7350993 0.0000352 111 3 85289 40 403 0.86753
0.7272727 0.7350993 0.0000352 111 3 85289 40 403 0.86753
0.7373737 0.7284768 0.0000352 110 3 85289 41 413 0.86422
0.7474747 0.7284768 0.0000352 110 3 85289 41 413 0.86422
0.7575758 0.7284768 0.0000352 110 3 85289 41 413 0.86422
0.7676768 0.7086093 0.0000352 107 3 85289 44 443 0.85429
0.7777778 0.7086093 0.0000234 107 2 85290 44 442 0.85429
0.7878788 0.7019868 0.0000234 106 2 85290 45 452 0.85098
0.7979798 0.6821192 0.0000234 103 2 85290 48 482 0.84105
0.8080808 0.6622517 0.0000234 100 2 85290 51 512 0.83111
0.8181818 0.6291391 0.0000234 95 2 85290 56 562 0.81456
0.8282828 0.6158940 0.0000234 93 2 85290 58 582 0.80794
0.8383838 0.6026490 0.0000234 91 2 85290 60 602 0.80131
0.8484848 0.5960265 0.0000117 90 1 85291 61 611 0.79801
0.8585859 0.5827815 0.0000117 88 1 85291 63 631 0.79138
0.8686869 0.5562914 0.0000117 84 1 85291 67 671 0.77814
0.8787879 0.5364238 0.0000117 81 1 85291 70 701 0.76821
0.8888889 0.5231788 0.0000117 79 1 85291 72 721 0.76158
0.8989899 0.4900662 0.0000000 74 0 85292 77 770 0.74503
0.9090909 0.4701987 0.0000000 71 0 85292 80 800 0.73510
0.9191919 0.4635762 0.0000000 70 0 85292 81 810 0.73179
0.9292929 0.4503311 0.0000000 68 0 85292 83 830 0.72517
0.9393939 0.4039735 0.0000000 61 0 85292 90 900 0.70199
0.9494949 0.3443709 0.0000000 52 0 85292 99 990 0.67219
0.9595960 0.3178808 0.0000000 48 0 85292 103 1030 0.65894
0.9696970 0.3046358 0.0000000 46 0 85292 105 1050 0.65232
0.9797980 0.2516556 0.0000000 38 0 85292 113 1130 0.62583
0.9898990 0.1589404 0.0000000 24 0 85292 127 1270 0.57947
1.0000000 0.0728477 0.0000000 11 0 85292 140 1400 0.53642

Let’s plot the ROC, AUC and cost functions for a ref. threshold of 0.3.

threshold = 0.3
plot_roc(roc, threshold, 1, 10)

Conclusions

The calculated accuracy is not very relevant in the conditions where there is a very large unbalance between the number of fraud and non-fraud events in the dataset. In such cases, we can see a very large accuracy. More relevant is the value of ROC-AUC (Area Under Curve for the Receiver Operator Characteristic). The value obtained (0.93) is relativelly good, considering that we did not performed any tunning, working with default RF algorithm parameters.

References

[1] Receiver Operating Characteristic, https://en.wikipedia.org/wiki/Receiver_operating_characteristic
[2] Area under curve, https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
[3] Computing and visualizing PCA in R, https://www.r-bloggers.com/computing-and-visualizing-pca-in-r/