Introduction

This kernel uses and modifies code from https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/.

We input the data.

raw.data <- read.csv("creditcard.csv")

Utility functions

We will use a function for calculation of ROC and AUC.

#calculate ROC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
calculate_roc <- function(verset, cost_of_fp, cost_of_fn, n=100) {
  
  tp <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 1)
  }
  
  fp <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0)
  }
  
  tn <- function(verset, threshold) {
    sum(verset$predicted < threshold & verset$Class == 0)
  }
  
  fn <- function(verset, threshold) {
    sum(verset$predicted < threshold & verset$Class == 1)
  }
  
  tpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 1) / sum(verset$Class == 1)
  }
  
  fpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
  }
  
  cost <- function(verset, threshold, cost_of_fp, cost_of_fn) {
    sum(verset$predicted >= threshold & verset$Class == 0) * cost_of_fp + 
      sum(verset$predicted < threshold & verset$Class == 1) * cost_of_fn
  }
  fpr <- function(verset, threshold) {
    sum(verset$predicted >= threshold & verset$Class == 0) / sum(verset$Class == 0)
  }
  
  threshold_round <- function(value, threshold)
  {
    return (as.integer(!(value < threshold)))
  }
  #calculate AUC (https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)
  auc_ <- function(verset, threshold) {
    auc(verset$Class, threshold_round(verset$predicted,threshold))
  }
  
  roc <- data.frame(threshold = seq(0,1,length.out=n), tpr=NA, fpr=NA)
  roc$tp <- sapply(roc$threshold, function(th) tp(verset, th))
  roc$fp <- sapply(roc$threshold, function(th) fp(verset, th))
  roc$tn <- sapply(roc$threshold, function(th) tn(verset, th))
  roc$fn <- sapply(roc$threshold, function(th) fn(verset, th))
  roc$tpr <- sapply(roc$threshold, function(th) tpr(verset, th))
  roc$fpr <- sapply(roc$threshold, function(th) fpr(verset, th))
  roc$cost <- sapply(roc$threshold, function(th) cost(verset, th, cost_of_fp, cost_of_fn))
  roc$auc <-  sapply(roc$threshold, function(th) auc_(verset, th))
  
  return(roc)
}

The following functions is for graphical representation of ROC, AUC and cost function.

plot_roc <- function(roc, threshold, cost_of_fp, cost_of_fn) {
  library(gridExtra)
  
  norm_vec <- function(v) (v - min(v))/diff(range(v))
  
  idx_threshold = which.min(abs(roc$threshold-threshold))
  
  col_ramp <- colorRampPalette(c("green","orange","red","black"))(100)
  col_by_cost <- col_ramp[ceiling(norm_vec(roc$cost)*99)+1]
  p_roc <- ggplot(roc, aes(fpr,tpr)) + 
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("ROC")) + xlab("FPR") + ylab("TPR") +
    geom_hline(yintercept=roc[idx_threshold,"tpr"], alpha=0.5, linetype="dashed") +
    geom_vline(xintercept=roc[idx_threshold,"fpr"], alpha=0.5, linetype="dashed")
  
  p_auc <- ggplot(roc, aes(threshold, auc)) +
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("AUC")) +
    geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
  
  p_cost <- ggplot(roc, aes(threshold, cost)) +
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=2, alpha=0.5) +
    labs(title = sprintf("cost function")) +
    geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
  
  sub_title <- sprintf("threshold at %.2f - cost of FP = %d, cost of FN = %d", threshold, cost_of_fp, cost_of_fn)
  # 
  grid.arrange(p_roc, p_auc, p_cost, ncol=2,sub=textGrob(sub_title, gp=gpar(cex=1), just="bottom"))
}

The follwing function is for showing a confusion matrix.

plot_confusion_matrix <- function(verset, sSubtitle) {
    tst <- data.frame(round(verset$predicted,0), verset$Class)
    opts <-  c("Predicted", "True")
    names(tst) <- opts
    cf <- plyr::count(tst)
    cf[opts][cf[opts]==0] <- "Not Fraud"
    cf[opts][cf[opts]==1] <- "Fraud"
    
    ggplot(data =  cf, mapping = aes(x = True, y = Predicted)) +
      labs(title = "Confusion matrix", subtitle = sSubtitle) +
      geom_tile(aes(fill = freq), colour = "grey") +
      geom_text(aes(label = sprintf("%1.0f", freq)), vjust = 1) +
      scale_fill_gradient(low = "lightblue", high = "blue") +
      theme_bw() + theme(legend.position = "none")
  
}

The function will be called to compare the predicted value with the real values.

Explore the data

Let’s glimpse the data:

Columns and rows:

sprintf("Rows: %d Columns: %d",nrow(raw.data), length(names(raw.data)))

## [1] "Rows: 284807 Columns: 31"

See first few rows of data:

head(raw.data,10) %>%
  kable( "html", escape=F, align="c") %>%
   kable_styling(bootstrap_options = "striped", full_width = F, position = "center")

Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	-1.3598071	-0.0727812	2.5363467	1.3781552	-0.3383208	0.4623878	0.2395986	0.0986979	0.3637870	0.0907942	-0.5515995	-0.6178009	-0.9913898	-0.3111694	1.4681770	-0.4704005	0.2079712	0.0257906	0.4039930	0.2514121	-0.0183068	0.2778376	-0.1104739	0.0669281	0.1285394	-0.1891148	0.1335584	-0.0210531	149.62
0	1.1918571	0.2661507	0.1664801	0.4481541	0.0600176	-0.0823608	-0.0788030	0.0851017	-0.2554251	-0.1669744	1.6127267	1.0652353	0.4890950	-0.1437723	0.6355581	0.4639170	-0.1148047	-0.1833613	-0.1457830	-0.0690831	-0.2257752	-0.6386720	0.1012880	-0.3398465	0.1671704	0.1258945	-0.0089831	0.0147242	2.69
1	-1.3583541	-1.3401631	1.7732093	0.3797796	-0.5031981	1.8004994	0.7914610	0.2476758	-1.5146543	0.2076429	0.6245015	0.0660837	0.7172927	-0.1659459	2.3458649	-2.8900832	1.1099694	-0.1213593	-2.2618571	0.5249797	0.2479982	0.7716794	0.9094123	-0.6892810	-0.3276418	-0.1390966	-0.0553528	-0.0597518	378.66
1	-0.9662717	-0.1852260	1.7929933	-0.8632913	-0.0103089	1.2472032	0.2376089	0.3774359	-1.3870241	-0.0549519	-0.2264873	0.1782282	0.5077569	-0.2879237	-0.6314181	-1.0596472	-0.6840928	1.9657750	-1.2326220	-0.2080378	-0.1083005	0.0052736	-0.1903205	-1.1755753	0.6473760	-0.2219288	0.0627228	0.0614576	123.50
2	-1.1582331	0.8777368	1.5487178	0.4030339	-0.4071934	0.0959215	0.5929407	-0.2705327	0.8177393	0.7530744	-0.8228429	0.5381956	1.3458516	-1.1196698	0.1751211	-0.4514492	-0.2370332	-0.0381948	0.8034869	0.4085424	-0.0094307	0.7982785	-0.1374581	0.1412670	-0.2060096	0.5022922	0.2194222	0.2151531	69.99
2	-0.4259659	0.9605230	1.1411093	-0.1682521	0.4209869	-0.0297276	0.4762009	0.2603143	-0.5686714	-0.3714072	1.3412620	0.3598938	-0.3580907	-0.1371337	0.5176168	0.4017259	-0.0581328	0.0686531	-0.0331938	0.0849677	-0.2082535	-0.5598248	-0.0263977	-0.3714266	-0.2327938	0.1059148	0.2538442	0.0810803	3.67
4	1.2296576	0.1410035	0.0453708	1.2026127	0.1918810	0.2727081	-0.0051590	0.0812129	0.4649600	-0.0992543	-1.4169072	-0.1538258	-0.7510627	0.1673720	0.0501436	-0.4435868	0.0028205	-0.6119873	-0.0455750	-0.2196326	-0.1677163	-0.2707097	-0.1541038	-0.7800554	0.7501369	-0.2572368	0.0345074	0.0051678	4.99
7	-0.6442694	1.4179635	1.0743804	-0.4921990	0.9489341	0.4281185	1.1206314	-3.8078642	0.6153747	1.2493762	-0.6194678	0.2914744	1.7579642	-1.3238652	0.6861325	-0.0761270	-1.2221273	-0.3582216	0.3245047	-0.1567419	1.9434653	-1.0154547	0.0575035	-0.6497090	-0.4152666	-0.0516343	-1.2069211	-1.0853392	40.80
7	-0.8942861	0.2861572	-0.1131922	-0.2715261	2.6695987	3.7218181	0.3701451	0.8510844	-0.3920476	-0.4104304	-0.7051166	-0.1104523	-0.2862536	0.0743554	-0.3287831	-0.2100773	-0.4997680	0.1187649	0.5703282	0.0527357	-0.0734251	-0.2680916	-0.2042327	1.0115918	0.3732047	-0.3841573	0.0117474	0.1424043	93.20
9	-0.3382618	1.1195934	1.0443666	-0.2221873	0.4993608	-0.2467611	0.6515832	0.0695386	-0.7367273	-0.3668456	1.0176145	0.8363896	1.0068435	-0.4435228	0.1502191	0.7394528	-0.5409799	0.4766773	0.4517730	0.2037115	-0.2469139	-0.6337526	-0.1207941	-0.3850499	-0.0697330	0.0941988	0.2462193	0.0830756	3.68

There are totally 31 columns in the data. One column, Class is the target value; it is a binary value, can have either 0 (not fraud) or 1 (fraud) value. Another two columns have clear meaning: Amount is the amount of the transaction; Time is the time of the transaction. The rest of the features (28), anonymized, are named from V1 to V28. The data is highly unbalanced with respect of Class variable values. There are only 0.1727486% of the rows with value Class = 1. Typically, in such cases, we can either choose to preserve the data unbalancing or use a oversampling (of the data with minority value of target variable) or undersampling (of the data with majority value of the target variable). Here we will just preserve the unbalancing of the data. In terms of validation of the result, we will see that usual metrix, using a confusion matrix or accuracy are not the most relevant and will be prefered alternative solutions using AUC.

Correlations

We represent the Pearson correlation for the data.

correlations <- cor(raw.data,method="pearson")
corrplot(correlations, number.cex = .9, method = "circle", type = "full", tl.cex=0.8,tl.col = "black")

We can observe that most of the data features are not correlated. This is because before publishing, most of the features were presented to a Principal Component Analysis (PCA) algorithm. The features V1 to V28 are most probably the Principal Components resulted after propagating the real features through PCA. We do not know if the numbering of the features reflects the importance of the Principal Components. This information might be checked partially using the Variable Importance from Random Forest.

Model

After we split the data in a training and test set, we create the RF model using the training set.

nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.7 * nrows)

#separate train and validation set
trainset = raw.data[indexT,]
verset =   raw.data[-indexT,]

n <- names(trainset)
rf.form <- as.formula(paste("Class ~", paste(n[!n %in% "Class"], collapse = " + ")))

trainset.rf <- randomForest(rf.form,trainset,ntree=100,importance=T)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

For the trained model, let’s see visualize the variable importance.

varimp <- data.frame(trainset.rf$importance)

  vi1 <- ggplot(varimp, aes(x=reorder(rownames(varimp),IncNodePurity), y=IncNodePurity)) +
  geom_bar(stat="identity", fill="tomato", colour="black") +
  coord_flip() + theme_bw(base_size = 8) +
  labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (IncNodePurity)", x="Variable", y="Variable importance (IncNodePurity)")
  
  vi2 <- ggplot(varimp, aes(x=reorder(rownames(varimp),X.IncMSE), y=X.IncMSE)) +
  geom_bar(stat="identity", fill="lightblue", colour="black") +
  coord_flip() + theme_bw(base_size = 8) +
  labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (%IncMSE)", x="Variable", y="Variable importance (%IncMSE)")

grid.arrange(vi1, vi2, ncol=2)

Prediction

Let’s use the trained model for prediction of the Fraud/Not Fraud Class for the test set.

verset$predicted <- predict(trainset.rf ,verset)

For the threshold at 0.5, let’s represent the Confusion matrix.

plot_confusion_matrix(verset, "Random Forest with 100 trees")

For such a problem, where the number of TP is very small in comparison with the number of TN, the Confusion Matrix is less useful, since it is important to use a metric that include evaluation of FP and FN as well. It is important to minimize as much as possible the number of FN (Predicted: Not Fraud and True: Fraud) since their cost could be very large. Tipically AUC is used for such cases.

Let’s calculate the TP, FP, TN, FN, ROC, AUC and cost for threshold with values between 0 and 1 (100 values equaly distributed) and cost 1 for TN and 10 for FN.

roc <- calculate_roc(verset, 1, 10, n = 100)

mincost <- min(roc$cost)
roc %>%
 mutate(
   auc = ifelse(cost == mincost,
                  cell_spec(sprintf("%.5f", auc), "html", color = "green", background = "lightblue", bold = T),
                  cell_spec(sprintf("%.5f", auc), "html", color = "black", bold = F))
  ) %>%
  kable( "html", escape=F, align="c") %>%
   kable_styling(bootstrap_options = "striped", full_width = F, position = "center") %>%
   scroll_box(height = "600px")

threshold	tpr	fpr	tp	fp	tn	fn	cost	auc
0.0000000	0.9139073	0.0259344	138	2212	83080	13	2342	0.94399
0.0101010	0.9006623	0.0086995	136	742	84550	15	892	0.94598
0.0202020	0.8940397	0.0044553	135	380	84912	16	540	0.94479
0.0303030	0.8874172	0.0028373	134	242	85050	17	412	0.94229
0.0404040	0.8874172	0.0020401	134	174	85118	17	344	0.94269
0.0505051	0.8874172	0.0014656	134	125	85167	17	295	0.94298
0.0606061	0.8741722	0.0011724	132	100	85192	19	290	0.93650
0.0707071	0.8741722	0.0009849	132	84	85208	19	274	0.93659
0.0808081	0.8741722	0.0008324	132	71	85221	19	261	0.93667
0.0909091	0.8675497	0.0007386	131	63	85229	20	263	0.93341
0.1010101	0.8675497	0.0006097	131	52	85240	20	252	0.93347
0.1111111	0.8675497	0.0005159	131	44	85248	20	244	0.93352
0.1212121	0.8675497	0.0004924	131	42	85250	20	242	0.93353
0.1313131	0.8609272	0.0004455	130	38	85254	21	248	0.93024
0.1414141	0.8609272	0.0004455	130	38	85254	21	248	0.93024
0.1515152	0.8609272	0.0004104	130	35	85257	21	245	0.93026
0.1616162	0.8609272	0.0004104	130	35	85257	21	245	0.93026
0.1717172	0.8609272	0.0003986	130	34	85258	21	244	0.93026
0.1818182	0.8609272	0.0003517	130	30	85262	21	240	0.93029
0.1919192	0.8609272	0.0003517	130	30	85262	21	240	0.93029
0.2020202	0.8609272	0.0003048	130	26	85266	21	236	0.93031
0.2121212	0.8609272	0.0002931	130	25	85267	21	235	0.93032
0.2222222	0.8609272	0.0002697	130	23	85269	21	233	0.93033
0.2323232	0.8609272	0.0002697	130	23	85269	21	233	0.93033
0.2424242	0.8609272	0.0002579	130	22	85270	21	232	0.93033
0.2525253	0.8609272	0.0002462	130	21	85271	21	231	0.93034
0.2626263	0.8609272	0.0002228	130	19	85273	21	229	0.93035
0.2727273	0.8609272	0.0001993	130	17	85275	21	227	0.93036
0.2828283	0.8609272	0.0001876	130	16	85276	21	226	0.93037
0.2929293	0.8609272	0.0001759	130	15	85277	21	225	0.93038
0.3030303	0.8609272	0.0001759	130	15	85277	21	225	0.93038
0.3131313	0.8609272	0.0001759	130	15	85277	21	225	0.93038
0.3232323	0.8609272	0.0001641	130	14	85278	21	224	0.93038
0.3333333	0.8543046	0.0001641	129	14	85278	22	234	0.92707
0.3434343	0.8476821	0.0001641	128	14	85278	23	244	0.92376
0.3535354	0.8410596	0.0001524	127	13	85279	24	253	0.92045
0.3636364	0.8410596	0.0001407	127	12	85280	24	252	0.92046
0.3737374	0.8410596	0.0001290	127	11	85281	24	251	0.92047
0.3838384	0.8410596	0.0001290	127	11	85281	24	251	0.92047
0.3939394	0.8410596	0.0001290	127	11	85281	24	251	0.92047
0.4040404	0.8410596	0.0001290	127	11	85281	24	251	0.92047
0.4141414	0.8410596	0.0001172	127	10	85282	24	250	0.92047
0.4242424	0.8410596	0.0001172	127	10	85282	24	250	0.92047
0.4343434	0.8410596	0.0001172	127	10	85282	24	250	0.92047
0.4444444	0.8410596	0.0001055	127	9	85283	24	249	0.92048
0.4545455	0.8410596	0.0000938	127	8	85284	24	248	0.92048
0.4646465	0.8410596	0.0000938	127	8	85284	24	248	0.92048
0.4747475	0.8410596	0.0000938	127	8	85284	24	248	0.92048
0.4848485	0.8410596	0.0000938	127	8	85284	24	248	0.92048
0.4949495	0.8344371	0.0000821	126	7	85285	25	257	0.91718
0.5050505	0.8278146	0.0000586	125	5	85287	26	265	0.91388
0.5151515	0.8278146	0.0000586	125	5	85287	26	265	0.91388
0.5252525	0.8211921	0.0000586	124	5	85287	27	275	0.91057
0.5353535	0.8079470	0.0000586	122	5	85287	29	295	0.90394
0.5454545	0.8079470	0.0000586	122	5	85287	29	295	0.90394
0.5555556	0.8079470	0.0000586	122	5	85287	29	295	0.90394
0.5656566	0.8079470	0.0000586	122	5	85287	29	295	0.90394
0.5757576	0.8079470	0.0000469	122	4	85288	29	294	0.90395
0.5858586	0.8079470	0.0000469	122	4	85288	29	294	0.90395
0.5959596	0.8013245	0.0000352	121	3	85289	30	303	0.90064
0.6060606	0.8013245	0.0000352	121	3	85289	30	303	0.90064
0.6161616	0.7947020	0.0000352	120	3	85289	31	313	0.89733
0.6262626	0.7947020	0.0000352	120	3	85289	31	313	0.89733
0.6363636	0.7880795	0.0000352	119	3	85289	32	323	0.89402
0.6464646	0.7814570	0.0000352	118	3	85289	33	333	0.89071
0.6565657	0.7814570	0.0000352	118	3	85289	33	333	0.89071
0.6666667	0.7748344	0.0000352	117	3	85289	34	343	0.88740
0.6767677	0.7615894	0.0000352	115	3	85289	36	363	0.88078
0.6868687	0.7549669	0.0000352	114	3	85289	37	373	0.87747
0.6969697	0.7549669	0.0000352	114	3	85289	37	373	0.87747
0.7070707	0.7483444	0.0000352	113	3	85289	38	383	0.87415
0.7171717	0.7350993	0.0000352	111	3	85289	40	403	0.86753
0.7272727	0.7350993	0.0000352	111	3	85289	40	403	0.86753
0.7373737	0.7284768	0.0000352	110	3	85289	41	413	0.86422
0.7474747	0.7284768	0.0000352	110	3	85289	41	413	0.86422
0.7575758	0.7284768	0.0000352	110	3	85289	41	413	0.86422
0.7676768	0.7086093	0.0000352	107	3	85289	44	443	0.85429
0.7777778	0.7086093	0.0000234	107	2	85290	44	442	0.85429
0.7878788	0.7019868	0.0000234	106	2	85290	45	452	0.85098
0.7979798	0.6821192	0.0000234	103	2	85290	48	482	0.84105
0.8080808	0.6622517	0.0000234	100	2	85290	51	512	0.83111
0.8181818	0.6291391	0.0000234	95	2	85290	56	562	0.81456
0.8282828	0.6158940	0.0000234	93	2	85290	58	582	0.80794
0.8383838	0.6026490	0.0000234	91	2	85290	60	602	0.80131
0.8484848	0.5960265	0.0000117	90	1	85291	61	611	0.79801
0.8585859	0.5827815	0.0000117	88	1	85291	63	631	0.79138
0.8686869	0.5562914	0.0000117	84	1	85291	67	671	0.77814
0.8787879	0.5364238	0.0000117	81	1	85291	70	701	0.76821
0.8888889	0.5231788	0.0000117	79	1	85291	72	721	0.76158
0.8989899	0.4900662	0.0000000	74	0	85292	77	770	0.74503
0.9090909	0.4701987	0.0000000	71	0	85292	80	800	0.73510
0.9191919	0.4635762	0.0000000	70	0	85292	81	810	0.73179
0.9292929	0.4503311	0.0000000	68	0	85292	83	830	0.72517
0.9393939	0.4039735	0.0000000	61	0	85292	90	900	0.70199
0.9494949	0.3443709	0.0000000	52	0	85292	99	990	0.67219
0.9595960	0.3178808	0.0000000	48	0	85292	103	1030	0.65894
0.9696970	0.3046358	0.0000000	46	0	85292	105	1050	0.65232
0.9797980	0.2516556	0.0000000	38	0	85292	113	1130	0.62583
0.9898990	0.1589404	0.0000000	24	0	85292	127	1270	0.57947
1.0000000	0.0728477	0.0000000	11	0	85292	140	1400	0.53642

Let’s plot the ROC, AUC and cost functions for a ref. threshold of 0.3.

threshold = 0.3
plot_roc(roc, threshold, 1, 10)

Conclusions

The calculated accuracy is not very relevant in the conditions where there is a very large unbalance between the number of fraud and non-fraud events in the dataset. In such cases, we can see a very large accuracy. More relevant is the value of ROC-AUC (Area Under Curve for the Receiver Operator Characteristic). The value obtained (0.93) is relativelly good, considering that we did not performed any tunning, working with default RF algorithm parameters.

References

[1] Receiver Operating Characteristic, https://en.wikipedia.org/wiki/Receiver_operating_characteristic
[2] Area under curve, https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
[3] Computing and visualizing PCA in R, https://www.r-bloggers.com/computing-and-visualizing-pca-in-r/

Credit Card Fraud Detection using RandomForest

Mohit Anand

March 23, 2019