In this markdown, I’ve used German credit card dataset and used SMOTE to handle class imbalance and then I’ve used Logistic and Random Forest to predict if the probability of fraud.

The datasets contain transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions

Dataset link

x <- read.csv("E:\\Papers\\creditcard.csv", sep = ",")

set.seed(1029)
## Remove rows that do not have target variable values
final <- x[!(is.na(x$Class)),]

final$Class <- factor(final$Class)

library(caTools)

split <- sample.split(final$Class, SplitRatio = 0.75)

dresstrain <- subset(final, split == TRUE)
dresstest <- subset(final, split == FALSE)


## Let's check the count of unique value in the target variable
as.data.frame(table(dresstrain$Class))
##   Var1   Freq
## 1    0 213236
## 2    1    369
## Loading DMwr to balance the unbalanced class
library(DMwR)

## Smote : Synthetic Minority Oversampling Technique To Handle Class Imbalancy In Binary Classification
balanced.data <- SMOTE(Class ~., dresstrain, perc.over = 4800, k = 5, perc.under = 1000)

as.data.frame(table(balanced.data$Class))
##   Var1   Freq
## 1    0 177120
## 2    1  18081

Logistic Regression

library(caret)  

model <- glm (Class~., data=balanced.data, family = binomial)
summary(model)
## 
## Call:
## glm(formula = Class ~ ., family = binomial, data = balanced.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4904  -0.1345  -0.0819  -0.0478   3.3359  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.359e+00  7.607e-02 -70.447  < 2e-16 ***
## Time        -3.417e-06  6.392e-07  -5.346 8.97e-08 ***
## V1           1.884e-01  1.809e-02  10.415  < 2e-16 ***
## V2           1.685e-01  3.935e-02   4.281 1.86e-05 ***
## V3           9.730e-02  2.023e-02   4.809 1.52e-06 ***
## V4           7.327e-01  2.062e-02  35.538  < 2e-16 ***
## V5           2.216e-01  2.890e-02   7.667 1.76e-14 ***
## V6          -2.952e-01  2.562e-02 -11.521  < 2e-16 ***
## V7          -1.377e-01  3.549e-02  -3.879 0.000105 ***
## V8          -2.812e-01  1.594e-02 -17.639  < 2e-16 ***
## V9          -1.487e-01  2.957e-02  -5.027 4.99e-07 ***
## V10         -5.672e-01  3.238e-02 -17.516  < 2e-16 ***
## V11          1.888e-01  2.234e-02   8.454  < 2e-16 ***
## V12         -5.884e-01  2.315e-02 -25.420  < 2e-16 ***
## V13         -4.877e-01  2.058e-02 -23.694  < 2e-16 ***
## V14         -7.797e-01  2.047e-02 -38.096  < 2e-16 ***
## V15          5.692e-03  2.401e-02   0.237 0.812620    
## V16         -1.856e-01  3.199e-02  -5.803 6.53e-09 ***
## V17         -1.270e-01  2.262e-02  -5.613 1.99e-08 ***
## V18          6.157e-03  3.308e-02   0.186 0.852358    
## V19         -1.155e-01  2.775e-02  -4.162 3.15e-05 ***
## V20         -3.460e-01  4.104e-02  -8.430  < 2e-16 ***
## V21          1.704e-01  2.141e-02   7.958 1.75e-15 ***
## V22          3.873e-01  3.648e-02  10.619  < 2e-16 ***
## V23          1.175e-02  3.474e-02   0.338 0.735187    
## V24          2.134e-01  4.225e-02   5.050 4.42e-07 ***
## V25          2.036e-01  4.304e-02   4.730 2.25e-06 ***
## V26         -3.818e-01  5.476e-02  -6.972 3.12e-12 ***
## V27         -1.297e-01  6.074e-02  -2.134 0.032804 *  
## V28          1.802e-02  5.698e-02   0.316 0.751872    
## Amount       2.313e-03  3.561e-04   6.495 8.31e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 120468  on 195200  degrees of freedom
## Residual deviance:  20712  on 195170  degrees of freedom
## AIC: 20774
## 
## Number of Fisher Scoring iterations: 25
## Predict the Values
predict <- predict(model, dresstest, type = 'response')

## Create Confusion Matrix
table(dresstest$Class, predict > 0.5)
##    
##     FALSE  TRUE
##   0 70872   207
##   1    18   105
#ROCR Curve
library(ROCR)
ROCRpred <- prediction(predict, dresstest$Class)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7))

Random FOrest

library(randomForest)  
library(e1071)  


rf = randomForest(Class~.,  
                   ntree = 100,
                   data = balanced.data)
plot(rf) 

varImp(rf)
##          Overall
## Time    212.8759
## V1      321.0845
## V2      463.3183
## V3     1916.0623
## V4     1264.0709
## V5      188.7063
## V6      340.9743
## V7      929.5873
## V8      270.2217
## V9     1140.0924
## V10    4865.3979
## V11    2678.2463
## V12    3266.2186
## V13     243.7294
## V14    6561.4483
## V15     180.2387
## V16    1908.9783
## V17    3115.7313
## V18     529.4505
## V19     222.6081
## V20     162.6423
## V21     493.5526
## V22     145.5791
## V23     156.9237
## V24     154.1533
## V25     145.8141
## V26     185.8320
## V27     271.7943
## V28     222.8617
## Amount  250.0227
## Important variables according to the model
varImpPlot(rf,  
           sort = T,
           n.var=25,
           main="Variable Importance")

predicted.response <- predict(rf, dresstest)


confusionMatrix(data=predicted.response,  
                reference=dresstest$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 71060    20
##          1    19   103
##                                           
##                Accuracy : 0.9995          
##                  95% CI : (0.9993, 0.9996)
##     No Information Rate : 0.9983          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8405          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9997          
##             Specificity : 0.8374          
##          Pos Pred Value : 0.9997          
##          Neg Pred Value : 0.8443          
##              Prevalence : 0.9983          
##          Detection Rate : 0.9980          
##    Detection Prevalence : 0.9983          
##       Balanced Accuracy : 0.9186          
##                                           
##        'Positive' Class : 0               
##