Loading libraries

suppressWarnings(suppressPackageStartupMessages({
library(tidymodels)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(rpart)
library(Amelia)
library(corrr)
library(corrplot)
library(DMwR)
library(ROSE)
library(caret)
library(skimr)
library(DataExplorer)
library(themis)
library(vip)
library(finetune)
}))

Loading the dataset

credit <- read.csv("creditcard.csv")

Data Exploratory Analysis

Retrieving randomly 5 observations

slice_sample(credit, n =5)

##     Time         V1         V2         V3          V4          V5         V6
## 1 160970 -1.5180707  1.2716754 -0.9731566 -1.46417875  1.53963025  5.6668650
## 2  97050 -0.2141594  1.4291548 -0.6355252 -0.31467215  0.77269853 -0.7034733
## 3 149869 -0.9753952 -0.7345229  1.9011316 -3.08922367 -0.41979987 -0.2433192
## 4  29649 -0.8379995  1.4400929  1.0692271 -0.04778071  0.05495333 -1.0066454
## 5  86216  1.2365598 -0.4680224 -0.1736092 -0.79652233  0.03944336  0.4491434
##           V7         V8          V9         V10        V11         V12
## 1 -2.8727109 -5.1184808  1.57681027 -0.45295118 -0.8281466  0.91307554
## 2  0.6097327  0.2069346  0.92329406 -1.47381031  0.2903500 -1.95722465
## 3 -0.4855803  0.1991122 -2.24157053  0.69852304  0.4145764 -0.19223069
## 4  0.6965378 -0.0750572 -0.07795964  0.09673809 -0.1312989  0.18494491
## 5 -0.3337432  0.1493877 -1.23815797  0.48536118  1.0734905  0.02568159
##          V13         V14        V15        V16         V17         V18
## 1 -0.6318797 -0.06665409 -0.8213055 -0.4727753  0.25914515  0.01305042
## 2  2.6534447  0.55078620 -1.0685863  0.3335206  1.25657301 -0.09357265
## 3  0.9051159 -0.69362169 -1.2016133  0.1116274 -0.17613894  0.37398281
## 4  0.6219236 -0.67215191  0.8785842  0.3358349 -0.05884711 -0.21253656
## 5  0.2874330  0.19554251  1.4549940 -0.6490433  1.71367448 -3.68915529
##          V19          V20        V21        V22          V23        V24
## 1 -0.5136326 -1.413849579  6.3268655 -1.1951469  0.442034014  0.6679636
## 2 -0.5051978 -0.053745649 -0.4286161 -0.9383172  0.180539135  0.4619782
## 3 -0.3829592  0.003492558 -0.3637085 -0.7707556 -0.075024239 -0.4615598
## 4 -0.1293844  0.393681200 -0.3262531 -0.7017721  0.023474169  0.2830855
## 5 -1.0414367 -0.060729101  0.3357937  1.0649150 -0.001465721 -0.5852603
##           V25         V26        V27          V28 Amount Class
## 1  1.30552855  0.13337358 0.48488613  0.281076953  16.80     0
## 2 -0.39396921  0.07972389 0.07567006  0.016961015  14.99     0
## 3  0.37825146 -0.42325210 0.26847967  0.118660452  39.95     0
## 4 -0.02453593  0.08423675 0.45905735  0.233793076   2.78     0
## 5  0.44415563  0.02243078 0.04475483 -0.008667399  15.00     0

Retriving the structure of the dataset

str(credit)

## 'data.frame':    284807 obs. of  31 variables:
##  $ Time  : num  0 0 1 1 2 2 4 7 7 9 ...
##  $ V1    : num  -1.36 1.192 -1.358 -0.966 -1.158 ...
##  $ V2    : num  -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
##  $ V3    : num  2.536 0.166 1.773 1.793 1.549 ...
##  $ V4    : num  1.378 0.448 0.38 -0.863 0.403 ...
##  $ V5    : num  -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
##  $ V6    : num  0.4624 -0.0824 1.8005 1.2472 0.0959 ...
##  $ V7    : num  0.2396 -0.0788 0.7915 0.2376 0.5929 ...
##  $ V8    : num  0.0987 0.0851 0.2477 0.3774 -0.2705 ...
##  $ V9    : num  0.364 -0.255 -1.515 -1.387 0.818 ...
##  $ V10   : num  0.0908 -0.167 0.2076 -0.055 0.7531 ...
##  $ V11   : num  -0.552 1.613 0.625 -0.226 -0.823 ...
##  $ V12   : num  -0.6178 1.0652 0.0661 0.1782 0.5382 ...
##  $ V13   : num  -0.991 0.489 0.717 0.508 1.346 ...
##  $ V14   : num  -0.311 -0.144 -0.166 -0.288 -1.12 ...
##  $ V15   : num  1.468 0.636 2.346 -0.631 0.175 ...
##  $ V16   : num  -0.47 0.464 -2.89 -1.06 -0.451 ...
##  $ V17   : num  0.208 -0.115 1.11 -0.684 -0.237 ...
##  $ V18   : num  0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
##  $ V19   : num  0.404 -0.146 -2.262 -1.233 0.803 ...
##  $ V20   : num  0.2514 -0.0691 0.525 -0.208 0.4085 ...
##  $ V21   : num  -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
##  $ V22   : num  0.27784 -0.63867 0.77168 0.00527 0.79828 ...
##  $ V23   : num  -0.11 0.101 0.909 -0.19 -0.137 ...
##  $ V24   : num  0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
##  $ V25   : num  0.129 0.167 -0.328 0.647 -0.206 ...
##  $ V26   : num  -0.189 0.126 -0.139 -0.222 0.502 ...
##  $ V27   : num  0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
##  $ V28   : num  -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
##  $ Amount: num  149.62 2.69 378.66 123.5 69.99 ...
##  $ Class : int  0 0 0 0 0 0 0 0 0 0 ...

As we can see, the dataset has 31 observations and 284,807 observations, all variables are identified as numeric even the dependent variable.

# Tranforming the dependent variable to a factor
credit$Class <- factor(credit$Class, levels = c(1,0), labels = c("Fraud", "No Fraud") )

Calculating basic statistics for the dataset

summary(credit)

##       Time              V1                  V2                  V3          
##  Min.   :     0   Min.   :-56.40751   Min.   :-72.71573   Min.   :-48.3256  
##  1st Qu.: 54202   1st Qu.: -0.92037   1st Qu.: -0.59855   1st Qu.: -0.8904  
##  Median : 84692   Median :  0.01811   Median :  0.06549   Median :  0.1799  
##  Mean   : 94814   Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.0000  
##  3rd Qu.:139321   3rd Qu.:  1.31564   3rd Qu.:  0.80372   3rd Qu.:  1.0272  
##  Max.   :172792   Max.   :  2.45493   Max.   : 22.05773   Max.   :  9.3826  
##        V4                 V5                   V6                 V7          
##  Min.   :-5.68317   Min.   :-113.74331   Min.   :-26.1605   Min.   :-43.5572  
##  1st Qu.:-0.84864   1st Qu.:  -0.69160   1st Qu.: -0.7683   1st Qu.: -0.5541  
##  Median :-0.01985   Median :  -0.05434   Median : -0.2742   Median :  0.0401  
##  Mean   : 0.00000   Mean   :   0.00000   Mean   :  0.0000   Mean   :  0.0000  
##  3rd Qu.: 0.74334   3rd Qu.:   0.61193   3rd Qu.:  0.3986   3rd Qu.:  0.5704  
##  Max.   :16.87534   Max.   :  34.80167   Max.   : 73.3016   Max.   :120.5895  
##        V8                  V9                 V10                 V11          
##  Min.   :-73.21672   Min.   :-13.43407   Min.   :-24.58826   Min.   :-4.79747  
##  1st Qu.: -0.20863   1st Qu.: -0.64310   1st Qu.: -0.53543   1st Qu.:-0.76249  
##  Median :  0.02236   Median : -0.05143   Median : -0.09292   Median :-0.03276  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.32735   3rd Qu.:  0.59714   3rd Qu.:  0.45392   3rd Qu.: 0.73959  
##  Max.   : 20.00721   Max.   : 15.59500   Max.   : 23.74514   Max.   :12.01891  
##       V12                V13                V14                V15          
##  Min.   :-18.6837   Min.   :-5.79188   Min.   :-19.2143   Min.   :-4.49894  
##  1st Qu.: -0.4056   1st Qu.:-0.64854   1st Qu.: -0.4256   1st Qu.:-0.58288  
##  Median :  0.1400   Median :-0.01357   Median :  0.0506   Median : 0.04807  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :  0.0000   Mean   : 0.00000  
##  3rd Qu.:  0.6182   3rd Qu.: 0.66251   3rd Qu.:  0.4931   3rd Qu.: 0.64882  
##  Max.   :  7.8484   Max.   : 7.12688   Max.   : 10.5268   Max.   : 8.87774  
##       V16                 V17                 V18           
##  Min.   :-14.12985   Min.   :-25.16280   Min.   :-9.498746  
##  1st Qu.: -0.46804   1st Qu.: -0.48375   1st Qu.:-0.498850  
##  Median :  0.06641   Median : -0.06568   Median :-0.003636  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.000000  
##  3rd Qu.:  0.52330   3rd Qu.:  0.39968   3rd Qu.: 0.500807  
##  Max.   : 17.31511   Max.   :  9.25353   Max.   : 5.041069  
##       V19                 V20                 V21           
##  Min.   :-7.213527   Min.   :-54.49772   Min.   :-34.83038  
##  1st Qu.:-0.456299   1st Qu.: -0.21172   1st Qu.: -0.22839  
##  Median : 0.003735   Median : -0.06248   Median : -0.02945  
##  Mean   : 0.000000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.458949   3rd Qu.:  0.13304   3rd Qu.:  0.18638  
##  Max.   : 5.591971   Max.   : 39.42090   Max.   : 27.20284  
##       V22                  V23                 V24          
##  Min.   :-10.933144   Min.   :-44.80774   Min.   :-2.83663  
##  1st Qu.: -0.542350   1st Qu.: -0.16185   1st Qu.:-0.35459  
##  Median :  0.006782   Median : -0.01119   Median : 0.04098  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.528554   3rd Qu.:  0.14764   3rd Qu.: 0.43953  
##  Max.   : 10.503090   Max.   : 22.52841   Max.   : 4.58455  
##       V25                 V26                V27            
##  Min.   :-10.29540   Min.   :-2.60455   Min.   :-22.565679  
##  1st Qu.: -0.31715   1st Qu.:-0.32698   1st Qu.: -0.070840  
##  Median :  0.01659   Median :-0.05214   Median :  0.001342  
##  Mean   :  0.00000   Mean   : 0.00000   Mean   :  0.000000  
##  3rd Qu.:  0.35072   3rd Qu.: 0.24095   3rd Qu.:  0.091045  
##  Max.   :  7.51959   Max.   : 3.51735   Max.   : 31.612198  
##       V28                Amount              Class       
##  Min.   :-15.43008   Min.   :    0.00   Fraud   :   492  
##  1st Qu.: -0.05296   1st Qu.:    5.60   No Fraud:284315  
##  Median :  0.01124   Median :   22.00                    
##  Mean   :  0.00000   Mean   :   88.35                    
##  3rd Qu.:  0.07828   3rd Qu.:   77.17                    
##  Max.   : 33.84781   Max.   :25691.16

A quick view to the basic statistics reveals that the data has outliers since there are discrepancies between the mean and median for the majority of variables. On the other hand, we don’t know what the variables V1-V8 are, so it is not possible to check for data constrains.

Checking for missing values

missmap(credit)

colSums(is.na(credit))

##   Time     V1     V2     V3     V4     V5     V6     V7     V8     V9    V10 
##      0      0      0      0      0      0      0      0      0      0      0 
##    V11    V12    V13    V14    V15    V16    V17    V18    V19    V20    V21 
##      0      0      0      0      0      0      0      0      0      0      0 
##    V22    V23    V24    V25    V26    V27    V28 Amount  Class 
##      0      0      0      0      0      0      0      0      0

The dataset is complete and there is no missing data in any of the variables.

Cheking for class imbalance

ggplot(credit, aes(Class)) +  geom_bar(aes(fill =  Class)) +  geom_text(aes(label  = ..count..), stat = "count", vjust = 1) + labs(y = "Number of transactions", title = "Credit Transactions by type")

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The plot shows that only a small fraction of the total transactions are fraudulent, in fact, less than 1% are fraudulent. With that in mind, it is important to take into account class imbalance in order to make a reliable classification model.

Checking for multicolinearity

credit %>%select(where(is.numeric), -Class) %>% cor %>% corrplot(number.cex = 0.9, method = "color", tl.cex = 0.8, tl.col = "red")

Taking into account that most the variables were obtained trough Principal Component Analysis, there is no correlation between the features V1-V28. On the other hand, we see a strong inverse correlation between V3 and amount as well as a strong positive correlation between amount and V7 and V20.

Distribution of time between fraudulent and not fraudulent transactions

ggplot(credit, aes(Time, color = Class)) + geom_density()

The plot shows that the variable time has a different distribution between fraudulent and no fraudulent transactions, in fact we see a bimodal distribution for non fraudulent transactions.

Distribution of amount between fraudulent a non fraudulent transactions

p1 <- ggplot(data = credit[credit$Class=="No Fraud",], aes(x=Amount)) + geom_histogram(binwidth = 100) + coord_cartesian(xlim = c(0,1000)) + labs(title = "Distribution of amount for non fraudulent transactions", y = "Number of transactions")

p2 <- ggplot(data = credit[credit$Class=="Fraud",], aes(x=Amount)) + geom_histogram(binwidth = 100) + coord_cartesian(xlim = c(0,1000)) +  labs(title  = "Distribution of amount for fraudulent transactions", y = "Number of transactions")

grid.arrange(p1, p2, ncol = 1, nrow = 2)

The plots show that the amount has a left skewed distribution between fraudulent and non fraudulent transactions. ### Side by Side Boxplots

plot_boxplot(credit, by  = "Class", geom_boxplot_args = list("outlier.color" = "white", alpha =0.05))

The box-plots show different distributions for the variables V3, V4, V9, V10, V11,V12, V14, V16 and V17 between fraudulent and no fraudulent transactions. With that in mind, it is plausible to expect that is variables are important to predict whether a transaction is fraudulent or not.

Training, test and validarion splits

set.seed(1996)
fraud_split <- initial_split(credit, prop = 0.70, strata = Class)

fraud_train <- fraud_split %>% training()
fraud_test <- fraud_split %>% testing()

set.seed(2014)
fraud_folds <- vfold_cv(fraud_train, v = 5, strata = Class)

Data preprocessing

fraud_recipe <- recipe(formula =  Class~.,
                       data = fraud_train
                       ) %>% step_normalize(all_numeric_predictors()) %>%
                      step_smote(Class, over_ratio = 0.5 )
fraud_prep <- prep(fraud_recipe)
fraud_prep

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 30

##

## ── Training information

## Training data contained 199364 data points and no incomplete rows.

##

## ── Operations

## • Centering and scaling for: Time, V1, V2, V3, V4, V5, V6, V7, ... | Trained

## • SMOTE based on: Class | Trained

Although is not necessary to normalize the data, it is done to explore other sampling techniques for imbalanced data sets like SMOTE or ROSE sampling methods. For computational purposes the minority class is upsampled to be half as many observations than the mayority class.

Model specification

boost_spec <- boost_tree(trees = 400,
                         mtry = tune(),
                         learn_rate = tune(),
                         tree_depth = tune(),
                         sample_size = tune()) %>%
                         set_mode("classification") %>%
                         set_engine("xgboost")

boost_spec

## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 400
##   tree_depth = tune()
##   learn_rate = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

The XGBoost algorithm is applied since it gives very good result for unbalanced datasets, although it requires some hyperparameter tuning to get a good classification predictive model.

tune_workflow <-  workflow() %>% 
                  add_recipe(fraud_recipe) %>%
                  add_model(boost_spec)

tune_workflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_smote()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 400
##   tree_depth = tune()
##   learn_rate = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

Tuning Hyperparameters

doParallel::registerDoParallel()
set.seed(1996)
tunegrid_boost <- tune_grid(tune_workflow,
                            resamples =fraud_folds,
                            grid = 16,
                            metrics = metric_set(roc_auc))

## i Creating pre-processing data to finalize unknown parameter: mtry

tunegrid_boost

## # Tuning results
## # 5-fold cross-validation using stratification 
## # A tibble: 5 × 4
##   splits                 id    .metrics          .notes          
##   <list>                 <chr> <list>            <list>          
## 1 <split [159491/39873]> Fold1 <tibble [16 × 8]> <tibble [0 × 3]>
## 2 <split [159491/39873]> Fold2 <tibble [16 × 8]> <tibble [0 × 3]>
## 3 <split [159491/39873]> Fold3 <tibble [16 × 8]> <tibble [0 × 3]>
## 4 <split [159491/39873]> Fold4 <tibble [16 × 8]> <tibble [0 × 3]>
## 5 <split [159492/39872]> Fold5 <tibble [16 × 8]> <tibble [0 × 3]>

Exploring tuning results

autoplot(tunegrid_boost)

tunegrid_boost %>% collect_metrics(summarize = FALSE) %>%
  filter(.metric == "roc_auc") %>%
  group_by(id) %>%
  summarize(min_roc_auc = min(.estimate),
            median_roc_auc =  median(.estimate),
            max_roc_auc =  max(.estimate))

## # A tibble: 5 × 4
##   id    min_roc_auc median_roc_auc max_roc_auc
##   <chr>       <dbl>          <dbl>       <dbl>
## 1 Fold1       0.966          0.982       0.991
## 2 Fold2       0.961          0.981       0.988
## 3 Fold3       0.961          0.975       0.981
## 4 Fold4       0.967          0.979       0.986
## 5 Fold5       0.970          0.986       0.993

The results are fairly consistent across folds indicating that there are not problems of model overfitting.

Viewing the best performing models

tunegrid_boost %>% show_best(metric = "roc_auc", n = 5)

## # A tibble: 5 × 10
##    mtry tree_depth learn_rate sample_size .metric .estimator  mean     n std_err
##   <int>      <int>      <dbl>       <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1    16         14    0.00718       0.856 roc_auc binary     0.984     5 0.00237
## 2    20         11    0.00922       0.718 roc_auc binary     0.983     5 0.00315
## 3     7         13    0.0226        0.790 roc_auc binary     0.983     5 0.00113
## 4    18         12    0.0315        0.309 roc_auc binary     0.983     5 0.00128
## 5     6          8    0.00306       0.987 roc_auc binary     0.983     5 0.00324
## # ℹ 1 more variable: .config <chr>

Selecting the best model

best_auc <-  select_best(tunegrid_boost, "roc_auc")
best_auc

## # A tibble: 1 × 5
##    mtry tree_depth learn_rate sample_size .config              
##   <int>      <int>      <dbl>       <dbl> <chr>                
## 1    16         14    0.00718       0.856 Preprocessor1_Model06

final_model <- finalize_model(boost_spec, best_auc
)

Variable importance

final_model %>% set_engine("xgboost", importance = "permutation") %>% fit(Class~., data = juice(fraud_prep)) %>% vip()

## [10:31:18] WARNING: src/learner.cc:767: 
## Parameters: { "importance" } are not used.

Last fit

final_workflow <- workflow() %>% add_recipe(fraud_recipe) %>% add_model(final_model)

final_workflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_smote()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = 16
##   trees = 400
##   tree_depth = 14
##   learn_rate = 0.00718059643165048
##   sample_size = 0.85595240630646
## 
## Computational engine: xgboost

fraud_last_fit <- final_workflow %>% last_fit(fraud_split)

## Warning: package 'xgboost' was built under R version 4.2.3

Viewing performance metrics for the last fit

fraud_last_fit %>% collect_metrics()

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.999 Preprocessor1_Model1
## 2 roc_auc  binary         0.983 Preprocessor1_Model1

fraud_last_fit %>% collect_predictions() %>% roc_curve(truth =  Class, .pred_Fraud) %>% autoplot()

fraud_last_fit %>% collect_predictions() %>% conf_mat(truth = Class, estimate = .pred_class)

##           Truth
## Prediction Fraud No Fraud
##   Fraud      132       43
##   No Fraud    24    85244

Fraud Detection with Bagged Trees with R

Jose Daniel Isidoro Reyes