suppressWarnings(suppressPackageStartupMessages({
library(tidymodels)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(rpart)
library(Amelia)
library(corrr)
library(corrplot)
library(DMwR)
library(ROSE)
library(caret)
library(skimr)
library(DataExplorer)
library(themis)
library(vip)
library(finetune)
}))
credit <- read.csv("creditcard.csv")
slice_sample(credit, n =5)
## Time V1 V2 V3 V4 V5 V6
## 1 160970 -1.5180707 1.2716754 -0.9731566 -1.46417875 1.53963025 5.6668650
## 2 97050 -0.2141594 1.4291548 -0.6355252 -0.31467215 0.77269853 -0.7034733
## 3 149869 -0.9753952 -0.7345229 1.9011316 -3.08922367 -0.41979987 -0.2433192
## 4 29649 -0.8379995 1.4400929 1.0692271 -0.04778071 0.05495333 -1.0066454
## 5 86216 1.2365598 -0.4680224 -0.1736092 -0.79652233 0.03944336 0.4491434
## V7 V8 V9 V10 V11 V12
## 1 -2.8727109 -5.1184808 1.57681027 -0.45295118 -0.8281466 0.91307554
## 2 0.6097327 0.2069346 0.92329406 -1.47381031 0.2903500 -1.95722465
## 3 -0.4855803 0.1991122 -2.24157053 0.69852304 0.4145764 -0.19223069
## 4 0.6965378 -0.0750572 -0.07795964 0.09673809 -0.1312989 0.18494491
## 5 -0.3337432 0.1493877 -1.23815797 0.48536118 1.0734905 0.02568159
## V13 V14 V15 V16 V17 V18
## 1 -0.6318797 -0.06665409 -0.8213055 -0.4727753 0.25914515 0.01305042
## 2 2.6534447 0.55078620 -1.0685863 0.3335206 1.25657301 -0.09357265
## 3 0.9051159 -0.69362169 -1.2016133 0.1116274 -0.17613894 0.37398281
## 4 0.6219236 -0.67215191 0.8785842 0.3358349 -0.05884711 -0.21253656
## 5 0.2874330 0.19554251 1.4549940 -0.6490433 1.71367448 -3.68915529
## V19 V20 V21 V22 V23 V24
## 1 -0.5136326 -1.413849579 6.3268655 -1.1951469 0.442034014 0.6679636
## 2 -0.5051978 -0.053745649 -0.4286161 -0.9383172 0.180539135 0.4619782
## 3 -0.3829592 0.003492558 -0.3637085 -0.7707556 -0.075024239 -0.4615598
## 4 -0.1293844 0.393681200 -0.3262531 -0.7017721 0.023474169 0.2830855
## 5 -1.0414367 -0.060729101 0.3357937 1.0649150 -0.001465721 -0.5852603
## V25 V26 V27 V28 Amount Class
## 1 1.30552855 0.13337358 0.48488613 0.281076953 16.80 0
## 2 -0.39396921 0.07972389 0.07567006 0.016961015 14.99 0
## 3 0.37825146 -0.42325210 0.26847967 0.118660452 39.95 0
## 4 -0.02453593 0.08423675 0.45905735 0.233793076 2.78 0
## 5 0.44415563 0.02243078 0.04475483 -0.008667399 15.00 0
str(credit)
## 'data.frame': 284807 obs. of 31 variables:
## $ Time : num 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
As we can see, the dataset has 31 observations and 284,807 observations, all variables are identified as numeric even the dependent variable.
# Tranforming the dependent variable to a factor
credit$Class <- factor(credit$Class, levels = c(1,0), labels = c("Fraud", "No Fraud") )
summary(credit)
## Time V1 V2 V3
## Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904
## Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799
## Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272
## Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826
## V4 V5 V6 V7
## Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572
## 1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541
## Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704
## Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895
## V8 V9 V10 V11
## Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14 V15
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288
## Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774
## V16 V17 V18
## Min. :-14.12985 Min. :-25.16280 Min. :-9.498746
## 1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850
## Median : 0.06641 Median : -0.06568 Median :-0.003636
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807
## Max. : 17.31511 Max. : 9.25353 Max. : 5.041069
## V19 V20 V21
## Min. :-7.213527 Min. :-54.49772 Min. :-34.83038
## 1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839
## Median : 0.003735 Median : -0.06248 Median : -0.02945
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638
## Max. : 5.591971 Max. : 39.42090 Max. : 27.20284
## V22 V23 V24
## Min. :-10.933144 Min. :-44.80774 Min. :-2.83663
## 1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459
## Median : 0.006782 Median : -0.01119 Median : 0.04098
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953
## Max. : 10.503090 Max. : 22.52841 Max. : 4.58455
## V25 V26 V27
## Min. :-10.29540 Min. :-2.60455 Min. :-22.565679
## 1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840
## Median : 0.01659 Median :-0.05214 Median : 0.001342
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045
## Max. : 7.51959 Max. : 3.51735 Max. : 31.612198
## V28 Amount Class
## Min. :-15.43008 Min. : 0.00 Fraud : 492
## 1st Qu.: -0.05296 1st Qu.: 5.60 No Fraud:284315
## Median : 0.01124 Median : 22.00
## Mean : 0.00000 Mean : 88.35
## 3rd Qu.: 0.07828 3rd Qu.: 77.17
## Max. : 33.84781 Max. :25691.16
A quick view to the basic statistics reveals that the data has outliers since there are discrepancies between the mean and median for the majority of variables. On the other hand, we don’t know what the variables V1-V8 are, so it is not possible to check for data constrains.
missmap(credit)
colSums(is.na(credit))
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 0 0 0 0 0 0 0 0 0 0 0
## V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 0 0 0 0 0 0 0 0 0 0 0
## V22 V23 V24 V25 V26 V27 V28 Amount Class
## 0 0 0 0 0 0 0 0 0
The dataset is complete and there is no missing data in any of the variables.
ggplot(credit, aes(Class)) + geom_bar(aes(fill = Class)) + geom_text(aes(label = ..count..), stat = "count", vjust = 1) + labs(y = "Number of transactions", title = "Credit Transactions by type")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The plot shows that only a small fraction of the total transactions are fraudulent, in fact, less than 1% are fraudulent. With that in mind, it is important to take into account class imbalance in order to make a reliable classification model.
credit %>%select(where(is.numeric), -Class) %>% cor %>% corrplot(number.cex = 0.9, method = "color", tl.cex = 0.8, tl.col = "red")
Taking into account that most the variables were obtained trough Principal Component Analysis, there is no correlation between the features V1-V28. On the other hand, we see a strong inverse correlation between V3 and amount as well as a strong positive correlation between amount and V7 and V20.
ggplot(credit, aes(Time, color = Class)) + geom_density()
The plot shows that the variable time has a different distribution between fraudulent and no fraudulent transactions, in fact we see a bimodal distribution for non fraudulent transactions.
p1 <- ggplot(data = credit[credit$Class=="No Fraud",], aes(x=Amount)) + geom_histogram(binwidth = 100) + coord_cartesian(xlim = c(0,1000)) + labs(title = "Distribution of amount for non fraudulent transactions", y = "Number of transactions")
p2 <- ggplot(data = credit[credit$Class=="Fraud",], aes(x=Amount)) + geom_histogram(binwidth = 100) + coord_cartesian(xlim = c(0,1000)) + labs(title = "Distribution of amount for fraudulent transactions", y = "Number of transactions")
grid.arrange(p1, p2, ncol = 1, nrow = 2)
The plots show that the amount has a left skewed distribution between fraudulent and non fraudulent transactions. ### Side by Side Boxplots
plot_boxplot(credit, by = "Class", geom_boxplot_args = list("outlier.color" = "white", alpha =0.05))
The box-plots show different distributions for the variables V3, V4, V9, V10, V11,V12, V14, V16 and V17 between fraudulent and no fraudulent transactions. With that in mind, it is plausible to expect that is variables are important to predict whether a transaction is fraudulent or not.
set.seed(1996)
fraud_split <- initial_split(credit, prop = 0.70, strata = Class)
fraud_train <- fraud_split %>% training()
fraud_test <- fraud_split %>% testing()
set.seed(2014)
fraud_folds <- vfold_cv(fraud_train, v = 5, strata = Class)
fraud_recipe <- recipe(formula = Class~.,
data = fraud_train
) %>% step_normalize(all_numeric_predictors()) %>%
step_smote(Class, over_ratio = 0.5 )
fraud_prep <- prep(fraud_recipe)
fraud_prep
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 30
##
## ── Training information
## Training data contained 199364 data points and no incomplete rows.
##
## ── Operations
## • Centering and scaling for: Time, V1, V2, V3, V4, V5, V6, V7, ... | Trained
## • SMOTE based on: Class | Trained
Although is not necessary to normalize the data, it is done to explore other sampling techniques for imbalanced data sets like SMOTE or ROSE sampling methods. For computational purposes the minority class is upsampled to be half as many observations than the mayority class.
boost_spec <- boost_tree(trees = 400,
mtry = tune(),
learn_rate = tune(),
tree_depth = tune(),
sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
boost_spec
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 400
## tree_depth = tune()
## learn_rate = tune()
## sample_size = tune()
##
## Computational engine: xgboost
The XGBoost algorithm is applied since it gives very good result for unbalanced datasets, although it requires some hyperparameter tuning to get a good classification predictive model.
tune_workflow <- workflow() %>%
add_recipe(fraud_recipe) %>%
add_model(boost_spec)
tune_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 400
## tree_depth = tune()
## learn_rate = tune()
## sample_size = tune()
##
## Computational engine: xgboost
doParallel::registerDoParallel()
set.seed(1996)
tunegrid_boost <- tune_grid(tune_workflow,
resamples =fraud_folds,
grid = 16,
metrics = metric_set(roc_auc))
## i Creating pre-processing data to finalize unknown parameter: mtry
tunegrid_boost
## # Tuning results
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [159491/39873]> Fold1 <tibble [16 × 8]> <tibble [0 × 3]>
## 2 <split [159491/39873]> Fold2 <tibble [16 × 8]> <tibble [0 × 3]>
## 3 <split [159491/39873]> Fold3 <tibble [16 × 8]> <tibble [0 × 3]>
## 4 <split [159491/39873]> Fold4 <tibble [16 × 8]> <tibble [0 × 3]>
## 5 <split [159492/39872]> Fold5 <tibble [16 × 8]> <tibble [0 × 3]>
autoplot(tunegrid_boost)
tunegrid_boost %>% collect_metrics(summarize = FALSE) %>%
filter(.metric == "roc_auc") %>%
group_by(id) %>%
summarize(min_roc_auc = min(.estimate),
median_roc_auc = median(.estimate),
max_roc_auc = max(.estimate))
## # A tibble: 5 × 4
## id min_roc_auc median_roc_auc max_roc_auc
## <chr> <dbl> <dbl> <dbl>
## 1 Fold1 0.966 0.982 0.991
## 2 Fold2 0.961 0.981 0.988
## 3 Fold3 0.961 0.975 0.981
## 4 Fold4 0.967 0.979 0.986
## 5 Fold5 0.970 0.986 0.993
The results are fairly consistent across folds indicating that there are not problems of model overfitting.
tunegrid_boost %>% show_best(metric = "roc_auc", n = 5)
## # A tibble: 5 × 10
## mtry tree_depth learn_rate sample_size .metric .estimator mean n std_err
## <int> <int> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 16 14 0.00718 0.856 roc_auc binary 0.984 5 0.00237
## 2 20 11 0.00922 0.718 roc_auc binary 0.983 5 0.00315
## 3 7 13 0.0226 0.790 roc_auc binary 0.983 5 0.00113
## 4 18 12 0.0315 0.309 roc_auc binary 0.983 5 0.00128
## 5 6 8 0.00306 0.987 roc_auc binary 0.983 5 0.00324
## # ℹ 1 more variable: .config <chr>
best_auc <- select_best(tunegrid_boost, "roc_auc")
best_auc
## # A tibble: 1 × 5
## mtry tree_depth learn_rate sample_size .config
## <int> <int> <dbl> <dbl> <chr>
## 1 16 14 0.00718 0.856 Preprocessor1_Model06
final_model <- finalize_model(boost_spec, best_auc
)
final_model %>% set_engine("xgboost", importance = "permutation") %>% fit(Class~., data = juice(fraud_prep)) %>% vip()
## [10:31:18] WARNING: src/learner.cc:767:
## Parameters: { "importance" } are not used.
final_workflow <- workflow() %>% add_recipe(fraud_recipe) %>% add_model(final_model)
final_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = 16
## trees = 400
## tree_depth = 14
## learn_rate = 0.00718059643165048
## sample_size = 0.85595240630646
##
## Computational engine: xgboost
fraud_last_fit <- final_workflow %>% last_fit(fraud_split)
## Warning: package 'xgboost' was built under R version 4.2.3
fraud_last_fit %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.999 Preprocessor1_Model1
## 2 roc_auc binary 0.983 Preprocessor1_Model1
fraud_last_fit %>% collect_predictions() %>% roc_curve(truth = Class, .pred_Fraud) %>% autoplot()
fraud_last_fit %>% collect_predictions() %>% conf_mat(truth = Class, estimate = .pred_class)
## Truth
## Prediction Fraud No Fraud
## Fraud 132 43
## No Fraud 24 85244