The dataset contains transactions made by credit cards in September 2013 by European cardholders. It covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables, which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features or more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are ‘Time’ and ‘Amount’. Feature ‘Time’ contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature ‘Amount’ is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature ‘Class’ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve (AUPRC). Confusion matrix accuracy is not meaningful for unbalanced classification.

You can download creditcard.csv from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

library(caret)
library(ggplot2)
library(dplyr)
library(corrplot)
library(tidyverse)
library(LiblineaR)
library(recipes)
library(themis)
library(kernlab)
library(pROC)
library(ROSE)
library(DMwR2)
library(h2o)
library(glmnet)
library(xgboost)
library(PRROC)

Load the data

# Locate the Kaggle CSV without changing the session's working directory.
# An absolute, user-specific setwd() only works on one machine; instead the
# path is read from the CREDITCARD_CSV environment variable and falls back to
# "creditcard.csv" in the current working directory.
creditcard_path <- Sys.getenv("CREDITCARD_CSV", unset = "creditcard.csv")
if (!file.exists(creditcard_path)) {
  stop(
    "Cannot find '", creditcard_path, "'. Download creditcard.csv from ",
    "https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud and place it in ",
    "the working directory, or set the CREDITCARD_CSV environment variable.",
    call. = FALSE
  )
}
creditcard <- read.csv(creditcard_path)

Summary of the credit card data

# Show the first six transactions.
head(creditcard, n = 6L)
##   Time         V1          V2        V3         V4          V5          V6
## 1    0 -1.3598071 -0.07278117 2.5363467  1.3781552 -0.33832077  0.46238778
## 2    0  1.1918571  0.26615071 0.1664801  0.4481541  0.06001765 -0.08236081
## 3    1 -1.3583541 -1.34016307 1.7732093  0.3797796 -0.50319813  1.80049938
## 4    1 -0.9662717 -0.18522601 1.7929933 -0.8632913 -0.01030888  1.24720317
## 5    2 -1.1582331  0.87773675 1.5487178  0.4030339 -0.40719338  0.09592146
## 6    2 -0.4259659  0.96052304 1.1411093 -0.1682521  0.42098688 -0.02972755
##            V7          V8         V9         V10        V11         V12
## 1  0.23959855  0.09869790  0.3637870  0.09079417 -0.5515995 -0.61780086
## 2 -0.07880298  0.08510165 -0.2554251 -0.16697441  1.6127267  1.06523531
## 3  0.79146096  0.24767579 -1.5146543  0.20764287  0.6245015  0.06608369
## 4  0.23760894  0.37743587 -1.3870241 -0.05495192 -0.2264873  0.17822823
## 5  0.59294075 -0.27053268  0.8177393  0.75307443 -0.8228429  0.53819555
## 6  0.47620095  0.26031433 -0.5686714 -0.37140720  1.3412620  0.35989384
##          V13        V14        V15        V16         V17         V18
## 1 -0.9913898 -0.3111694  1.4681770 -0.4704005  0.20797124  0.02579058
## 2  0.4890950 -0.1437723  0.6355581  0.4639170 -0.11480466 -0.18336127
## 3  0.7172927 -0.1659459  2.3458649 -2.8900832  1.10996938 -0.12135931
## 4  0.5077569 -0.2879237 -0.6314181 -1.0596472 -0.68409279  1.96577500
## 5  1.3458516 -1.1196698  0.1751211 -0.4514492 -0.23703324 -0.03819479
## 6 -0.3580907 -0.1371337  0.5176168  0.4017259 -0.05813282  0.06865315
##           V19         V20          V21          V22         V23         V24
## 1  0.40399296  0.25141210 -0.018306778  0.277837576 -0.11047391  0.06692807
## 2 -0.14578304 -0.06908314 -0.225775248 -0.638671953  0.10128802 -0.33984648
## 3 -2.26185710  0.52497973  0.247998153  0.771679402  0.90941226 -0.68928096
## 4 -1.23262197 -0.20803778 -0.108300452  0.005273597 -0.19032052 -1.17557533
## 5  0.80348692  0.40854236 -0.009430697  0.798278495 -0.13745808  0.14126698
## 6 -0.03319379  0.08496767 -0.208253515 -0.559824796 -0.02639767 -0.37142658
##          V25        V26          V27         V28 Amount Class
## 1  0.1285394 -0.1891148  0.133558377 -0.02105305 149.62     0
## 2  0.1671704  0.1258945 -0.008983099  0.01472417   2.69     0
## 3 -0.3276418 -0.1390966 -0.055352794 -0.05975184 378.66     0
## 4  0.6473760 -0.2219288  0.062722849  0.06145763 123.50     0
## 5 -0.2060096  0.5022922  0.219422230  0.21515315  69.99     0
## 6 -0.2327938  0.1059148  0.253844225  0.08108026   3.67     0
# Per-column min / quartiles / mean / max of the raw data.
summary(object = creditcard)
##       Time              V1                  V2                  V3          
##  Min.   :     0   Min.   :-56.40751   Min.   :-72.71573   Min.   :-48.3256  
##  1st Qu.: 54202   1st Qu.: -0.92037   1st Qu.: -0.59855   1st Qu.: -0.8904  
##  Median : 84692   Median :  0.01811   Median :  0.06549   Median :  0.1799  
##  Mean   : 94814   Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.0000  
##  3rd Qu.:139321   3rd Qu.:  1.31564   3rd Qu.:  0.80372   3rd Qu.:  1.0272  
##  Max.   :172792   Max.   :  2.45493   Max.   : 22.05773   Max.   :  9.3826  
##        V4                 V5                   V6                 V7          
##  Min.   :-5.68317   Min.   :-113.74331   Min.   :-26.1605   Min.   :-43.5572  
##  1st Qu.:-0.84864   1st Qu.:  -0.69160   1st Qu.: -0.7683   1st Qu.: -0.5541  
##  Median :-0.01985   Median :  -0.05434   Median : -0.2742   Median :  0.0401  
##  Mean   : 0.00000   Mean   :   0.00000   Mean   :  0.0000   Mean   :  0.0000  
##  3rd Qu.: 0.74334   3rd Qu.:   0.61193   3rd Qu.:  0.3986   3rd Qu.:  0.5704  
##  Max.   :16.87534   Max.   :  34.80167   Max.   : 73.3016   Max.   :120.5895  
##        V8                  V9                 V10                 V11          
##  Min.   :-73.21672   Min.   :-13.43407   Min.   :-24.58826   Min.   :-4.79747  
##  1st Qu.: -0.20863   1st Qu.: -0.64310   1st Qu.: -0.53543   1st Qu.:-0.76249  
##  Median :  0.02236   Median : -0.05143   Median : -0.09292   Median :-0.03276  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.32735   3rd Qu.:  0.59714   3rd Qu.:  0.45392   3rd Qu.: 0.73959  
##  Max.   : 20.00721   Max.   : 15.59500   Max.   : 23.74514   Max.   :12.01891  
##       V12                V13                V14                V15          
##  Min.   :-18.6837   Min.   :-5.79188   Min.   :-19.2143   Min.   :-4.49894  
##  1st Qu.: -0.4056   1st Qu.:-0.64854   1st Qu.: -0.4256   1st Qu.:-0.58288  
##  Median :  0.1400   Median :-0.01357   Median :  0.0506   Median : 0.04807  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :  0.0000   Mean   : 0.00000  
##  3rd Qu.:  0.6182   3rd Qu.: 0.66251   3rd Qu.:  0.4931   3rd Qu.: 0.64882  
##  Max.   :  7.8484   Max.   : 7.12688   Max.   : 10.5268   Max.   : 8.87774  
##       V16                 V17                 V18           
##  Min.   :-14.12985   Min.   :-25.16280   Min.   :-9.498746  
##  1st Qu.: -0.46804   1st Qu.: -0.48375   1st Qu.:-0.498850  
##  Median :  0.06641   Median : -0.06568   Median :-0.003636  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.000000  
##  3rd Qu.:  0.52330   3rd Qu.:  0.39968   3rd Qu.: 0.500807  
##  Max.   : 17.31511   Max.   :  9.25353   Max.   : 5.041069  
##       V19                 V20                 V21           
##  Min.   :-7.213527   Min.   :-54.49772   Min.   :-34.83038  
##  1st Qu.:-0.456299   1st Qu.: -0.21172   1st Qu.: -0.22839  
##  Median : 0.003735   Median : -0.06248   Median : -0.02945  
##  Mean   : 0.000000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.458949   3rd Qu.:  0.13304   3rd Qu.:  0.18638  
##  Max.   : 5.591971   Max.   : 39.42090   Max.   : 27.20284  
##       V22                  V23                 V24          
##  Min.   :-10.933144   Min.   :-44.80774   Min.   :-2.83663  
##  1st Qu.: -0.542350   1st Qu.: -0.16185   1st Qu.:-0.35459  
##  Median :  0.006782   Median : -0.01119   Median : 0.04098  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.528554   3rd Qu.:  0.14764   3rd Qu.: 0.43953  
##  Max.   : 10.503090   Max.   : 22.52841   Max.   : 4.58455  
##       V25                 V26                V27            
##  Min.   :-10.29540   Min.   :-2.60455   Min.   :-22.565679  
##  1st Qu.: -0.31715   1st Qu.:-0.32698   1st Qu.: -0.070840  
##  Median :  0.01659   Median :-0.05214   Median :  0.001342  
##  Mean   :  0.00000   Mean   : 0.00000   Mean   :  0.000000  
##  3rd Qu.:  0.35072   3rd Qu.: 0.24095   3rd Qu.:  0.091045  
##  Max.   :  7.51959   Max.   : 3.51735   Max.   : 31.612198  
##       V28                Amount             Class         
##  Min.   :-15.43008   Min.   :    0.00   Min.   :0.000000  
##  1st Qu.: -0.05296   1st Qu.:    5.60   1st Qu.:0.000000  
##  Median :  0.01124   Median :   22.00   Median :0.000000  
##  Mean   :  0.00000   Mean   :   88.35   Mean   :0.001728  
##  3rd Qu.:  0.07828   3rd Qu.:   77.17   3rd Qu.:0.000000  
##  Max.   : 33.84781   Max.   :25691.16   Max.   :1.000000
# Draw 10 random rows for a spot check. No seed has been set at this point in
# the script, so the rows shown differ between runs.
dplyr::slice_sample(creditcard, n = 10)
##      Time         V1         V2          V3         V4         V5          V6
## 1   79649 -0.5765298  0.8024263  1.27146501  0.1551318 -0.1914559 -0.57444009
## 2  153752 -1.2796333 -0.3325638 -0.49623592  1.3706392  3.1041763 -1.30056006
## 3   26847  1.0662275 -1.0786974  1.47490021 -0.3269744 -2.1454943 -0.73947917
## 4  115881  2.0707735 -0.7420339 -0.85037206 -0.2657001 -0.6205071 -0.39380603
## 5  154242 -0.4388533  1.4496002 -3.65700592 -1.6115014  3.2679898  2.53153436
## 6  164485 -1.1345349  0.2330074  1.60286263 -1.0033010 -0.2469486  0.01208722
## 7  137613  1.1447911  0.7448279 -1.44164254  1.7417154  0.2651848 -0.79532923
## 8   45923 -0.2198232  1.4734856  0.02720225  0.9943687  0.1816129 -0.88070368
## 9  167101 -0.2021112  1.2635791 -0.75853482 -0.4856347  0.5375364 -0.91595348
## 10 153525  2.1374834  0.1469457 -2.44830610  0.3939755  0.6369531 -1.79325153
##            V7          V8         V9          V10         V11           V12
## 1   0.9264783  0.02719064 -0.4288248 -0.630036286  0.27101050 -9.344354e-05
## 2   0.5784788 -0.16639080 -0.7750215 -0.132521830  1.19199019  4.566840e-01
## 3  -1.0539963 -0.07165695 -0.3787547  0.542979832  0.02154368  4.425232e-01
## 4  -0.5541589 -0.12996139 -0.6372424  1.038047528  0.46730946  1.029260e+00
## 5   0.5549607  0.93943919 -0.3557659 -0.129309905  0.13975275 -2.019823e-01
## 6  -0.5166304  0.82723167  0.2739687 -1.080644753 -0.07642238  2.786068e-01
## 7   0.4729544 -1.22183824 -0.2489892 -0.005723708 -0.68016264 -5.114519e-01
## 8   0.6685907  0.20583637 -0.9100215 -0.097988780  0.04450857  9.811081e-01
## 9   0.6214350  0.32443698 -0.1868532 -1.055514056 -0.83556511 -6.328867e-02
## 10  0.9565911 -0.58404171  0.2963657  0.111534233 -1.04390809 -5.310429e-01
##            V13          V14         V15         V16         V17        V18
## 1  -0.11525551 -0.336486575  1.14809810  0.02174258  0.40847549 -0.6354767
## 2  -0.08561097 -1.023557634 -0.76225349 -0.64462091  0.99636262  0.8133687
## 3   1.26974682 -0.510824512  1.37606400 -0.80232562 -0.20000790  1.3706801
## 4   1.06633481  0.001936594 -0.65398993 -1.17052188 -0.57130923  1.4571431
## 5  -0.40863300 -0.234488318  0.24337944  0.02078957  0.51643635 -0.1583971
## 6  -0.91202238  0.207814480 -0.74620073  1.12278742 -0.91869630  0.7406828
## 7  -0.73647004 -0.391634159  0.94336192 -1.08704188  1.76733081  0.2494799
## 8   1.20781670  0.609009677  0.71731359 -0.88155715  0.34746978 -0.7928405
## 9  -0.16413103 -0.653463986 -0.25690696  0.45168054  0.56377628 -0.1632633
## 10 -1.68718695  1.188898610  0.08382977 -1.01226211  0.01757501 -0.4165197
##            V19         V20         V21        V22         V23         V24
## 1  -0.59347764  0.26715583 -0.16074423 -0.5369689  0.27690478  0.33308541
## 2   0.82741169  0.74496596  0.18896820  0.4530237  0.07630484  0.58969944
## 3  -1.41591263 -0.17199980 -0.04921762  0.1459888  0.02898353  0.98473896
## 4  -0.50106428 -0.50692390 -0.34304798 -0.4102081  0.22174795 -0.33692932
## 5  -0.71269657 -0.06158877  0.17068847  0.3917650  0.05075306  0.65002917
## 6  -0.54865616 -0.21818082  0.16132641  0.2063681 -0.12666844 -0.40200183
## 7   2.37499001  0.05110286  0.77161897 -0.0152730  0.08136853 -0.15536283
## 8  -0.11157577 -0.02759102  0.23906452  0.8758596  0.12157365  0.44968220
## 9  -0.25061016 -0.09560083 -0.29602508 -0.8558209  0.17208891  0.62183103
## 10  0.05806814 -0.38127555  0.27552886  0.9670000 -0.20182718 -0.04380673
##            V25         V26         V27          V28 Amount Class
## 1  -0.24409799  0.07905963  0.22866653  0.104994947  94.48     0
## 2   0.08361377 -0.45783806  0.21538069  0.086589276  48.46     0
## 3  -0.12723909  1.06709877 -0.01777489  0.054942803 125.90     0
## 4  -0.28306456  0.22319776 -0.01740954 -0.055963460  30.90     0
## 5  -0.23947845  0.52599873 -0.18199015  0.054466773  12.00     0
## 6  -0.41436283  0.27509475 -0.02461016  0.006444748   6.99     0
## 7  -0.24275518  0.72694944  0.30744396  0.381852932  49.99     0
## 8  -0.76178684 -0.40773742  0.33985738  0.170960151   2.69     0
## 9  -0.40821244  0.11241438  0.09367545  0.022464678   9.12     0
## 10  0.77375162  0.13503352 -0.08263250 -0.090162117   1.00     0
# Compact listing of the data frame: dimensions and the type of each column.
utils::str(creditcard)
## 'data.frame':    284807 obs. of  31 variables:
##  $ Time  : num  0 0 1 1 2 2 4 7 7 9 ...
##  $ V1    : num  -1.36 1.192 -1.358 -0.966 -1.158 ...
##  $ V2    : num  -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
##  $ V3    : num  2.536 0.166 1.773 1.793 1.549 ...
##  $ V4    : num  1.378 0.448 0.38 -0.863 0.403 ...
##  $ V5    : num  -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
##  $ V6    : num  0.4624 -0.0824 1.8005 1.2472 0.0959 ...
##  $ V7    : num  0.2396 -0.0788 0.7915 0.2376 0.5929 ...
##  $ V8    : num  0.0987 0.0851 0.2477 0.3774 -0.2705 ...
##  $ V9    : num  0.364 -0.255 -1.515 -1.387 0.818 ...
##  $ V10   : num  0.0908 -0.167 0.2076 -0.055 0.7531 ...
##  $ V11   : num  -0.552 1.613 0.625 -0.226 -0.823 ...
##  $ V12   : num  -0.6178 1.0652 0.0661 0.1782 0.5382 ...
##  $ V13   : num  -0.991 0.489 0.717 0.508 1.346 ...
##  $ V14   : num  -0.311 -0.144 -0.166 -0.288 -1.12 ...
##  $ V15   : num  1.468 0.636 2.346 -0.631 0.175 ...
##  $ V16   : num  -0.47 0.464 -2.89 -1.06 -0.451 ...
##  $ V17   : num  0.208 -0.115 1.11 -0.684 -0.237 ...
##  $ V18   : num  0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
##  $ V19   : num  0.404 -0.146 -2.262 -1.233 0.803 ...
##  $ V20   : num  0.2514 -0.0691 0.525 -0.208 0.4085 ...
##  $ V21   : num  -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
##  $ V22   : num  0.27784 -0.63867 0.77168 0.00527 0.79828 ...
##  $ V23   : num  -0.11 0.101 0.909 -0.19 -0.137 ...
##  $ V24   : num  0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
##  $ V25   : num  0.129 0.167 -0.328 0.647 -0.206 ...
##  $ V26   : num  -0.189 0.126 -0.139 -0.222 0.502 ...
##  $ V27   : num  0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
##  $ V28   : num  -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
##  $ Amount: num  149.62 2.69 378.66 123.5 69.99 ...
##  $ Class : int  0 0 0 0 0 0 0 0 0 0 ...
# Standardise Amount to zero mean and unit variance.
# scale() returns a one-column matrix; assigning that directly creates a matrix
# column inside the data frame (summary() then labels it "Amount.V1").
# as.numeric() drops the matrix structure so Amount stays a plain numeric
# vector.
# NOTE(review): the mean/sd are computed on the full data set, before the
# train/test split later in this script — consider estimating them on the
# training rows only.
creditcard$Amount <- as.numeric(
  scale(creditcard$Amount, center = TRUE, scale = TRUE)
)
summary(creditcard)
##       Time              V1                  V2                  V3          
##  Min.   :     0   Min.   :-56.40751   Min.   :-72.71573   Min.   :-48.3256  
##  1st Qu.: 54202   1st Qu.: -0.92037   1st Qu.: -0.59855   1st Qu.: -0.8904  
##  Median : 84692   Median :  0.01811   Median :  0.06549   Median :  0.1799  
##  Mean   : 94814   Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.0000  
##  3rd Qu.:139321   3rd Qu.:  1.31564   3rd Qu.:  0.80372   3rd Qu.:  1.0272  
##  Max.   :172792   Max.   :  2.45493   Max.   : 22.05773   Max.   :  9.3826  
##        V4                 V5                   V6                 V7          
##  Min.   :-5.68317   Min.   :-113.74331   Min.   :-26.1605   Min.   :-43.5572  
##  1st Qu.:-0.84864   1st Qu.:  -0.69160   1st Qu.: -0.7683   1st Qu.: -0.5541  
##  Median :-0.01985   Median :  -0.05434   Median : -0.2742   Median :  0.0401  
##  Mean   : 0.00000   Mean   :   0.00000   Mean   :  0.0000   Mean   :  0.0000  
##  3rd Qu.: 0.74334   3rd Qu.:   0.61193   3rd Qu.:  0.3986   3rd Qu.:  0.5704  
##  Max.   :16.87534   Max.   :  34.80167   Max.   : 73.3016   Max.   :120.5895  
##        V8                  V9                 V10                 V11          
##  Min.   :-73.21672   Min.   :-13.43407   Min.   :-24.58826   Min.   :-4.79747  
##  1st Qu.: -0.20863   1st Qu.: -0.64310   1st Qu.: -0.53543   1st Qu.:-0.76249  
##  Median :  0.02236   Median : -0.05143   Median : -0.09292   Median :-0.03276  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.32735   3rd Qu.:  0.59714   3rd Qu.:  0.45392   3rd Qu.: 0.73959  
##  Max.   : 20.00721   Max.   : 15.59500   Max.   : 23.74514   Max.   :12.01891  
##       V12                V13                V14                V15          
##  Min.   :-18.6837   Min.   :-5.79188   Min.   :-19.2143   Min.   :-4.49894  
##  1st Qu.: -0.4056   1st Qu.:-0.64854   1st Qu.: -0.4256   1st Qu.:-0.58288  
##  Median :  0.1400   Median :-0.01357   Median :  0.0506   Median : 0.04807  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :  0.0000   Mean   : 0.00000  
##  3rd Qu.:  0.6182   3rd Qu.: 0.66251   3rd Qu.:  0.4931   3rd Qu.: 0.64882  
##  Max.   :  7.8484   Max.   : 7.12688   Max.   : 10.5268   Max.   : 8.87774  
##       V16                 V17                 V18           
##  Min.   :-14.12985   Min.   :-25.16280   Min.   :-9.498746  
##  1st Qu.: -0.46804   1st Qu.: -0.48375   1st Qu.:-0.498850  
##  Median :  0.06641   Median : -0.06568   Median :-0.003636  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.000000  
##  3rd Qu.:  0.52330   3rd Qu.:  0.39968   3rd Qu.: 0.500807  
##  Max.   : 17.31511   Max.   :  9.25353   Max.   : 5.041069  
##       V19                 V20                 V21           
##  Min.   :-7.213527   Min.   :-54.49772   Min.   :-34.83038  
##  1st Qu.:-0.456299   1st Qu.: -0.21172   1st Qu.: -0.22839  
##  Median : 0.003735   Median : -0.06248   Median : -0.02945  
##  Mean   : 0.000000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.458949   3rd Qu.:  0.13304   3rd Qu.:  0.18638  
##  Max.   : 5.591971   Max.   : 39.42090   Max.   : 27.20284  
##       V22                  V23                 V24          
##  Min.   :-10.933144   Min.   :-44.80774   Min.   :-2.83663  
##  1st Qu.: -0.542350   1st Qu.: -0.16185   1st Qu.:-0.35459  
##  Median :  0.006782   Median : -0.01119   Median : 0.04098  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.528554   3rd Qu.:  0.14764   3rd Qu.: 0.43953  
##  Max.   : 10.503090   Max.   : 22.52841   Max.   : 4.58455  
##       V25                 V26                V27            
##  Min.   :-10.29540   Min.   :-2.60455   Min.   :-22.565679  
##  1st Qu.: -0.31715   1st Qu.:-0.32698   1st Qu.: -0.070840  
##  Median :  0.01659   Median :-0.05214   Median :  0.001342  
##  Mean   :  0.00000   Mean   : 0.00000   Mean   :  0.000000  
##  3rd Qu.:  0.35072   3rd Qu.: 0.24095   3rd Qu.:  0.091045  
##  Max.   :  7.51959   Max.   : 3.51735   Max.   : 31.612198  
##       V28                 Amount.V1          Class         
##  Min.   :-15.43008   Min.   : -0.35323   Min.   :0.000000  
##  1st Qu.: -0.05296   1st Qu.: -0.33084   1st Qu.:0.000000  
##  Median :  0.01124   Median : -0.26527   Median :0.000000  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   :0.001728  
##  3rd Qu.:  0.07828   3rd Qu.: -0.04472   3rd Qu.:0.000000  
##  Max.   : 33.84781   Max.   :102.36206   Max.   :1.000000

Baseline occurrence of fraud

# Number of transactions per value of Class (1 = fraud, 0 = otherwise).
credit_table <- table(creditcard[["Class"]])
print(credit_table)
## 
##      0      1 
## 284315    492
# Baseline fraud rate: fraud count divided by the total number of rows.
print(credit_table[2] / sum(credit_table))
##           1 
## 0.001727486
# Recode the 0/1 response as a labelled factor. The levels are given
# explicitly so that 0 -> "non_fraud" and 1 -> "fraud" by construction, rather
# than depending on how the intermediate make.names() strings ("X0", "X1")
# happen to sort. The labels are syntactically valid R names, which caret
# expects for the outcome when class probabilities are requested.
creditcard$Class <- factor(
  creditcard$Class,
  levels = c(0, 1),
  labels = c("non_fraud", "fraud")
)
# Remove the Time column (seconds since the first transaction) from the data.
creditcard <- subset(creditcard, select = -Time)

Split data

# 75/25 train/test split driven by Class; the seed fixes which rows are chosen.
set.seed(77)
partition <- caret::createDataPartition(
  y = creditcard$Class,
  p = 0.75,
  list = FALSE
)
imbal_train <- creditcard[partition, ]
imbal_test <- creditcard[-partition, ]
# Sanity check: share of all rows that ended up in the training set.
print(nrow(imbal_train) / (nrow(imbal_train) + nrow(imbal_test)))
## [1] 0.7500026

Different versions of training set

# Down-sampled training set: both classes end up with the minority-class count.
# The predictors are every column except the last one (Class).
set.seed(9560)
down_train <- caret::downSample(
  x = imbal_train[, seq_len(ncol(imbal_train) - 1L)],
  y = imbal_train$Class
)
table(down_train$Class)
## 
## non_fraud     fraud 
##       369       369
# Up-sampled training set: both classes end up with the majority-class count.
# The predictors are every column except the last one (Class).
set.seed(9560)
up_train <- caret::upSample(
  x = imbal_train[, seq_len(ncol(imbal_train) - 1L)],
  y = imbal_train$Class
)
table(up_train$Class)
## 
## non_fraud     fraud 
##    213237    213237
# SMOTE training set: over_ratio = 0.5 grows the fraud class to about half the
# size of the non_fraud class.
set.seed(9560)
smote_train <- themis::smote(
  df = imbal_train,
  var = "Class",
  over_ratio = 0.5
)
table(smote_train$Class)
## 
## non_fraud     fraud 
##    213237    106618
# Training set built with ROSE's ovun.sample(): method = "both" combines
# over- and under-sampling, with p = 0.5 as the target share of the rare class.
set.seed(9560)
rose_train <- ROSE::ovun.sample(
  Class ~ .,
  data = imbal_train,
  method = "both",
  p = 0.5
)$data
table(rose_train$Class)
## 
## non_fraud     fraud 
##    106996    106610

Train control parameters

# Resampling set-up passed to train(): 5-fold cross-validation, with class
# probabilities kept so that twoClassSummary can report ROC, Sens and Spec.
ctrl <- caret::trainControl(
  method = "cv",
  number = 5,
  summaryFunction = caret::twoClassSummary,
  classProbs = TRUE
)

Train Model: Extreme Gradient Boosting with L1 and L2 Regularization

In this training model, smote_train training data is used.

# Fit caret's "xgbLinear" model on the SMOTE-balanced training set.
# - The seed makes the cross-validation folds reproducible, in line with the
#   seeded resampling steps earlier in the script.
# - metric = "ROC" names the selection metric explicitly. ctrl uses
#   twoClassSummary, which reports ROC/Sens/Spec but no "Accuracy", so leaving
#   the metric at its default only produced a warning before ROC was used.
# - caret::train() is called by its qualified name and the fit is stored
#   directly in train_xgb instead of first overwriting the name `train`.
set.seed(9560)
train_xgb <- caret::train(
  Class ~ .,
  data = smote_train,
  method = "xgbLinear",
  metric = "ROC",
  trControl = ctrl
)
# Alias kept only so that any code still reading `train` keeps working.
train <- train_xgb

Predictions and probabilities

# Score the held-out test set once and reuse the result: the probability
# prediction over the full test set was previously recomputed for every
# derived vector.
prediction_probability_xgb <- predict(train_xgb, imbal_test, type = "prob")

# Predicted class labels as returned by the model.
prediction_raw_xgb <- predict(train_xgb, imbal_test, type = "raw")

# Pick the per-class probabilities by column name rather than by position, so
# the result does not depend on the order of the factor levels.
fraud_probs_xgb <- prediction_probability_xgb[["fraud"]]
non_fraud_probs_xgb <- prediction_probability_xgb[["non_fraud"]]

Confusion Matrix

# Hard labels at a 0.5 probability cut-off. The levels are fixed so the factor
# always carries both classes in the same order as the reference, even when no
# row (or every row) crosses the threshold.
pred_xgb <- factor(
  ifelse(fraud_probs_xgb >= 0.5, "fraud", "non_fraud"),
  levels = c("fraud", "non_fraud")
)

# Make "fraud" the first level of the raw predictions and of the test labels;
# the summaries computed after this step read the level order from here.
prediction_raw_xgb <- relevel(prediction_raw_xgb, ref = "fraud")
imbal_test$Class <- relevel(imbal_test$Class, ref = "fraud")

# After relevel() the reference already has levels c("fraud", "non_fraud"), so
# it is passed as is. The positive class is named explicitly instead of
# relying on the first-level default.
confusionMatrix(
  data = pred_xgb,
  reference = imbal_test$Class,
  positive = "fraud"
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  fraud non_fraud
##   fraud       101        24
##   non_fraud    22     71054
##                                           
##                Accuracy : 0.9994          
##                  95% CI : (0.9991, 0.9995)
##     No Information Rate : 0.9983          
##     P-Value [Acc > NIR] : 1.441e-15       
##                                           
##                   Kappa : 0.8142          
##                                           
##  Mcnemar's Test P-Value : 0.8828          
##                                           
##             Sensitivity : 0.821138        
##             Specificity : 0.999662        
##          Pos Pred Value : 0.808000        
##          Neg Pred Value : 0.999690        
##              Prevalence : 0.001728        
##          Detection Rate : 0.001419        
##    Detection Prevalence : 0.001756        
##       Balanced Accuracy : 0.910400        
##                                           
##        'Positive' Class : fraud           
## 
# Assemble the layout the caret summary functions read: observed class,
# predicted class, and one probability column per class.
dat_xgb <- data.frame(
  obs = imbal_test$Class,
  pred = prediction_raw_xgb,
  prediction_probability_xgb
)

# ROC AUC, sensitivity and specificity on the test set.
caret::twoClassSummary(dat_xgb, lev = levels(imbal_test$Class))
##       ROC      Sens      Spec 
## 0.9670741 0.8211382 0.9996623
# Precision-recall AUC, precision, recall and F on the test set.
caret::prSummary(dat_xgb, lev = levels(imbal_test$Class))
##       AUC Precision    Recall         F 
## 0.8388666 0.8080000 0.8211382 0.8145161
# Split the predicted fraud probabilities by the true class. The label column
# is addressed by name instead of the hard-coded position 30, which silently
# breaks as soon as a column is added or removed.
positive_xgb <- fraud_probs_xgb[imbal_test$Class == "fraud"]
negative_xgb <- fraud_probs_xgb[imbal_test$Class == "non_fraud"]

# Precision-recall curve; scores.class0 holds the scores of the positive class.
PRC <- PRROC::pr.curve(
  scores.class0 = positive_xgb,
  scores.class1 = negative_xgb,
  curve = TRUE
)
plot(PRC)

# ROC curve. PRROC:: is spelled out because ROSE, which this script also
# attaches, exports a function named roc.curve as well; the unqualified call
# only resolved to PRROC because of the library() order.
ROC <- PRROC::roc.curve(
  scores.class0 = positive_xgb,
  scores.class1 = negative_xgb,
  curve = TRUE
)
plot(ROC)

Another way of calculating ROC Curve and PR Curve

# Pair every predicted fraud probability with its true label.
prediction_probability_xgb_scores <- data.frame(
  event_prob = prediction_probability_xgb$fraud,
  labels = imbal_test$Class
)

# Group the scores by true class once, instead of filtering the data frame
# separately for every argument of every curve.
scores_by_class_xgb <- split(
  prediction_probability_xgb_scores$event_prob,
  prediction_probability_xgb_scores$labels
)

# curve = TRUE is written out in full: `T` is an ordinary variable that can be
# reassigned, TRUE cannot.
roc <- PRROC::roc.curve(
  scores.class0 = scores_by_class_xgb[["fraud"]],     # positive class
  scores.class1 = scores_by_class_xgb[["non_fraud"]], # negative class
  curve = TRUE
)
plot(roc)

pr <- PRROC::pr.curve(
  scores.class0 = scores_by_class_xgb[["fraud"]],     # positive class
  scores.class1 = scores_by_class_xgb[["non_fraud"]], # negative class
  curve = TRUE
)
plot(pr)

# Report the area under the precision-recall curve, rounded to 7 decimals.
paste(
  "Area under the Precision-Recall curve:",
  round(pr$auc.integral, digits = 7)
)
## [1] "Area under the Precision-Recall curve: 0.8469936"
# Report the area under the ROC curve, rounded to 7 decimals.
paste(
  "Area under the ROC curve:",
  round(roc$auc, digits = 7)
)
## [1] "Area under the ROC curve: 0.9670741"