The dataset contains transactions made by credit cards in September 2013 by European cardholders. It covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables, which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features or more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA; the only features that have not been transformed with PCA are ‘Time’ and ‘Amount’. Feature ‘Time’ contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature ‘Amount’ is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature ‘Class’ is the response variable; it takes the value 1 in case of fraud and 0 otherwise.
Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve (AUPRC). Confusion matrix accuracy is not meaningful for unbalanced classification.
You can download creditcard.csv from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
library(caret)
library(ggplot2)
library(dplyr)
library(corrplot)
library(tidyverse)
library(LiblineaR)
library(recipes)
library(themis)
library(kernlab)
library(pROC)
library(ROSE)
library(DMwR2)
library(h2o)
library(glmnet)
library(xgboost)
library(PRROC)
# Load the Kaggle credit card fraud data set and preview its first rows.
# The data directory defaults to the location the script originally used, but
# can be overridden with the CREDITCARD_DATA_DIR environment variable. Building
# the path with file.path() avoids changing the session's working directory
# with setwd(), which would silently affect every later relative path.
data_dir <- Sys.getenv(
  "CREDITCARD_DATA_DIR",
  unset = "C:/Users/ozge/Desktop/credit_card_deneme"
)
creditcard_path <- file.path(data_dir, "creditcard.csv")
# Fail early with a clear message instead of an opaque read.csv() error.
if (!file.exists(creditcard_path)) {
  stop("creditcard.csv not found at: ", creditcard_path, call. = FALSE)
}
creditcard <- read.csv(creditcard_path)
head(creditcard)
## Time V1 V2 V3 V4 V5 V6
## 1 0 -1.3598071 -0.07278117 2.5363467 1.3781552 -0.33832077 0.46238778
## 2 0 1.1918571 0.26615071 0.1664801 0.4481541 0.06001765 -0.08236081
## 3 1 -1.3583541 -1.34016307 1.7732093 0.3797796 -0.50319813 1.80049938
## 4 1 -0.9662717 -0.18522601 1.7929933 -0.8632913 -0.01030888 1.24720317
## 5 2 -1.1582331 0.87773675 1.5487178 0.4030339 -0.40719338 0.09592146
## 6 2 -0.4259659 0.96052304 1.1411093 -0.1682521 0.42098688 -0.02972755
## V7 V8 V9 V10 V11 V12
## 1 0.23959855 0.09869790 0.3637870 0.09079417 -0.5515995 -0.61780086
## 2 -0.07880298 0.08510165 -0.2554251 -0.16697441 1.6127267 1.06523531
## 3 0.79146096 0.24767579 -1.5146543 0.20764287 0.6245015 0.06608369
## 4 0.23760894 0.37743587 -1.3870241 -0.05495192 -0.2264873 0.17822823
## 5 0.59294075 -0.27053268 0.8177393 0.75307443 -0.8228429 0.53819555
## 6 0.47620095 0.26031433 -0.5686714 -0.37140720 1.3412620 0.35989384
## V13 V14 V15 V16 V17 V18
## 1 -0.9913898 -0.3111694 1.4681770 -0.4704005 0.20797124 0.02579058
## 2 0.4890950 -0.1437723 0.6355581 0.4639170 -0.11480466 -0.18336127
## 3 0.7172927 -0.1659459 2.3458649 -2.8900832 1.10996938 -0.12135931
## 4 0.5077569 -0.2879237 -0.6314181 -1.0596472 -0.68409279 1.96577500
## 5 1.3458516 -1.1196698 0.1751211 -0.4514492 -0.23703324 -0.03819479
## 6 -0.3580907 -0.1371337 0.5176168 0.4017259 -0.05813282 0.06865315
## V19 V20 V21 V22 V23 V24
## 1 0.40399296 0.25141210 -0.018306778 0.277837576 -0.11047391 0.06692807
## 2 -0.14578304 -0.06908314 -0.225775248 -0.638671953 0.10128802 -0.33984648
## 3 -2.26185710 0.52497973 0.247998153 0.771679402 0.90941226 -0.68928096
## 4 -1.23262197 -0.20803778 -0.108300452 0.005273597 -0.19032052 -1.17557533
## 5 0.80348692 0.40854236 -0.009430697 0.798278495 -0.13745808 0.14126698
## 6 -0.03319379 0.08496767 -0.208253515 -0.559824796 -0.02639767 -0.37142658
## V25 V26 V27 V28 Amount Class
## 1 0.1285394 -0.1891148 0.133558377 -0.02105305 149.62 0
## 2 0.1671704 0.1258945 -0.008983099 0.01472417 2.69 0
## 3 -0.3276418 -0.1390966 -0.055352794 -0.05975184 378.66 0
## 4 0.6473760 -0.2219288 0.062722849 0.06145763 123.50 0
## 5 -0.2060096 0.5022922 0.219422230 0.21515315 69.99 0
## 6 -0.2327938 0.1059148 0.253844225 0.08108026 3.67 0
# Per-column summary statistics (min, quartiles, mean, max) of the raw data.
summary(creditcard)
## Time V1 V2 V3
## Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904
## Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799
## Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272
## Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826
## V4 V5 V6 V7
## Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572
## 1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541
## Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704
## Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895
## V8 V9 V10 V11
## Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14 V15
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288
## Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774
## V16 V17 V18
## Min. :-14.12985 Min. :-25.16280 Min. :-9.498746
## 1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850
## Median : 0.06641 Median : -0.06568 Median :-0.003636
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807
## Max. : 17.31511 Max. : 9.25353 Max. : 5.041069
## V19 V20 V21
## Min. :-7.213527 Min. :-54.49772 Min. :-34.83038
## 1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839
## Median : 0.003735 Median : -0.06248 Median : -0.02945
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638
## Max. : 5.591971 Max. : 39.42090 Max. : 27.20284
## V22 V23 V24
## Min. :-10.933144 Min. :-44.80774 Min. :-2.83663
## 1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459
## Median : 0.006782 Median : -0.01119 Median : 0.04098
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953
## Max. : 10.503090 Max. : 22.52841 Max. : 4.58455
## V25 V26 V27
## Min. :-10.29540 Min. :-2.60455 Min. :-22.565679
## 1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840
## Median : 0.01659 Median :-0.05214 Median : 0.001342
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045
## Max. : 7.51959 Max. : 3.51735 Max. : 31.612198
## V28 Amount Class
## Min. :-15.43008 Min. : 0.00 Min. :0.000000
## 1st Qu.: -0.05296 1st Qu.: 5.60 1st Qu.:0.000000
## Median : 0.01124 Median : 22.00 Median :0.000000
## Mean : 0.00000 Mean : 88.35 Mean :0.001728
## 3rd Qu.: 0.07828 3rd Qu.: 77.17 3rd Qu.:0.000000
## Max. : 33.84781 Max. :25691.16 Max. :1.000000
# Show 10 randomly drawn rows. No seed is set before this call, so the rows
# shown differ from run to run.
slice_sample(creditcard, n=10)
## Time V1 V2 V3 V4 V5 V6
## 1 79649 -0.5765298 0.8024263 1.27146501 0.1551318 -0.1914559 -0.57444009
## 2 153752 -1.2796333 -0.3325638 -0.49623592 1.3706392 3.1041763 -1.30056006
## 3 26847 1.0662275 -1.0786974 1.47490021 -0.3269744 -2.1454943 -0.73947917
## 4 115881 2.0707735 -0.7420339 -0.85037206 -0.2657001 -0.6205071 -0.39380603
## 5 154242 -0.4388533 1.4496002 -3.65700592 -1.6115014 3.2679898 2.53153436
## 6 164485 -1.1345349 0.2330074 1.60286263 -1.0033010 -0.2469486 0.01208722
## 7 137613 1.1447911 0.7448279 -1.44164254 1.7417154 0.2651848 -0.79532923
## 8 45923 -0.2198232 1.4734856 0.02720225 0.9943687 0.1816129 -0.88070368
## 9 167101 -0.2021112 1.2635791 -0.75853482 -0.4856347 0.5375364 -0.91595348
## 10 153525 2.1374834 0.1469457 -2.44830610 0.3939755 0.6369531 -1.79325153
## V7 V8 V9 V10 V11 V12
## 1 0.9264783 0.02719064 -0.4288248 -0.630036286 0.27101050 -9.344354e-05
## 2 0.5784788 -0.16639080 -0.7750215 -0.132521830 1.19199019 4.566840e-01
## 3 -1.0539963 -0.07165695 -0.3787547 0.542979832 0.02154368 4.425232e-01
## 4 -0.5541589 -0.12996139 -0.6372424 1.038047528 0.46730946 1.029260e+00
## 5 0.5549607 0.93943919 -0.3557659 -0.129309905 0.13975275 -2.019823e-01
## 6 -0.5166304 0.82723167 0.2739687 -1.080644753 -0.07642238 2.786068e-01
## 7 0.4729544 -1.22183824 -0.2489892 -0.005723708 -0.68016264 -5.114519e-01
## 8 0.6685907 0.20583637 -0.9100215 -0.097988780 0.04450857 9.811081e-01
## 9 0.6214350 0.32443698 -0.1868532 -1.055514056 -0.83556511 -6.328867e-02
## 10 0.9565911 -0.58404171 0.2963657 0.111534233 -1.04390809 -5.310429e-01
## V13 V14 V15 V16 V17 V18
## 1 -0.11525551 -0.336486575 1.14809810 0.02174258 0.40847549 -0.6354767
## 2 -0.08561097 -1.023557634 -0.76225349 -0.64462091 0.99636262 0.8133687
## 3 1.26974682 -0.510824512 1.37606400 -0.80232562 -0.20000790 1.3706801
## 4 1.06633481 0.001936594 -0.65398993 -1.17052188 -0.57130923 1.4571431
## 5 -0.40863300 -0.234488318 0.24337944 0.02078957 0.51643635 -0.1583971
## 6 -0.91202238 0.207814480 -0.74620073 1.12278742 -0.91869630 0.7406828
## 7 -0.73647004 -0.391634159 0.94336192 -1.08704188 1.76733081 0.2494799
## 8 1.20781670 0.609009677 0.71731359 -0.88155715 0.34746978 -0.7928405
## 9 -0.16413103 -0.653463986 -0.25690696 0.45168054 0.56377628 -0.1632633
## 10 -1.68718695 1.188898610 0.08382977 -1.01226211 0.01757501 -0.4165197
## V19 V20 V21 V22 V23 V24
## 1 -0.59347764 0.26715583 -0.16074423 -0.5369689 0.27690478 0.33308541
## 2 0.82741169 0.74496596 0.18896820 0.4530237 0.07630484 0.58969944
## 3 -1.41591263 -0.17199980 -0.04921762 0.1459888 0.02898353 0.98473896
## 4 -0.50106428 -0.50692390 -0.34304798 -0.4102081 0.22174795 -0.33692932
## 5 -0.71269657 -0.06158877 0.17068847 0.3917650 0.05075306 0.65002917
## 6 -0.54865616 -0.21818082 0.16132641 0.2063681 -0.12666844 -0.40200183
## 7 2.37499001 0.05110286 0.77161897 -0.0152730 0.08136853 -0.15536283
## 8 -0.11157577 -0.02759102 0.23906452 0.8758596 0.12157365 0.44968220
## 9 -0.25061016 -0.09560083 -0.29602508 -0.8558209 0.17208891 0.62183103
## 10 0.05806814 -0.38127555 0.27552886 0.9670000 -0.20182718 -0.04380673
## V25 V26 V27 V28 Amount Class
## 1 -0.24409799 0.07905963 0.22866653 0.104994947 94.48 0
## 2 0.08361377 -0.45783806 0.21538069 0.086589276 48.46 0
## 3 -0.12723909 1.06709877 -0.01777489 0.054942803 125.90 0
## 4 -0.28306456 0.22319776 -0.01740954 -0.055963460 30.90 0
## 5 -0.23947845 0.52599873 -0.18199015 0.054466773 12.00 0
## 6 -0.41436283 0.27509475 -0.02461016 0.006444748 6.99 0
## 7 -0.24275518 0.72694944 0.30744396 0.381852932 49.99 0
## 8 -0.76178684 -0.40773742 0.33985738 0.170960151 2.69 0
## 9 -0.40821244 0.11241438 0.09367545 0.022464678 9.12 0
## 10 0.77375162 0.13503352 -0.08263250 -0.090162117 1.00 0
# Inspect column types; the output below shows 30 numeric columns plus an
# integer Class column.
str(creditcard)
## 'data.frame': 284807 obs. of 31 variables:
## $ Time : num 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
# Standardise Amount to zero mean and unit variance.
# scale() returns a one-column matrix; assigning that directly creates a matrix
# column inside the data frame (summary() then labels it "Amount.V1").
# as.numeric() drops the matrix structure so Amount stays a plain numeric
# vector like every other predictor.
# NOTE(review): the mean and sd are computed on the full data set, before the
# train/test split further down, so test rows influence the scaling. Consider
# fitting the scaler on the training partition only.
creditcard$Amount <- as.numeric(
  scale(creditcard$Amount, center = TRUE, scale = TRUE)
)
summary(creditcard)
## Time V1 V2 V3
## Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904
## Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799
## Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272
## Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826
## V4 V5 V6 V7
## Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572
## 1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541
## Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704
## Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895
## V8 V9 V10 V11
## Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14 V15
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288
## Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774
## V16 V17 V18
## Min. :-14.12985 Min. :-25.16280 Min. :-9.498746
## 1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850
## Median : 0.06641 Median : -0.06568 Median :-0.003636
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807
## Max. : 17.31511 Max. : 9.25353 Max. : 5.041069
## V19 V20 V21
## Min. :-7.213527 Min. :-54.49772 Min. :-34.83038
## 1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839
## Median : 0.003735 Median : -0.06248 Median : -0.02945
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638
## Max. : 5.591971 Max. : 39.42090 Max. : 27.20284
## V22 V23 V24
## Min. :-10.933144 Min. :-44.80774 Min. :-2.83663
## 1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459
## Median : 0.006782 Median : -0.01119 Median : 0.04098
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953
## Max. : 10.503090 Max. : 22.52841 Max. : 4.58455
## V25 V26 V27
## Min. :-10.29540 Min. :-2.60455 Min. :-22.565679
## 1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840
## Median : 0.01659 Median :-0.05214 Median : 0.001342
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045
## Max. : 7.51959 Max. : 3.51735 Max. : 31.612198
## V28 Amount.V1 Class
## Min. :-15.43008 Min. : -0.35323 Min. :0.000000
## 1st Qu.: -0.05296 1st Qu.: -0.33084 1st Qu.:0.000000
## Median : 0.01124 Median : -0.26527 Median :0.000000
## Mean : 0.00000 Mean : 0.00000 Mean :0.001728
## 3rd Qu.: 0.07828 3rd Qu.: -0.04472 3rd Qu.:0.000000
## Max. : 33.84781 Max. :102.36206 Max. :1.000000
# Count the transactions in each class (0 = legitimate, 1 = fraud).
credit_table <- table(creditcard[["Class"]])
print(credit_table)
##
## 0 1
## 284315 492
# Share of fraudulent transactions: the class-1 count over the total count.
print(credit_table[2] / sum(credit_table))
## 1
## 0.001727486
# Recode the 0/1 response as a factor with caret-friendly level names.
# The level-to-label mapping is stated explicitly (0 -> non_fraud, 1 -> fraud)
# instead of relying on the alphabetical ordering of make.names() output
# ("X0", "X1"), so the labels can never be attached to the wrong class.
creditcard$Class <- factor(
  creditcard$Class,
  levels = c(0, 1),
  labels = c("non_fraud", "fraud")
)
# Drop Time so it is not used as a predictor.
creditcard <- subset(creditcard, select = -Time)
# Stratified 75/25 train/test split on Class; the seed makes it reproducible.
set.seed(77)
partition <- caret::createDataPartition(
  y = creditcard[["Class"]],
  p = 0.75,
  list = FALSE
)
imbal_train <- creditcard[partition, ]
imbal_test <- creditcard[-partition, ]
# Sanity check: fraction of all rows that ended up in the training set.
print(nrow(imbal_train) / nrow(creditcard))
## [1] 0.7500026
# Balance the training set by down-sampling the majority class.
# Class is the last column, so dropping it leaves only the predictors.
set.seed(9560)
down_train <- caret::downSample(
  x = imbal_train[-ncol(imbal_train)],
  y = imbal_train[["Class"]]
)
table(down_train[["Class"]])
##
## non_fraud fraud
## 369 369
# Balance the training set by up-sampling the minority class.
# Class is the last column, so dropping it leaves only the predictors.
set.seed(9560)
up_train <- caret::upSample(
  x = imbal_train[-ncol(imbal_train)],
  y = imbal_train[["Class"]]
)
table(up_train[["Class"]])
##
## non_fraud fraud
## 213237 213237
# Create synthetic minority-class rows with SMOTE. With over_ratio = 0.5 the
# fraud class ends up at about half the size of the non-fraud class (see the
# counts printed below).
set.seed(9560)
smote_train <- themis::smote(imbal_train, var = "Class", over_ratio = 0.5)
table(smote_train[["Class"]])
##
## non_fraud fraud
## 213237 106618
# Resample with ROSE's ovun.sample() using method = "both" and p = 0.5; the
# counts printed below show a roughly balanced set of the same total size as
# imbal_train.
set.seed(9560)
rose_train <- ROSE::ovun.sample(
  Class ~ .,
  data = imbal_train,
  method = "both",
  p = 0.5
)[["data"]]
table(rose_train[["Class"]])
##
## non_fraud fraud
## 106996 106610
# Resampling setup for caret: 5-fold cross-validation that keeps class
# probabilities so twoClassSummary can report ROC, sensitivity and specificity.
ctrl <- caret::trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
The model below is trained on smote_train, the SMOTE-resampled training set.
# Fit caret's "xgbLinear" model on the SMOTE-resampled training data.
# metric = "ROC" is stated explicitly: ctrl uses twoClassSummary, which does
# not report Accuracy, so without it caret warns that "Accuracy" is not in the
# result set and falls back to ROC anyway.
# The call is namespace-qualified so it still resolves to caret's function
# even though a variable named `train` exists in the workspace.
# NOTE(review): smote_train was resampled before cross-validation, so synthetic
# rows can land in the validation folds; the CV estimate may be optimistic.
train_xgb <- caret::train(
  Class ~ .,
  data = smote_train,
  method = "xgbLinear",
  trControl = ctrl,
  metric = "ROC"
)
# Alias kept only because the original script defined `train`; prefer
# train_xgb in new code to avoid shadowing caret::train.
train <- train_xgb
# Score the held-out test set and build the confusion matrix.
# Class probabilities are predicted once and reused; the columns are selected
# by name rather than by position so a change in level order cannot swap them.
prediction_probability_xgb <- predict(train_xgb, imbal_test, type = "prob")
prediction_raw_xgb <- predict(train_xgb, imbal_test, type = "raw")
fraud_probs_xgb <- prediction_probability_xgb[["fraud"]]
non_fraud_probs_xgb <- prediction_probability_xgb[["non_fraud"]]
# Explicit levels keep both classes present even if every prediction falls on
# one side of the 0.5 threshold; confusionMatrix() needs matching level sets.
pred_xgb <- factor(
  ifelse(fraud_probs_xgb >= 0.5, "fraud", "non_fraud"),
  levels = c("fraud", "non_fraud")
)
# Make "fraud" the first level so it is treated as the positive class here and
# in the summary functions that read levels(imbal_test$Class) later on.
prediction_raw_xgb <- relevel(prediction_raw_xgb, ref = "fraud")
imbal_test$Class <- relevel(imbal_test$Class, ref = "fraud")
confusionMatrix(
  data = pred_xgb,
  reference = imbal_test$Class,
  positive = "fraud"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fraud non_fraud
## fraud 101 24
## non_fraud 22 71054
##
## Accuracy : 0.9994
## 95% CI : (0.9991, 0.9995)
## No Information Rate : 0.9983
## P-Value [Acc > NIR] : 1.441e-15
##
## Kappa : 0.8142
##
## Mcnemar's Test P-Value : 0.8828
##
## Sensitivity : 0.821138
## Specificity : 0.999662
## Pos Pred Value : 0.808000
## Neg Pred Value : 0.999690
## Prevalence : 0.001728
## Detection Rate : 0.001419
## Detection Prevalence : 0.001756
## Balanced Accuracy : 0.910400
##
## 'Positive' Class : fraud
##
# Assemble observed class, predicted class and per-class probabilities in the
# layout caret's summary functions expect, then report ROC AUC, sensitivity
# and specificity on the test set.
dat_xgb <- data.frame(
  obs = imbal_test[["Class"]],
  pred = prediction_raw_xgb,
  prediction_probability_xgb
)
caret::twoClassSummary(data = dat_xgb, lev = levels(imbal_test[["Class"]]))
## ROC Sens Spec
## 0.9670741 0.8211382 0.9996623
# Precision-recall AUC, precision, recall and F score on the test set.
caret::prSummary(data = dat_xgb, lev = levels(imbal_test[["Class"]]))
## AUC Precision Recall F
## 0.8388666 0.8080000 0.8211382 0.8145161
# Split the predicted fraud probabilities by true class and plot the
# precision-recall and ROC curves with PRROC.
# The response is referenced by name instead of the hard-coded column index
# 30, which would silently pick the wrong column if the column layout changed.
positive_xgb <- fraud_probs_xgb[imbal_test$Class == "fraud"]
negative_xgb <- fraud_probs_xgb[imbal_test$Class == "non_fraud"]
# scores.class0 holds the scores of the positive (fraud) class and
# scores.class1 those of the negative class; naming the arguments makes the
# order explicit.
PRC <- PRROC::pr.curve(
  scores.class0 = positive_xgb,
  scores.class1 = negative_xgb,
  curve = TRUE
)
plot(PRC)
ROC <- PRROC::roc.curve(
  scores.class0 = positive_xgb,
  scores.class1 = negative_xgb,
  curve = TRUE
)
plot(ROC)
# Rebuild the ROC and precision-recall curves from a score/label data frame
# and report the area under the precision-recall curve.
prediction_probability_xgb_scores <- data.frame(
  event_prob = prediction_probability_xgb$fraud,
  labels = imbal_test$Class
)
# Subset once per class instead of repeating the same filter in every call.
event_scores_fraud <- prediction_probability_xgb_scores$event_prob[
  prediction_probability_xgb_scores$labels == "fraud"
]
event_scores_non_fraud <- prediction_probability_xgb_scores$event_prob[
  prediction_probability_xgb_scores$labels == "non_fraud"
]
# curve = TRUE (not T): T is an ordinary variable that can be reassigned.
roc <- PRROC::roc.curve(
  scores.class0 = event_scores_fraud, # positive class
  scores.class1 = event_scores_non_fraud, # negative class
  curve = TRUE
)
plot(roc)
pr <- PRROC::pr.curve(
  scores.class0 = event_scores_fraud, # positive class
  scores.class1 = event_scores_non_fraud, # negative class
  curve = TRUE
)
plot(pr)
paste("Area under the Precision-Recall curve:", round(pr$auc.integral, 7))
## [1] "Area under the Precision-Recall curve: 0.8469936"
# Report the area under the ROC curve, rounded to 7 decimal places.
paste("Area under the ROC curve:", round(roc[["auc"]], 7))
## [1] "Area under the ROC curve: 0.9670741"