library(readr)
library(DataExplorer)
library(skimr)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(skimr)
library(rpart)
library(rpart.plot)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.2.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x randomForest::combine() masks dplyr::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
library(e1071)
library(ipred)
The data I am going to analyze comes from Kaggle: https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset. It is a large data set, consisting of 6,362,620 observations and 11 variables. Because the data set has so many observations, it makes sense to narrow it down for efficiency. The purpose of this project is to build models that detect whether an online payment is fraudulent, to provide better insight for financial institutions. The selected models are a random forest model and a Bootstrap Aggregating (bagging) model. The attributes/variables are described below.
step: represents a unit of time where 1 step equals 1 hour
type: type of online transaction (five types: PAYMENT, TRANSFER, CASH_OUT, CASH_IN, DEBIT)
amount: the amount of the transaction
nameOrig: customer starting the transaction
oldbalanceOrg: balance before the transaction
newbalanceOrig: balance after the transaction
nameDest: recipient of the transaction
oldbalanceDest: initial balance of recipient before the transaction
newbalanceDest: the new balance of the recipient after the transaction
isFraud: whether the transaction is fraudulent (1 represents a fraudulent transaction, 0 represents a legitimate one)
isFlaggedFraud: whether the transaction was flagged by the system as potentially fraudulent
data<-read_csv('data.csv', col_types = "nfnfnnfnnff")
dim(data)
## [1] 6362620 11
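In the col_types string, each letter maps to one column in order: n stands for col_number() and f for col_factor(). An equivalent, more explicit specification (shown only as an illustration, not re-run here) would be:

# Equivalent explicit column specification for col_types = "nfnfnnfnnff"
# (n = col_number(), f = col_factor()); illustrative only, not re-run here.
spec <- cols(
  step = col_number(),           type = col_factor(),
  amount = col_number(),         nameOrig = col_factor(),
  oldbalanceOrg = col_number(),  newbalanceOrig = col_number(),
  nameDest = col_factor(),       oldbalanceDest = col_number(),
  newbalanceDest = col_number(), isFraud = col_factor(),
  isFlaggedFraud = col_factor()
)
# data <- read_csv('data.csv', col_types = spec)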
head(data)
## # A tibble: 6 × 11
## step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest
## <dbl> <fct> <dbl> <fct> <dbl> <dbl> <fct>
## 1 1 PAYMENT 9840. C1231006815 170136 160296. M1979787155
## 2 1 PAYMENT 1864. C1666544295 21249 19385. M2044282225
## 3 1 TRANSFER 181 C1305486145 181 0 C553264065
## 4 1 CASH_OUT 181 C840083671 181 0 C38997010
## 5 1 PAYMENT 11668. C2048537720 41554 29886. M1230701703
## 6 1 PAYMENT 7818. C90045638 53860 46042. M573487274
## # … with 4 more variables: oldbalanceDest <dbl>, newbalanceDest <dbl>,
## # isFraud <fct>, isFlaggedFraud <fct>
summary(data)
## step type amount nameOrig
## Min. : 1.0 PAYMENT :2151495 Min. : 0 C2098525306: 3
## 1st Qu.:156.0 TRANSFER: 532909 1st Qu.: 13390 C400299098 : 3
## Median :239.0 CASH_OUT:2237500 Median : 74872 C1999539787: 3
## Mean :243.4 DEBIT : 41432 Mean : 179862 C1065307291: 3
## 3rd Qu.:335.0 CASH_IN :1399284 3rd Qu.: 208721 C545315117 : 3
## Max. :743.0 Max. :92445517 C1976208114: 3
## (Other) :6362602
## oldbalanceOrg newbalanceOrig nameDest
## Min. : 0 Min. : 0 C1286084959: 113
## 1st Qu.: 0 1st Qu.: 0 C985934102 : 109
## Median : 14208 Median : 0 C665576141 : 105
## Mean : 833883 Mean : 855114 C2083562754: 102
## 3rd Qu.: 107315 3rd Qu.: 144258 C248609774 : 101
## Max. :59585040 Max. :49585040 C1590550415: 101
## (Other) :6361989
## oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## Min. : 0 Min. : 0 0:6354407 0:6362604
## 1st Qu.: 0 1st Qu.: 0 1: 8213 1: 16
## Median : 132706 Median : 214661
## Mean : 1100702 Mean : 1224996
## 3rd Qu.: 943037 3rd Qu.: 1111909
## Max. :356015889 Max. :356179279
##
str(data)
## spec_tbl_df [6,362,620 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ step : num [1:6362620] 1 1 1 1 1 1 1 1 1 1 ...
## $ type : Factor w/ 5 levels "PAYMENT","TRANSFER",..: 1 1 2 3 1 1 1 1 1 4 ...
## $ amount : num [1:6362620] 9840 1864 181 181 11668 ...
## $ nameOrig : Factor w/ 6353307 levels "C1231006815",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ oldbalanceOrg : num [1:6362620] 170136 21249 181 181 41554 ...
## $ newbalanceOrig: num [1:6362620] 160296 19385 0 0 29886 ...
## $ nameDest : Factor w/ 2722362 levels "M1979787155",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ oldbalanceDest: num [1:6362620] 0 0 0 21182 0 ...
## $ newbalanceDest: num [1:6362620] 0 0 0 0 0 ...
## $ isFraud : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
## $ isFlaggedFraud: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. step = col_number(),
## .. type = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. amount = col_number(),
## .. nameOrig = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. oldbalanceOrg = col_number(),
## .. newbalanceOrig = col_number(),
## .. nameDest = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. oldbalanceDest = col_number(),
## .. newbalanceDest = col_number(),
## .. isFraud = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. isFlaggedFraud = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
## .. )
## - attr(*, "problems")=<externalptr>
As the summary() and str() output shows, the data set does not appear to have any NA or missing values.
plot_missing(data)
As the missing-value plot confirms, the data set has no missing values, consistent with the str() output above.
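For an explicit numeric check (a small addition to the original analysis), the NA count per column can also be printed directly:

# Count of NA values in each column; every count should be zero here.
colSums(is.na(data))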
skim(data)

| Data summary | |
|---|---|
| Name | data |
| Number of rows | 6362620 |
| Number of columns | 11 |
| Column type frequency: | |
| factor | 5 |
| numeric | 6 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| type | 0 | 1 | FALSE | 5 | CAS: 2237500, PAY: 2151495, CAS: 1399284, TRA: 532909 |
| nameOrig | 0 | 1 | FALSE | 6353307 | C20: 3, C40: 3, C19: 3, C10: 3 |
| nameDest | 0 | 1 | FALSE | 2722362 | C12: 113, C98: 109, C66: 105, C20: 102 |
| isFraud | 0 | 1 | FALSE | 2 | 0: 6354407, 1: 8213 |
| isFlaggedFraud | 0 | 1 | FALSE | 2 | 0: 6362604, 1: 16 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| step | 0 | 1 | 243.4 | 142.33 | 1 | 156.00 | 239.00 | 335.0 | 743 | ▅▇▆▁▁ |
| amount | 0 | 1 | 179861.9 | 603858.23 | 0 | 13389.57 | 74871.94 | 208721.5 | 92445517 | ▇▁▁▁▁ |
| oldbalanceOrg | 0 | 1 | 833883.1 | 2888242.67 | 0 | 0.00 | 14208.00 | 107315.2 | 59585040 | ▇▁▁▁▁ |
| newbalanceOrig | 0 | 1 | 855113.7 | 2924048.50 | 0 | 0.00 | 0.00 | 144258.4 | 49585040 | ▇▁▁▁▁ |
| oldbalanceDest | 0 | 1 | 1100701.7 | 3399180.11 | 0 | 0.00 | 132705.66 | 943036.7 | 356015889 | ▇▁▁▁▁ |
| newbalanceDest | 0 | 1 | 1224996.4 | 3674128.94 | 0 | 0.00 | 214661.44 | 1111909.2 | 356179279 | ▇▁▁▁▁ |
The skim() function provides overall summary statistics, and it shows that the isFraud variable has a class imbalance problem: there are far more Not Fraud transactions than Fraud transactions. The imbalance should be fixed before building the models to avoid bias.
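To quantify the imbalance (a quick check added here), the class proportions can be printed directly; fraud cases make up only about 0.13% of all transactions (8,213 of 6,362,620).

# Class proportions of isFraud before balancing, in percent.
round(prop.table(table(data$isFraud)), 4) * 100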
ggplot(data, aes(x=isFraud, fill=isFraud))+geom_bar()+scale_y_log10()+
  ggtitle('The Fraud Transactions')+xlab('Fraud')+ylab('Log Count')+scale_fill_discrete(labels=c('Not Fraud', 'Fraud'))
Among the transaction types, CASH_OUT ranks first, followed by PAYMENT and CASH_IN; DEBIT is the least common type.
ggplot(data, aes(x=type, fill=type))+geom_bar()+ggtitle('The Type of Transactions')
According to the following plot, most of the fraud comes from the TRANSFER and CASH_OUT transaction types.
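Before plotting, the same pattern can be checked numerically with a cross-tabulation of transaction type against isFraud (a quick check added for illustration):

# Counts of non-fraud (0) and fraud (1) transactions within each type.
table(data$type, data$isFraud)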
ggplot(data, aes(x=type, y= isFraud, fill = isFraud)) +
  geom_bar(stat="identity")+scale_fill_discrete(labels=c('Not Fraud', 'Fraud'))
The transaction histogram shows that the distribution of transaction amounts is right-skewed, meaning most transactions involve relatively small amounts.
ggplot(data, aes(x=amount))+geom_histogram()+scale_y_log10()+ggtitle('Histogram of Transactions')+ylab('Log Count')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 5 rows containing missing values (geom_bar).
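Because the amounts are so heavily right-skewed, a log-scaled x-axis can make the histogram easier to read. One possible variant (an optional addition; zero-amount rows are dropped since log10(0) is undefined, and bins = 50 is an arbitrary choice):

# Alternative view: histogram of transaction amount on a log10 x-axis.
data %>%
  filter(amount > 0) %>%
  ggplot(aes(x = amount)) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  ggtitle('Histogram of Transaction Amounts (log scale)') +
  xlab('Amount') + ylab('Count')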
According to the correlation plot of the numeric variables, oldbalanceOrg is highly correlated with newbalanceOrig, and newbalanceDest is slightly correlated with amount.
numeric<-data%>%
select(step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest)
corrplot(cor(numeric))
For this analysis, I split the data into two groups: one for training and one held out for testing model accuracy, using a 75/25 ratio. Because the isFraud variable has a class imbalance problem, I use SMOTE() to fix the imbalance first, and then rename the levels of isFraud from 0 and 1 to Not_Fraud and Fraud before splitting.
data<-SMOTE(factor(isFraud)~., data.frame(data),perc.over = 100,perc.under = 200)
## Warning in names(data) == as.character(form[[2]]): longer object length is not a
## multiple of shorter object length
round(prop.table(table(select(data,isFraud),exclude = NULL)),4)*100
##
## 0 1
## 50 50
After SMOTE, the ratio of Fraud to Not_Fraud is roughly 50/50, so the class imbalance problem has been fixed and it is safe to move on to model building. This is what the SMOTE parameters imply: perc.over = 100 generates one synthetic fraud case for each original fraud case (8,213 original plus 8,213 synthetic), and perc.under = 200 keeps twice as many majority cases as synthetic cases were generated, so both classes end up with about 16,426 rows. Since it is required to use one model from weeks 1 to 10 and one from weeks 11 to 15, I use the random forest model and the bagging model for the further analysis.
set.seed(1000)
data$isFraud <-ifelse(data$isFraud==0,'Not_Fraud','Fraud')
data$isFraud<-as.factor(data$isFraud)
levels(data$isFraud)<-c("Fraud","Not_Fraud")
split <- sample(nrow(data), round(nrow(data)*0.75), replace = F)
train <- data[split,]
test <- data[-split,]
round(prop.table(table(select(train, isFraud),exclude = NULL)),4)*100
##
## Fraud Not_Fraud
## 50.02 49.98
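The same check can be run on the holdout set (an added sanity check, mirroring the one above):

# Class proportions in the holdout (test) set, in percent.
round(prop.table(table(select(test, isFraud), exclude = NULL)), 4) * 100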
set.seed(123)
forest <- randomForest(isFraud~ step+type+amount+oldbalanceOrg+newbalanceOrig+oldbalanceDest+newbalanceDest+isFlaggedFraud, data = train, importance=TRUE, ntree=1000)
# display model details
forest
##
## Call:
## randomForest(formula = isFraud ~ step + type + amount + oldbalanceOrg + newbalanceOrig + oldbalanceDest + newbalanceDest + isFlaggedFraud, data = train, importance = TRUE, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 1.14%
## Confusion matrix:
## Fraud Not_Fraud class.error
## Fraud 12246 78 0.006329114
## Not_Fraud 203 12112 0.016483963
plot(forest)
varImpPlot(forest)
forest_pred<-predict(forest, newdata = test, type='class')
forest_cm<-confusionMatrix(forest_pred,test$isFraud)
forest_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Fraud Not_Fraud
## Fraud 4074 68
## Not_Fraud 28 4043
##
## Accuracy : 0.9883
## 95% CI : (0.9857, 0.9905)
## No Information Rate : 0.5005
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9766
##
## Mcnemar's Test P-Value : 6.879e-05
##
## Sensitivity : 0.9932
## Specificity : 0.9835
## Pos Pred Value : 0.9836
## Neg Pred Value : 0.9931
## Prevalence : 0.4995
## Detection Rate : 0.4960
## Detection Prevalence : 0.5043
## Balanced Accuracy : 0.9883
##
## 'Positive' Class : Fraud
##
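The numeric scores behind the varImpPlot() call above can also be inspected directly (a small addition using randomForest::importance(); the MeanDecreaseGini column is available because the model was fit with importance = TRUE):

# Variable importance of the random forest, sorted by MeanDecreaseGini.
imp <- importance(forest)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), ]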
set.seed(1000)
bag<-bagging(isFraud~ step+type+amount+oldbalanceOrg+newbalanceOrig+oldbalanceDest+newbalanceDest+isFlaggedFraud, data = train, nbagg=150,coob=T,control=rpart.control(minsplit = 2, cp = 0))
varImp(bag)
## Overall
## amount 7399.9918
## isFlaggedFraud 952.6310
## newbalanceDest 2515.7845
## newbalanceOrig 506.5196
## oldbalanceDest 2577.1155
## oldbalanceOrg 9763.2609
## step 5081.1263
## type 5437.8756
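For easier comparison with the random forest's importance plot, the bagging model's scores can also be visualized (an optional addition, assuming varImp() returns a plain data frame with an Overall column, as the printed output above suggests):

# Bar chart of the bagging model's variable importance scores.
vi <- varImp(bag)
vi$variable <- rownames(vi)
ggplot(vi, aes(x = reorder(variable, Overall), y = Overall)) +
  geom_col() +
  coord_flip() +
  ggtitle('Bagging Model: Variable Importance') +
  xlab('Variable') + ylab('Importance')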
bag_pred<-predict(bag, newdata = test, type='class')
bag_cm<-confusionMatrix(bag_pred,test$isFraud)
bag_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Fraud Not_Fraud
## Fraud 4081 35
## Not_Fraud 21 4076
##
## Accuracy : 0.9932
## 95% CI : (0.9912, 0.9948)
## No Information Rate : 0.5005
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9864
##
## Mcnemar's Test P-Value : 0.08235
##
## Sensitivity : 0.9949
## Specificity : 0.9915
## Pos Pred Value : 0.9915
## Neg Pred Value : 0.9949
## Prevalence : 0.4995
## Detection Rate : 0.4969
## Detection Prevalence : 0.5012
## Balanced Accuracy : 0.9932
##
## 'Positive' Class : Fraud
##
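To summarize the comparison, the headline metrics of both models can be pulled from the confusion-matrix objects computed above (a small added summary table):

# Side-by-side comparison of the two models on the holdout set.
data.frame(
  Model       = c("Random Forest", "Bagging"),
  Accuracy    = c(forest_cm$overall["Accuracy"], bag_cm$overall["Accuracy"]),
  Sensitivity = c(forest_cm$byClass["Sensitivity"], bag_cm$byClass["Sensitivity"]),
  Specificity = c(forest_cm$byClass["Specificity"], bag_cm$byClass["Specificity"])
)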
In conclusion, the Bootstrap Aggregating model achieves better accuracy than the random forest model, 99.32% versus 98.83%. It is not only more accurate overall but also misclassifies fewer fraudulent transactions as Not_Fraud: as the confusion matrices show, the Bootstrap Aggregating model predicted Fraud as Not_Fraud in 21 cases, while the random forest model did so in 28 cases. It is important for a financial institution to detect fraud cases correctly, since a missed fraud can lead to tremendous losses and put customers' assets at risk. Therefore, I recommend the Bootstrap Aggregating model to financial institutions for detecting fraudulent transactions.