Housekeeping

rm(list=ls()) # clear workspace
cat("\014")  # clear console
graphics.off() # shuts down all open graphics devices 

Import dataset

financialfraud <- read.csv("/Users/thuhuong/Desktop/R Working Directory/FinancialFraud.csv")
head(financialfraud)
##   step     type   amount    nameOrig oldbalanceOrg newbalanceOrig    nameDest
## 1    1  PAYMENT  9839.64 C1231006815        170136      160296.36 M1979787155
## 2    1  PAYMENT  1864.28 C1666544295         21249       19384.72 M2044282225
## 3    1 TRANSFER   181.00 C1305486145           181           0.00  C553264065
## 4    1 CASH_OUT   181.00  C840083671           181           0.00   C38997010
## 5    1  PAYMENT 11668.14 C2048537720         41554       29885.86 M1230701703
## 6    1  PAYMENT  7817.71   C90045638         53860       46042.29  M573487274
##   oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1              0              0       0              0
## 2              0              0       0              0
## 3              0              0       1              0
## 4          21182              0       1              0
## 5              0              0       0              0
## 6              0              0       0              0

EDA

dim(financialfraud) #dimension of the dataset
## [1] 6362620      11
summary(financialfraud) #summary of the dataset
##       step           type               amount           nameOrig        
##  Min.   :  1.0   Length:6362620     Min.   :       0   Length:6362620    
##  1st Qu.:156.0   Class :character   1st Qu.:   13390   Class :character  
##  Median :239.0   Mode  :character   Median :   74872   Mode  :character  
##  Mean   :243.4                      Mean   :  179862                     
##  3rd Qu.:335.0                      3rd Qu.:  208721                     
##  Max.   :743.0                      Max.   :92445517                     
##  oldbalanceOrg      newbalanceOrig       nameDest         oldbalanceDest     
##  Min.   :       0   Min.   :       0   Length:6362620     Min.   :        0  
##  1st Qu.:       0   1st Qu.:       0   Class :character   1st Qu.:        0  
##  Median :   14208   Median :       0   Mode  :character   Median :   132706  
##  Mean   :  833883   Mean   :  855114                      Mean   :  1100702  
##  3rd Qu.:  107315   3rd Qu.:  144258                      3rd Qu.:   943037  
##  Max.   :59585040   Max.   :49585040                      Max.   :356015889  
##  newbalanceDest         isFraud         isFlaggedFraud   
##  Min.   :        0   Min.   :0.000000   Min.   :0.0e+00  
##  1st Qu.:        0   1st Qu.:0.000000   1st Qu.:0.0e+00  
##  Median :   214661   Median :0.000000   Median :0.0e+00  
##  Mean   :  1224996   Mean   :0.001291   Mean   :2.5e-06  
##  3rd Qu.:  1111909   3rd Qu.:0.000000   3rd Qu.:0.0e+00  
##  Max.   :356179279   Max.   :1.000000   Max.   :1.0e+00
sum(is.na(financialfraud)) #check null values in the dataset
## [1] 0
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
financialfraud %>% count(isFraud) #number of fraud vs. non-fraud transactions
##   isFraud       n
## 1       0 6354407
## 2       1    8213
library(ggplot2)
g <- ggplot(financialfraud, aes(isFraud))
g + geom_bar(fill="lightblue") + geom_label(stat='count', aes(label = paste0(round(((..count..) /sum(..count..)), 4)*100, "%"))) + labs(x = "Fraud vs Not Fraud", y = "Frequency", title = "Frequency of Fraud", subtitle = "Labels as Percent of Total Observations") + scale_y_continuous(labels = scales::comma)

fraud_trans_type <- financialfraud %>% group_by(type) %>% summarize(fraud_count = sum(isFraud)) #type of fraud transactions
ggplot(fraud_trans_type, aes(x=type, y=fraud_count)) + geom_col(fill="lightblue") + labs(title = 'Fraud transactions per Type', x = 'Transaction Type', y = 'Number of Fraud Transactions') + geom_label(aes(label=fraud_count)) + theme_classic()

ggplot(data=financialfraud[financialfraud$isFraud==1,], aes(x=amount)) + geom_histogram(fill="lightblue") + labs(title = 'Fraud transactions Amount distribution', x = 'Amount', y = 'Number of Fraud Transactions') + theme_classic() + scale_x_continuous(labels = scales::comma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

modulo <- function(y, x) {
  q <- y/x
  q <- floor(q)
  p <- q*x
  mod <- y-p
  return(mod)
}
financialfraud$hour <- modulo(y=financialfraud$step,x=24)
fraud_hour <- financialfraud %>%
group_by(hour) %>%
summarise(cnt = n(),sum=sum(isFraud)) %>%
mutate(fraud_percentage = round((sum/cnt)*100,2))
ggplot(fraud_hour, aes(x=hour,y=fraud_percentage)) + geom_col(fill="lightblue") + labs(title = 'Fraud Percentage per Hour', x = 'Hour', y = 'Percentage of Fraud Transactions') + theme_classic()

Create train and test set

financialfraud<- financialfraud %>% 
                  select( -one_of('step','nameOrig', 'nameDest', 'isFlaggedFraud'))
library(caret)
## Loading required package: lattice
set.seed(1234)
splitindex <- createDataPartition(financialfraud$isFraud, p=0.7, list=FALSE, times=1)
train <- financialfraud[splitindex,]
table(train$isFraud)
## 
##       0       1 
## 4448109    5725
test <- financialfraud[-splitindex,]
table(test$isFraud)
## 
##       0       1 
## 1906298    2488
# Use under-sampling majority class method for inbalanced dataset
library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
## 
##     train
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
inputs <- train[,-which(names(train) %in% "isFraud")]
target <- as.factor(train[,which(names(train) %in% "isFraud")])
under_sam <- ubUnder(X = inputs, Y = target)
train <- cbind(under_sam$X, under_sam$Y)
train$isFraud <- train$`under_sam$Y`
train$`under_sam$Y` <- NULL
table(train$isFraud)
## 
##    0    1 
## 5725 5725

A.Logistic Regression

logistic <- glm(isFraud~., data=train, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic)
## 
## Call:
## glm(formula = isFraud ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.2350   0.0000   0.0000   0.0313   8.4904  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -7.038e+01  2.571e+02  -0.274    0.784    
## typeCASH_OUT    7.162e+01  2.571e+02   0.279    0.781    
## typeDEBIT       5.021e+01  5.038e+03   0.010    0.992    
## typePAYMENT     5.026e+01  6.888e+02   0.073    0.942    
## typeTRANSFER    7.323e+01  2.571e+02   0.285    0.776    
## amount         -3.484e-06  6.900e-07  -5.049 4.43e-07 ***
## oldbalanceOrg   2.366e-05  7.402e-07  31.964  < 2e-16 ***
## newbalanceOrig -2.162e-05  7.859e-07 -27.514  < 2e-16 ***
## oldbalanceDest  8.245e-06  6.782e-07  12.158  < 2e-16 ***
## newbalanceDest -8.277e-06  6.781e-07 -12.207  < 2e-16 ***
## hour           -1.245e-01  8.187e-03 -15.212  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15873.1  on 11449  degrees of freedom
## Residual deviance:  3236.9  on 11439  degrees of freedom
## AIC: 3258.9
## 
## Number of Fisher Scoring iterations: 20
logisticprediction <- predict(logistic, test, type="response")
logisticprediction <- ifelse(logisticprediction>0.5, 1, 0)
table(logisticprediction)
## logisticprediction
##       0       1 
## 1811902   96884
table(test$isFraud)
## 
##       0       1 
## 1906298    2488
confusionMatrix(as.factor(logisticprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction       0       1
##          0 1811765     137
##          1   94533    2351
##                                           
##                Accuracy : 0.9504          
##                  95% CI : (0.9501, 0.9507)
##     No Information Rate : 0.9987          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0449          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.95041         
##             Specificity : 0.94494         
##          Pos Pred Value : 0.99992         
##          Neg Pred Value : 0.02427         
##              Prevalence : 0.99870         
##          Detection Rate : 0.94917         
##    Detection Prevalence : 0.94924         
##       Balanced Accuracy : 0.94767         
##                                           
##        'Positive' Class : 0               
## 
library("pROC")
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
auc1 <- roc(test$isFraud, logisticprediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc1
## 
## Call:
## roc.default(response = test$isFraud, predictor = logisticprediction)
## 
## Data: logisticprediction in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9477

B.Decision Tree

library(rpart)
library(rpart.plot)
tree <- rpart(isFraud ~ ., data = train)
prp(tree)

treeprediction <- predict(tree, test, type = "class")
table(treeprediction)
## treeprediction
##       0       1 
## 1836964   71822
table(test$isFraud)
## 
##       0       1 
## 1906298    2488
confusionMatrix(as.factor(treeprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction       0       1
##          0 1836946      18
##          1   69352    2470
##                                           
##                Accuracy : 0.9637          
##                  95% CI : (0.9634, 0.9639)
##     No Information Rate : 0.9987          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0641          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.96362         
##             Specificity : 0.99277         
##          Pos Pred Value : 0.99999         
##          Neg Pred Value : 0.03439         
##              Prevalence : 0.99870         
##          Detection Rate : 0.96236         
##    Detection Prevalence : 0.96237         
##       Balanced Accuracy : 0.97819         
##                                           
##        'Positive' Class : 0               
## 
auc2 <- roc(test$isFraud, as.numeric(treeprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc2
## 
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(treeprediction))
## 
## Data: as.numeric(treeprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9782

C.Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf <- randomForest(isFraud ~ ., data = train, importance = TRUE)
rfprediction <- predict(rf, test)
table(rfprediction)
## rfprediction
##       0       1 
## 1878467   30319
confusionMatrix(as.factor(rfprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction       0       1
##          0 1878454      13
##          1   27844    2475
##                                           
##                Accuracy : 0.9854          
##                  95% CI : (0.9852, 0.9856)
##     No Information Rate : 0.9987          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1488          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98539         
##             Specificity : 0.99477         
##          Pos Pred Value : 0.99999         
##          Neg Pred Value : 0.08163         
##              Prevalence : 0.99870         
##          Detection Rate : 0.98411         
##    Detection Prevalence : 0.98412         
##       Balanced Accuracy : 0.99008         
##                                           
##        'Positive' Class : 0               
## 
auc3 <- roc(test$isFraud, as.numeric(rfprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc3
## 
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(rfprediction))
## 
## Data: as.numeric(rfprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9901

D.XGBoost

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
train_x = data.matrix(train[,-which(names(train) %in% "isFraud")])
train_y = as.numeric(levels(train$isFraud))[train$isFraud]
test_x = data.matrix(test[,-which(names(test) %in% "isFraud")])
test$isFraud <- as.factor(test$isFraud)
test_y = as.numeric(levels(test$isFraud))[test$isFraud]
xgb_train = xgb.DMatrix(data=train_x, label=train_y)
xgb_test = xgb.DMatrix(data=test_x, label=test_y)
xgmodel <- xgboost(data=xgb_train, nrounds=10000, max_depth=3, early_stopping_rounds=50, objective="binary:logistic", verbose=0)
## [11:12:29] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgmodel$best_iteration #tune for best nround
## [1] 5422
xgmodel <- xgboost(data=xgb_train, nrounds=2290, max_depth=3, early_stopping_rounds=50, objective="binary:logistic", verbose=0)
## [11:12:54] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgboostprediction <- predict(xgmodel, xgb_test)
xgboostprediction <- ifelse(xgboostprediction>0.5, 1, 0)
table(xgboostprediction)
## xgboostprediction
##       0       1 
## 1887669   21117
confusionMatrix(as.factor(xgboostprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction       0       1
##          0 1887662       7
##          1   18636    2481
##                                           
##                Accuracy : 0.9902          
##                  95% CI : (0.9901, 0.9904)
##     No Information Rate : 0.9987          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2084          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9902          
##             Specificity : 0.9972          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.1175          
##              Prevalence : 0.9987          
##          Detection Rate : 0.9889          
##    Detection Prevalence : 0.9889          
##       Balanced Accuracy : 0.9937          
##                                           
##        'Positive' Class : 0               
## 
auc4 <- roc(test$isFraud, as.numeric(xgboostprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc4
## 
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(xgboostprediction))
## 
## Data: as.numeric(xgboostprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9937

Feature Importance in XGBoost

importance <- xgb.importance(model=xgmodel)
importance
##           Feature       Gain      Cover  Frequency
## 1:  oldbalanceOrg 0.39580645 0.31005022 0.24926448
## 2: newbalanceOrig 0.35113572 0.10976239 0.01298570
## 3:         amount 0.11870235 0.24924811 0.27026479
## 4:           type 0.05981641 0.06426301 0.02089885
## 5:           hour 0.04205064 0.06551954 0.05610226
## 6: newbalanceDest 0.02682679 0.11797130 0.19793040
## 7: oldbalanceDest 0.00566164 0.08318542 0.19255352
xgb.plot.importance(importance_matrix = importance, xlab = "Relative importance based on Gain")