Research Question

Reading the Dataset

#install.packages('ggpubr')
# Loading the Required Libraries

# data.table::update.dev.pkg()
library(data.table)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggpubr)
# Loading the dataset
df <- read.csv("D:/C++/fraud_dataset_example.csv")
# Previewing the top records
head(df)
##   step     type   amount    nameOrig oldbalanceOrg newbalanceOrig    nameDest
## 1    1  PAYMENT  9839.64 C1231006815        170136      160296.36 M1979787155
## 2    1  PAYMENT  1864.28 C1666544295         21249       19384.72 M2044282225
## 3    1 TRANSFER   181.00 C1305486145           181           0.00  C553264065
## 4    1 CASH_OUT   181.00  C840083671           181           0.00   C38997010
## 5    1  PAYMENT 11668.14 C2048537720         41554       29885.86 M1230701703
## 6    1  PAYMENT  7817.71   C90045638         53860       46042.29  M573487274
##   oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1              0              0       0              0
## 2              0              0       0              0
## 3              0              0       1              0
## 4          21182              0       1              0
## 5              0              0       0              0
## 6              0              0       0              0
summary(df)
##       step            type               amount           nameOrig        
##  Min.   : 1.000   Length:101613      Min.   :       0   Length:101613     
##  1st Qu.: 8.000   Class :character   1st Qu.:   10017   Class :character  
##  Median : 9.000   Mode  :character   Median :   53385   Mode  :character  
##  Mean   : 8.523                      Mean   :  174090                     
##  3rd Qu.:10.000                      3rd Qu.:  212498                     
##  Max.   :10.000                      Max.   :10000000                     
##  oldbalanceOrg      newbalanceOrig       nameDest         oldbalanceDest    
##  Min.   :       0   Min.   :       0   Length:101613      Min.   :       0  
##  1st Qu.:       0   1st Qu.:       0   Class :character   1st Qu.:       0  
##  Median :   20190   Median :       0   Mode  :character   Median :   21058  
##  Mean   :  907178   Mean   :  923501                      Mean   :  881048  
##  3rd Qu.:  194715   3rd Qu.:  219218                      3rd Qu.:  591922  
##  Max.   :38900000   Max.   :38900000                      Max.   :34000000  
##  newbalanceDest        isFraud         isFlaggedFraud
##  Min.   :       0   Min.   :0.000000   Min.   :0     
##  1st Qu.:       0   1st Qu.:0.000000   1st Qu.:0     
##  Median :   51783   Median :0.000000   Median :0     
##  Mean   : 1184036   Mean   :0.001142   Mean   :0     
##  3rd Qu.: 1063122   3rd Qu.:0.000000   3rd Qu.:0     
##  Max.   :38900000   Max.   :1.000000   Max.   :0
# Checking the data types 
str(df)
## 'data.frame':    101613 obs. of  11 variables:
##  $ step          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ type          : chr  "PAYMENT" "PAYMENT" "TRANSFER" "CASH_OUT" ...
##  $ amount        : num  9840 1864 181 181 11668 ...
##  $ nameOrig      : chr  "C1231006815" "C1666544295" "C1305486145" "C840083671" ...
##  $ oldbalanceOrg : num  170136 21249 181 181 41554 ...
##  $ newbalanceOrig: num  160296 19385 0 0 29886 ...
##  $ nameDest      : chr  "M1979787155" "M2044282225" "C553264065" "C38997010" ...
##  $ oldbalanceDest: num  0 0 0 21182 0 ...
##  $ newbalanceDest: num  0 0 0 0 0 ...
##  $ isFraud       : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ isFlaggedFraud: int  0 0 0 0 0 0 0 0 0 0 ...

Tidying the Dataset

# Checking for missing values 
colSums(is.na(df))
##           step           type         amount       nameOrig  oldbalanceOrg 
##              0              0              0              0              0 
## newbalanceOrig       nameDest oldbalanceDest newbalanceDest        isFraud 
##              0              0              0              0              0 
## isFlaggedFraud 
##              0
# There are no missing values in our dataset.
# Converting the data to a table

df1 <- data.frame(df)
# Converting column names to lower case for uniformity 

colnames(df1) <- tolower(colnames(df1))

# Confirming the changes 
head(df1)
##   step     type   amount    nameorig oldbalanceorg newbalanceorig    namedest
## 1    1  PAYMENT  9839.64 C1231006815        170136      160296.36 M1979787155
## 2    1  PAYMENT  1864.28 C1666544295         21249       19384.72 M2044282225
## 3    1 TRANSFER   181.00 C1305486145           181           0.00  C553264065
## 4    1 CASH_OUT   181.00  C840083671           181           0.00   C38997010
## 5    1  PAYMENT 11668.14 C2048537720         41554       29885.86 M1230701703
## 6    1  PAYMENT  7817.71   C90045638         53860       46042.29  M573487274
##   oldbalancedest newbalancedest isfraud isflaggedfraud
## 1              0              0       0              0
## 2              0              0       0              0
## 3              0              0       1              0
## 4          21182              0       1              0
## 5              0              0       0              0
## 6              0              0       0              0
# Checking for outliers 
## Identifying the numerical columns

numcols <- subset(df1, select = -c( type, nameorig, namedest,isfraud, isflaggedfraud))

head(numcols)
##   step   amount oldbalanceorg newbalanceorig oldbalancedest newbalancedest
## 1    1  9839.64        170136      160296.36              0              0
## 2    1  1864.28         21249       19384.72              0              0
## 3    1   181.00           181           0.00              0              0
## 4    1   181.00           181           0.00          21182              0
## 5    1 11668.14         41554       29885.86              0              0
## 6    1  7817.71         53860       46042.29              0              0
# Checking for outliers using boxplot
boxplot(numcols)

The numerical columns in our dataset consist of amount transfered, new balances and old balances. The outliers are expected because we have different people doing transactions from different income levels. Since the outliers also form large part of our dataset, dropping them will leave us with a small amount of data to work with.

Univariate Analysis

Central Tendecies

describe(numcols)
##                vars      n       mean         sd   median   trimmed      mad
## step              1 101613       8.52       1.82     9.00      8.88     1.48
## amount            2 101613  174090.07  345019.90 53385.41 102694.86 74440.34
## oldbalanceorg     3 101613  907177.81 2829599.19 20190.47 178716.07 29934.39
## newbalanceorig    4 101613  923501.45 2867332.58     0.00 181210.43     0.00
## oldbalancedest    5 101613  881047.57 2399986.52 21058.00 300302.15 31220.59
## newbalancedest    6 101613 1184036.37 2797923.75 51783.43 491938.61 76774.11
##                 min      max    range  skew kurtosis      se
## step           1.00       10        9 -2.55     7.40    0.01
## amount         0.32 10000000 10000000  5.43    47.82 1082.35
## oldbalanceorg  0.00 38900000 38900000  5.29    36.91 8876.67
## newbalanceorig 0.00 38900000 38900000  5.22    35.93 8995.05
## oldbalancedest 0.00 34000000 34000000  5.06    32.52 7528.95
## newbalancedest 0.00 38900000 38900000  4.32    23.96 8777.31

The mean time where the transactions occurred is between hour 8 to 9.

The average transaction amount is 174,090.07 while the maximum transacted amount is 10,000,000.

The average old balance in the original account before transaction is 907,175.26 and the maximum old balance amount before transaction is 38,939,424.

The average new balance in the original account after transaction is 923,499.25 and the maximum new balance amount after transaction is 38,946,233.

The average old balance in the destination account before transaction is 881,042.80 and the maximum old balance amount before transaction is 34,008,737.

The average new balance in the destination account after transaction is 1,183,998.10 and the maximum new balance amount after transaction is 38,946,233.

Apart from Step variable, all the other columns have positive skewness.

All our variables are leptokurtic because there kurtosis value is greater than 3. A high kurtosis means a heavy tail which indicates the presence of outliers while a low kurtosis means a light tail hence no outliers. A kurtosis greater than +1 indicates that the distribution is too peaked.(High kurtosis) while a kurtosis lower than -1 indicates a distribution that is too flat. Both of these parameters indicate that the distribution is not normal.

# Class Imbalance using barplot
df1$isfraud <- as.factor(df1$isfraud)

barplot(prop.table(table(df1$isfraud)),
        col = rainbow(2),
        ylim = c(0, 1),
        main = "is fraud Distribution")

Since our prediction is a classification problem, we can use skewness to check for class imbalance in our dependent variable. Our target variable, isfraud, is positively skewed therefore we have class imbalance.

Graphical Univariate Analysis

density_plot<-function(data,var,main){
  plot(density(data[[var]]),ylab="Distribution of transactions",main=main)
  polygon(density(data[[var]]),col = "blue",border="red")
}
density_plot(df,1,"Distribution of steps")

Majority of the transactions are made within the 9th hour. The most active hours are within the 8th to 9th hour.

df1%>%
    ggplot() +
    geom_bar(aes(fct_infreq(type)), color = "black",fill = "grey") +
    coord_flip() +
    labs(title = "Distribution of type of payment",
         x = "type of payment",
         y = "Amount") +
    scale_y_continuous(breaks = seq(0,40,5)) +
    theme_minimal() 

#Create a bar plot
#barplot(df1$isflaggedfraud,
       # main = 'is flagged distribution',
        #xlab='isflaggedfraud',
        #ylab='count'
#)
colnames(df1)
##  [1] "step"           "type"           "amount"         "nameorig"      
##  [5] "oldbalanceorg"  "newbalanceorig" "namedest"       "oldbalancedest"
##  [9] "newbalancedest" "isfraud"        "isflaggedfraud"
library(vtree)
isfraud <-df1$isfraud
vtree(isfraud,horiz=FALSE)

Out of 101614 observations made on transactions only 116 are labelled as fraud and this represent a tiny percentage of the entire information.

This can be attributed to the rear cases of fraud in financial institutions which normally happens once while involving a massive amount of money.

isflaggedfraud<-df1$isflaggedfraud
vtree(isflaggedfraud,horiz=FALSE, title="Those flagged")

#install.packages("vtree")
library(vtree)
type_of_payment <-df1$type

vtree(type_of_payment,horiz=FALSE, title="type")

The types of payments made were 101614 cash_in was 20540 representing 20%, cash_out was 31310 representing 31%, all debits were 1012 representing 1% of the entire transactions, all payment were 40062 representing 39% and transferred payments were 8689 which represent 9%.

Bivariate

#
vtree(df1,"isfraud isflaggedfraud", title="Comparison between fraud and flagged fraud",horiz=FALSE)

It is clear that 116 fraud transactions was not flagged as fraud and this should be considered for further investigations.

vtree(df1,"isfraud type", title="Type of payments which are fraud or not ",
      labelnode=list(isfraud=c(not_fraud ="0",fraud ="1")),horiz=FALSE)

A total of 116 fraud transactions cases are happening during cash outs at 51% which represent 59 cases where there is a direct cash payment and also during transfer at 49% representing 57 cases at this point it is where money are transferred from illegitimate account to a legitimate one before exchanging hands.

vtree(df1,"isflaggedfraud type", title="Type of payments which are flagged fraud or not ",
      labelnode=list(isflaggedfraud=c(not_flagged ="0",flagged ="1")),horiz=FALSE)

There were no type of payments that was flagged as an attempt of fraud transactions

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
library(tidyverse)
p1<- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(oldbalanceorg), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'Old Balance in Sender Accounts' , x = 'isFraud', y='Balance Amount') +  theme_classic()

p2 <- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(oldbalancedest), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'Old balance in Receiver Accounts' , x = 'isFraud',y='Balance Amount') +  theme_classic()

grid.arrange(p1, p2, nrow = 1)

In the majority of fraud transactions, the Old balance of the Origin account that is where the payments are made, is higher than rest of the origin accounts while the Old balance in Destination accounts is Lower than rest.

plot1<- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(newbalanceorig), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'New Balance in Sender Accounts' , x = 'isFraud', y='Balance Amount') +  theme_classic()

plot2 <- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(newbalancedest), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'New balance in Receiver Accounts' , x = 'isFraud',y='Balance Amount') +  theme_classic()

grid.arrange(plot1, plot2, nrow = 1)

The new balance of the senders account reads zero for those accounts which are involve in fraudulent transactions while the recipient account new balance have a huge amount. This clearly indicates fraudsters empty their accounts.