Research Question
#install.packages('ggpubr')
# Loading the Required Libraries
# data.table::update.dev.pkg()
library(data.table)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggpubr)
# Loading the dataset
df <- read.csv("D:/C++/fraud_dataset_example.csv")
# Previewing the top records
head(df)
## step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest
## 1 1 PAYMENT 9839.64 C1231006815 170136 160296.36 M1979787155
## 2 1 PAYMENT 1864.28 C1666544295 21249 19384.72 M2044282225
## 3 1 TRANSFER 181.00 C1305486145 181 0.00 C553264065
## 4 1 CASH_OUT 181.00 C840083671 181 0.00 C38997010
## 5 1 PAYMENT 11668.14 C2048537720 41554 29885.86 M1230701703
## 6 1 PAYMENT 7817.71 C90045638 53860 46042.29 M573487274
## oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 1 0
## 4 21182 0 1 0
## 5 0 0 0 0
## 6 0 0 0 0
summary(df)
## step type amount nameOrig
## Min. : 1.000 Length:101613 Min. : 0 Length:101613
## 1st Qu.: 8.000 Class :character 1st Qu.: 10017 Class :character
## Median : 9.000 Mode :character Median : 53385 Mode :character
## Mean : 8.523 Mean : 174090
## 3rd Qu.:10.000 3rd Qu.: 212498
## Max. :10.000 Max. :10000000
## oldbalanceOrg newbalanceOrig nameDest oldbalanceDest
## Min. : 0 Min. : 0 Length:101613 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 Class :character 1st Qu.: 0
## Median : 20190 Median : 0 Mode :character Median : 21058
## Mean : 907178 Mean : 923501 Mean : 881048
## 3rd Qu.: 194715 3rd Qu.: 219218 3rd Qu.: 591922
## Max. :38900000 Max. :38900000 Max. :34000000
## newbalanceDest isFraud isFlaggedFraud
## Min. : 0 Min. :0.000000 Min. :0
## 1st Qu.: 0 1st Qu.:0.000000 1st Qu.:0
## Median : 51783 Median :0.000000 Median :0
## Mean : 1184036 Mean :0.001142 Mean :0
## 3rd Qu.: 1063122 3rd Qu.:0.000000 3rd Qu.:0
## Max. :38900000 Max. :1.000000 Max. :0
# Checking the data types
str(df)
## 'data.frame': 101613 obs. of 11 variables:
## $ step : int 1 1 1 1 1 1 1 1 1 1 ...
## $ type : chr "PAYMENT" "PAYMENT" "TRANSFER" "CASH_OUT" ...
## $ amount : num 9840 1864 181 181 11668 ...
## $ nameOrig : chr "C1231006815" "C1666544295" "C1305486145" "C840083671" ...
## $ oldbalanceOrg : num 170136 21249 181 181 41554 ...
## $ newbalanceOrig: num 160296 19385 0 0 29886 ...
## $ nameDest : chr "M1979787155" "M2044282225" "C553264065" "C38997010" ...
## $ oldbalanceDest: num 0 0 0 21182 0 ...
## $ newbalanceDest: num 0 0 0 0 0 ...
## $ isFraud : int 0 0 1 1 0 0 0 0 0 0 ...
## $ isFlaggedFraud: int 0 0 0 0 0 0 0 0 0 0 ...
# Checking for missing values
colSums(is.na(df))
## step type amount nameOrig oldbalanceOrg
## 0 0 0 0 0
## newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud
## 0 0 0 0 0
## isFlaggedFraud
## 0
# There are no missing values in our dataset.
# Converting the data to a table
df1 <- data.frame(df)
# Converting column names to lower case for uniformity
colnames(df1) <- tolower(colnames(df1))
# Confirming the changes
head(df1)
## step type amount nameorig oldbalanceorg newbalanceorig namedest
## 1 1 PAYMENT 9839.64 C1231006815 170136 160296.36 M1979787155
## 2 1 PAYMENT 1864.28 C1666544295 21249 19384.72 M2044282225
## 3 1 TRANSFER 181.00 C1305486145 181 0.00 C553264065
## 4 1 CASH_OUT 181.00 C840083671 181 0.00 C38997010
## 5 1 PAYMENT 11668.14 C2048537720 41554 29885.86 M1230701703
## 6 1 PAYMENT 7817.71 C90045638 53860 46042.29 M573487274
## oldbalancedest newbalancedest isfraud isflaggedfraud
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 1 0
## 4 21182 0 1 0
## 5 0 0 0 0
## 6 0 0 0 0
# Checking for outliers
## Identifying the numerical columns
numcols <- subset(df1, select = -c( type, nameorig, namedest,isfraud, isflaggedfraud))
head(numcols)
## step amount oldbalanceorg newbalanceorig oldbalancedest newbalancedest
## 1 1 9839.64 170136 160296.36 0 0
## 2 1 1864.28 21249 19384.72 0 0
## 3 1 181.00 181 0.00 0 0
## 4 1 181.00 181 0.00 21182 0
## 5 1 11668.14 41554 29885.86 0 0
## 6 1 7817.71 53860 46042.29 0 0
# Checking for outliers using boxplot
boxplot(numcols)
The numerical columns in our dataset consist of amount transfered, new balances and old balances. The outliers are expected because we have different people doing transactions from different income levels. Since the outliers also form large part of our dataset, dropping them will leave us with a small amount of data to work with.
describe(numcols)
## vars n mean sd median trimmed mad
## step 1 101613 8.52 1.82 9.00 8.88 1.48
## amount 2 101613 174090.07 345019.90 53385.41 102694.86 74440.34
## oldbalanceorg 3 101613 907177.81 2829599.19 20190.47 178716.07 29934.39
## newbalanceorig 4 101613 923501.45 2867332.58 0.00 181210.43 0.00
## oldbalancedest 5 101613 881047.57 2399986.52 21058.00 300302.15 31220.59
## newbalancedest 6 101613 1184036.37 2797923.75 51783.43 491938.61 76774.11
## min max range skew kurtosis se
## step 1.00 10 9 -2.55 7.40 0.01
## amount 0.32 10000000 10000000 5.43 47.82 1082.35
## oldbalanceorg 0.00 38900000 38900000 5.29 36.91 8876.67
## newbalanceorig 0.00 38900000 38900000 5.22 35.93 8995.05
## oldbalancedest 0.00 34000000 34000000 5.06 32.52 7528.95
## newbalancedest 0.00 38900000 38900000 4.32 23.96 8777.31
The mean time where the transactions occurred is between hour 8 to 9.
The average transaction amount is 174,090.07 while the maximum transacted amount is 10,000,000.
The average old balance in the original account before transaction is 907,175.26 and the maximum old balance amount before transaction is 38,939,424.
The average new balance in the original account after transaction is 923,499.25 and the maximum new balance amount after transaction is 38,946,233.
The average old balance in the destination account before transaction is 881,042.80 and the maximum old balance amount before transaction is 34,008,737.
The average new balance in the destination account after transaction is 1,183,998.10 and the maximum new balance amount after transaction is 38,946,233.
Apart from Step variable, all the other columns have positive skewness.
All our variables are leptokurtic because there kurtosis value is greater than 3. A high kurtosis means a heavy tail which indicates the presence of outliers while a low kurtosis means a light tail hence no outliers. A kurtosis greater than +1 indicates that the distribution is too peaked.(High kurtosis) while a kurtosis lower than -1 indicates a distribution that is too flat. Both of these parameters indicate that the distribution is not normal.
# Class Imbalance using barplot
df1$isfraud <- as.factor(df1$isfraud)
barplot(prop.table(table(df1$isfraud)),
col = rainbow(2),
ylim = c(0, 1),
main = "is fraud Distribution")
Since our prediction is a classification problem, we can use skewness to check for class imbalance in our dependent variable. Our target variable, isfraud, is positively skewed therefore we have class imbalance.
density_plot<-function(data,var,main){
plot(density(data[[var]]),ylab="Distribution of transactions",main=main)
polygon(density(data[[var]]),col = "blue",border="red")
}
density_plot(df,1,"Distribution of steps")
Majority of the transactions are made within the 9th hour. The most active hours are within the 8th to 9th hour.
df1%>%
ggplot() +
geom_bar(aes(fct_infreq(type)), color = "black",fill = "grey") +
coord_flip() +
labs(title = "Distribution of type of payment",
x = "type of payment",
y = "Amount") +
scale_y_continuous(breaks = seq(0,40,5)) +
theme_minimal()
#Create a bar plot
#barplot(df1$isflaggedfraud,
# main = 'is flagged distribution',
#xlab='isflaggedfraud',
#ylab='count'
#)
colnames(df1)
## [1] "step" "type" "amount" "nameorig"
## [5] "oldbalanceorg" "newbalanceorig" "namedest" "oldbalancedest"
## [9] "newbalancedest" "isfraud" "isflaggedfraud"
library(vtree)
isfraud <-df1$isfraud
vtree(isfraud,horiz=FALSE)
Out of 101614 observations made on transactions only 116 are labelled as fraud and this represent a tiny percentage of the entire information.
This can be attributed to the rear cases of fraud in financial institutions which normally happens once while involving a massive amount of money.
isflaggedfraud<-df1$isflaggedfraud
vtree(isflaggedfraud,horiz=FALSE, title="Those flagged")
#install.packages("vtree")
library(vtree)
type_of_payment <-df1$type
vtree(type_of_payment,horiz=FALSE, title="type")
The types of payments made were 101614 cash_in was 20540 representing 20%, cash_out was 31310 representing 31%, all debits were 1012 representing 1% of the entire transactions, all payment were 40062 representing 39% and transferred payments were 8689 which represent 9%.
#
vtree(df1,"isfraud isflaggedfraud", title="Comparison between fraud and flagged fraud",horiz=FALSE)
It is clear that 116 fraud transactions was not flagged as fraud and this should be considered for further investigations.
vtree(df1,"isfraud type", title="Type of payments which are fraud or not ",
labelnode=list(isfraud=c(not_fraud ="0",fraud ="1")),horiz=FALSE)
A total of 116 fraud transactions cases are happening during cash outs at 51% which represent 59 cases where there is a direct cash payment and also during transfer at 49% representing 57 cases at this point it is where money are transferred from illegitimate account to a legitimate one before exchanging hands.
vtree(df1,"isflaggedfraud type", title="Type of payments which are flagged fraud or not ",
labelnode=list(isflaggedfraud=c(not_flagged ="0",flagged ="1")),horiz=FALSE)
There were no type of payments that was flagged as an attempt of fraud transactions
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(tidyverse)
p1<- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(oldbalanceorg), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'Old Balance in Sender Accounts' , x = 'isFraud', y='Balance Amount') + theme_classic()
p2 <- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(oldbalancedest), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'Old balance in Receiver Accounts' , x = 'isFraud',y='Balance Amount') + theme_classic()
grid.arrange(p1, p2, nrow = 1)
In the majority of fraud transactions, the Old balance of the Origin account that is where the payments are made, is higher than rest of the origin accounts while the Old balance in Destination accounts is Lower than rest.
plot1<- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(newbalanceorig), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'New Balance in Sender Accounts' , x = 'isFraud', y='Balance Amount') + theme_classic()
plot2 <- ggplot(data = df1, aes(x = factor(isfraud) ,y = log1p(newbalancedest), fill = factor(isfraud))) + geom_boxplot(show.legend = FALSE) +labs(title= 'New balance in Receiver Accounts' , x = 'isFraud',y='Balance Amount') + theme_classic()
grid.arrange(plot1, plot2, nrow = 1)
The new balance of the senders account reads zero for those accounts which are involve in fraudulent transactions while the recipient account new balance have a huge amount. This clearly indicates fraudsters empty their accounts.