Loading The Packages

We are going to load the packages we need to use in this Analyses

library(pacman)
p_load(tidyverse,lubridate,highcharter,stringr,xts,formattable)

Loading The Dataset

Let’s load the data and take a look at it

data <- read.csv("bank_transactions.csv", check.names = TRUE)
head(data)

##   TransactionID CustomerID CustomerDOB CustGender CustLocation
## 1            T1   C5841053     10/1/94          F   JAMSHEDPUR
## 2            T2   C2142763      4/4/57          M      JHAJJAR
## 3            T3   C4417068    26/11/96          F       MUMBAI
## 4            T4   C5342380     14/9/73          F       MUMBAI
## 5            T5   C9031234     24/3/88          F  NAVI MUMBAI
## 6            T6   C1536588     8/10/72          F     ITANAGAR
##   CustAccountBalance TransactionDate TransactionTime TransactionAmount..INR.
## 1           17819.05          2/8/16          143207                    25.0
## 2            2270.69          2/8/16          141858                 27999.0
## 3           17874.44          2/8/16          142712                   459.0
## 4          866503.21          2/8/16          142714                  2060.0
## 5            6714.43          2/8/16          181156                  1762.5
## 6           53609.20          2/8/16          173940                   676.0

Let’s see what is the structure of the data

str(data)

## 'data.frame':    1048567 obs. of  9 variables:
##  $ TransactionID          : chr  "T1" "T2" "T3" "T4" ...
##  $ CustomerID             : chr  "C5841053" "C2142763" "C4417068" "C5342380" ...
##  $ CustomerDOB            : chr  "10/1/94" "4/4/57" "26/11/96" "14/9/73" ...
##  $ CustGender             : chr  "F" "M" "F" "F" ...
##  $ CustLocation           : chr  "JAMSHEDPUR" "JHAJJAR" "MUMBAI" "MUMBAI" ...
##  $ CustAccountBalance     : num  17819 2271 17874 866503 6714 ...
##  $ TransactionDate        : chr  "2/8/16" "2/8/16" "2/8/16" "2/8/16" ...
##  $ TransactionTime        : int  143207 141858 142712 142714 181156 173940 173806 170537 192825 192446 ...
##  $ TransactionAmount..INR.: num  25 27999 459 2060 1762 ...

summary(data)

##  TransactionID       CustomerID        CustomerDOB         CustGender       
##  Length:1048567     Length:1048567     Length:1048567     Length:1048567    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  CustLocation       CustAccountBalance  TransactionDate    TransactionTime 
##  Length:1048567     Min.   :        0   Length:1048567     Min.   :     0  
##  Class :character   1st Qu.:     4722   Class :character   1st Qu.:124030  
##  Mode  :character   Median :    16792   Mode  :character   Median :164226  
##                     Mean   :   115404                      Mean   :157088  
##                     3rd Qu.:    57657                      3rd Qu.:200010  
##                     Max.   :115035495                      Max.   :235959  
##                     NA's   :2369                                           
##  TransactionAmount..INR.
##  Min.   :      0        
##  1st Qu.:    161        
##  Median :    459        
##  Mean   :   1574        
##  3rd Qu.:   1200        
##  Max.   :1560035        
##

Exploratory Data Analyses (EDA)

Duplicate Records

First, we need to check if there are any duplicates

nrow(data)

## [1] 1048567

nrow(data %>% distinct())

## [1] 1048567

We can see that there are no duplicate records in this data.

NA Analyses (Missing values Analyses)

Let’s see if there are any NA values and in which columns

data[data == ""]<- NA
data[data == "nan"]<- NA

data %>% summarise_all(funs(sum(is.na(.)))) %>% 
  gather() %>% 
  filter(value>0) %>% 
  arrange(-value) %>% 
  hchart("column", hcaes(x = "key", y = "value"),dataLabels = list(enabled = TRUE)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Missing values in the dataset per column")

Some people may not know why this column “CustAccountBalance” could have NA values, but in financial accounting it refers to zero balances as banking systems could resulted zero balances to NA values when exporting. so it’s not an issue for us. we could leave it NA or replace them with zeros.

data$CustAccountBalance[is.na(data$CustAccountBalance)] <- 0

And now we have zero balances for all NA values.

Let’s see now the remaining of NA values in the data

data %>% summarise_all(funs(sum(is.na(.)))) %>% 
  gather() %>% 
  filter(value>0) %>% 
  arrange(-value) %>% 
  hchart("column", hcaes(x = "key", y = "value"),dataLabels = list(enabled = TRUE)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Missing Values Per Column") %>% 
  hc_subtitle(text = "(After dealing with Null Balances)")

The remaining NA Values here are only in three columns “CustomerDOB”, “CustGender” and “CustLocation”, and it’s a suspicious thing that we have also null values in Date of Birth data for the customers. We are going to handle them separately during our analyses.

DOB Analyses

First, let’s have a look at Customer DOBs

data %>% group_by(CustomerDOB) %>% count() %>% arrange(desc(n)) %>% drop_na()

## # A tibble: 17,254 × 2
## # Groups:   CustomerDOB [17,254]
##    CustomerDOB     n
##    <chr>       <int>
##  1 1/1/1800    57339
##  2 1/1/89        809
##  3 1/1/90        784
##  4 6/8/91        699
##  5 1/1/91        666
##  6 1/1/92        633
##  7 1/7/90        619
##  8 21/11/86      602
##  9 1/1/87        601
## 10 1/1/88        592
## # … with 17,244 more rows

We notice that there are a lot of DOBs has year 1800 and that is suspicious, maybe it has null values but the system has filled it automatically that way. We are going to ignore them all. Also we noticed that there are “NA” values, we are going to ignore them too.

data %>% group_by(CustomerDOB) %>% 
  filter(CustomerDOB != "1/1/1800") %>% 
  filter(CustomerDOB != "nan") %>% 
  count() %>% arrange(desc(n)) %>% drop_na()

## # A tibble: 17,253 × 2
## # Groups:   CustomerDOB [17,253]
##    CustomerDOB     n
##    <chr>       <int>
##  1 1/1/89        809
##  2 1/1/90        784
##  3 6/8/91        699
##  4 1/1/91        666
##  5 1/1/92        633
##  6 1/7/90        619
##  7 21/11/86      602
##  8 1/1/87        601
##  9 1/1/88        592
## 10 1/1/93        527
## # … with 17,243 more rows

Now let extract the year number from this values

data_clean <- data

data_clean[c("DOB_Day","DOB_Month","DOB_Year")] <- str_split_fixed(data_clean$CustomerDOB, "/", 3)

data_clean %>% 
  filter(DOB_Year != "1800") %>% 
  filter(DOB_Year != "") %>% 
  group_by(DOB_Year) %>% 
  count() %>% 
  arrange(DOB_Year) %>% 
  hchart("column", hcaes(x = DOB_Year, y = n)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Number of Customers Per Year of Birth")

As we see, the most customers were born between 1980 and 1994. the highest are in 1990.

Transactions Analyses

Let’s discover how much transactions as per time series and per month too.

data_clean$TransactionDate_Formatted <- format(as.Date(data_clean$TransactionDate, "%d/%m/%y"), "%Y-%m-%d")

tmp <- data_clean %>% count(TransactionDate_Formatted)

tmp$TransactionDate_Formatted <- ymd(tmp$TransactionDate_Formatted)

time_series <- xts(tmp$n, order.by = tmp$TransactionDate_Formatted)

highchart(type = "stock") %>% 
  hc_add_series(time_series) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Number of Transactions On Daily Basis")

data_clean %>%
  mutate(Trans_Month = month(TransactionDate_Formatted)) %>% 
  group_by(Trans_Month) %>% 
  count() %>% 
  hchart("bar", hcaes(x = Trans_Month, y = n),dataLabels = list(enabled = TRUE)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Numbere Of Transactions Per Month")

During August has been made 653,933 transactions, September 390,975 transactinos, and October only 3,659 tranasctions.

Now let’s take a look at the amounts of these transactions

data_clean %>%
  group_by(Trans_Month = month(TransactionDate)) %>% 
  summarise(Total_Amounts_INR = accounting(round(sum(TransactionAmount..INR.)/1000000,2))) %>% 
  hchart("bar", hcaes(x = Trans_Month, y = Total_Amounts_INR),dataLabels = list(enabled = TRUE)) %>%
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Total Amounts of Transactions Per Month") %>% 
  hc_subtitle(text = "Amounts in Millions INR")

As we have resulted Number of transactions, also the same results for transactions amounts, still August taking the lead with total amount of INR 1,030.60 Million, and September with Total of INR 612.53 Million.

Gender Types

Let’s see which gender has been doing the most transacitons.

data_clean %>% group_by(CustGender) %>% count()

## # A tibble: 4 × 2
## # Groups:   CustGender [4]
##   CustGender      n
##   <chr>       <int>
## 1 F          281936
## 2 M          765530
## 3 T               1
## 4 <NA>         1100

data_clean %>% 
  group_by(CustGender) %>% 
  count() %>% 
  drop_na() %>%
  arrange(-n) %>% 
  hchart("bar", hcaes(x = CustGender, y = n), dataLabels = list(enabled = TRUE)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Number Of Transactions Per Customer Gender")

We don’t know what is the value “T” in “CustGender” column, I believe it’s by mistake, anyway let’s figure out what is its record.

data_clean %>% filter(CustGender == "T")

##   TransactionID CustomerID CustomerDOB CustGender CustLocation
## 1       T959988   C3380368    1/1/1800          T      CHENNAI
##   CustAccountBalance TransactionDate TransactionTime TransactionAmount..INR.
## 1              50050         10/9/16          170731                   32500
##   DOB_Day DOB_Month DOB_Year TransactionDate_Formatted
## 1       1         1     1800                2016-09-10

also it has a wrong Customere DOB value :)

Locations Analysis

Let’s see how transactions distributed over Indian cities.

data_clean %>% 
  group_by(CustLocation) %>% 
  count() %>% 
  drop_na() %>% 
  arrange(-n) %>% 
  filter(n > 5000) %>% 
  hchart("bar", hcaes(x = CustLocation, y = n)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Number Of Transactions Per City") %>% 
  hc_subtitle(text = "Only Cities With Over 5,000 Transactions")

data_clean %>% 
  group_by(CustLocation) %>% 
  summarise(Total_Amounts = accounting(round(sum(TransactionAmount..INR.),2))) %>% 
  drop_na() %>% 
  arrange(-Total_Amounts) %>% 
  filter(Total_Amounts > 5000000) %>% 
  hchart("bar", hcaes(x = CustLocation, y = Total_Amounts)) %>% 
  hc_add_theme(hc_theme_bloom()) %>% 
  hc_title(text = "Total Amounts Of Transactions Per City") %>% 
  hc_subtitle(text = "Only Cities With Over INR 5 ML Total Amounts")

We can see that MUMBAI city has the leader by each of Number Of Transactions and Total Amounts Of Transactions.

Hope you are enjoyed over this EDA, if so please upvote. And don’t hesitate to leave a comment if you have any questions.

Bank Customer Segementation

Tarek

2022-10-19