# Add an image
knitr::include_graphics('/Users/Biljana/R files/ANZ/image.jpg')

This task is based on a synthesised transaction dataset containing 3 months’ worth of transactions for 100 hypothetical customers. It contains purchases, recurring transactions, and salary transactions. The dataset is designed to simulate realistic transaction behaviours that are observed in ANZ’s real transaction data.

 

Import the Dataset

 

The dataset is provided in an Excel file and has been imported into R for the analysis; it contains 12,043 rows. To get started, load the packages tidyverse and readxl. The readxl package makes it easy to get data out of Excel and into R. The argument guess_max is set to 3052, the maximum number of rows (out of 12,043) used for guessing column types.
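
The chunks that follow also call functions from several other packages (lubridate, funModeling, hrbrthemes, viridis, ggridges, gganimate, gifski, leaflet, sp and geosphere); this list is inferred from the code rather than stated explicitly, so treat the setup chunk below as an assumed, minimal sketch run before the import.

# Packages used throughout the analysis (list inferred from the code below)
library(tidyverse)    # dplyr, ggplot2, tidyr, stringr, forcats
library(readxl)       # read_excel()
library(lubridate)    # hour(), month(), floor_date()
library(funModeling)  # df_status(), freq(), profiling_num()
library(hrbrthemes)   # theme_ipsum()
library(viridis)      # scale_fill_viridis()
library(ggridges)     # geom_density_ridges_gradient()
library(gganimate)    # transition_reveal(), animate()
library(gifski)       # backend for gifski_renderer()
library(leaflet)      # interactive maps
library(sp)           # coordinates()
library(geosphere)    # distHaversine()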

knitr::opts_chunk$set(fig.width = 7, fig.height = 7,
                     warning = FALSE, message = FALSE)
# Load the dataset
anz_data <- read_excel("/Users/Biljana/R files/ANZ/ANZ synthesised transaction dataset.xlsx",
                       sheet = 1, range = NULL, col_names = TRUE,
                       col_types = c("guess"), na = "", trim_ws = TRUE,
                       skip = 0, n_max = Inf, guess_max = min(3052, 12043),
                       progress = readxl_progress(),
                       .name_repair = "unique")

# View the structure of the data
str(anz_data)
## tibble [12,043 × 23] (S3: tbl_df/tbl/data.frame)
##  $ status           : chr [1:12043] "authorized" "authorized" "authorized" "authorized" ...
##  $ card_present_flag: num [1:12043] 1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : chr [1:12043] NA NA NA NA ...
##  $ account          : chr [1:12043] "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr [1:12043] "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr [1:12043] "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr [1:12043] "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr [1:12043] "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr [1:12043] "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num [1:12043] 35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : POSIXct[1:12043], format: "2018-08-01" "2018-08-01" ...
##  $ gender           : chr [1:12043] "F" "F" "M" "F" ...
##  $ age              : num [1:12043] 26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr [1:12043] "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : chr [1:12043] "QLD" "NSW" "NSW" "QLD" ...
##  $ extraction       : chr [1:12043] "2018-08-01T01:01:15.000+0000" "2018-08-01T01:13:45.000+0000" "2018-08-01T01:26:15.000+0000" "2018-08-01T01:38:45.000+0000" ...
##  $ amount           : num [1:12043] 16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr [1:12043] "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr [1:12043] "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr [1:12043] "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr [1:12043] "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : chr [1:12043] "debit" "debit" "debit" "debit" ...

 

About the Data

 

This dataset contains transaction values collected daily and covers the period from 1 August 2018 through 31 October 2018. Each transaction is time-stamped down to the minute at which it was processed. However, we are also interested in summary values per month.
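
As a quick sketch of such a monthly summary (the month variable itself is derived properly in a later step), counts and totals can be taken straight from the date column:

# Sketch: monthly transaction counts and total amounts, taken straight from the date column
anz_data %>%
  group_by(month = format(date, "%Y-%m")) %>%
  summarise(n_transactions = n(),
            total_amount = sum(amount))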

Are there any unusual values?

Below are summary statistics for selected numeric, character and date variables in the dataset.

  • There are 12,043 transactions in this dataset
  • The average transaction amount is 187.93 AUD
  • The date variable ranges from 1 August 2018 through 31 October 2018
  • Customers are between 18 and 78 years of age
# Are there any unusual values?
anz_data %>%
  select(transaction_id, amount, age, date) %>%
  summary()
##  transaction_id         amount             age       
##  Length:12043       Min.   :   0.10   Min.   :18.00  
##  Class :character   1st Qu.:  16.00   1st Qu.:22.00  
##  Mode  :character   Median :  29.00   Median :28.00  
##                     Mean   : 187.93   Mean   :30.58  
##                     3rd Qu.:  53.66   3rd Qu.:38.00  
##                     Max.   :8835.98   Max.   :78.00  
##       date                    
##  Min.   :2018-08-01 00:00:00  
##  1st Qu.:2018-08-24 00:00:00  
##  Median :2018-09-16 00:00:00  
##  Mean   :2018-09-15 21:27:39  
##  3rd Qu.:2018-10-09 00:00:00  
##  Max.   :2018-10-31 00:00:00

 

Are there any duplicates?

 

For this analysis, we are only interested in complete and non-duplicated observations.

# Are there any duplicates?
cat("The number of non-duplicate observations within the data set is",
    nrow(unique(anz_data)), "out of", "\n",
    nrow(anz_data),
    "indicating that there are",
    nrow(anz_data) - nrow(unique(anz_data)),
    "duplicates within the data set.","\n",
    "anz_data is our new, duplicate observation free data frame.")
## The number of non-duplicate observations within the data set is 12043 out of 
##  12043 indicating that there are 0 duplicates within the data set. 
##  anz_data is our new, duplicate observation free data frame.
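
No duplicates were found, but had any been present, a single dplyr::distinct() call would remove them; the line below is a sketch and is a no-op on this dataset.

# Sketch: drop exact duplicate rows if any existed (a no-op here)
anz_data <- dplyr::distinct(anz_data)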

 

Handling Data Types

 

One of the first things to do when we start a data project is to assign the correct data type to each variable. In particular, it is important to assign the right type and format to the date column, as trends over time are key to the type of analysis conducted.

The original dataset contains 23 variables; after conversion it has 28: 7 factor (categorical), 12 character, 8 numerical and one date variable, whose format has been changed. New columns were created to derive variables from the date variable: hour, weekday and month. Customer and merchant longitude and latitude were split into individual columns. The prefixes CUS and ACC were removed from the customer_id and account entries.

# Manually define the variables
anz_data$status <- factor(anz_data$status, levels = c("authorized","posted"), labels = c("authorized", "posted"))
anz_data$card_present_flag <- factor(anz_data$card_present_flag, levels = c("0","1"), labels = c("no", "yes"))
anz_data$txn_description <- factor(anz_data$txn_description, 
                                   levels = c("INTER BANK","PAY/SALARY",
                                               "PAYMENT", "PHONE BANK", 
                                                "POS", "SALES-POS"), 
                                   labels = c("INTER BANK","PAY/SALARY",
                                               "PAYMENT", "PHONE BANK", 
                                               "POS", "SALES-POS"))
anz_data$gender <- factor(anz_data$gender, levels = c("F", "M"), labels = c("F", "M"))
anz_data$bpay_biller_code <- factor(anz_data$bpay_biller_code, 
                                    levels = c("0", "LAND WATER & PLANNING", 
                                               "THE DISCOUNT CHEMIST GROUP"),
                                    labels = c("None", "LAND WATER & PLANNING", 
                                               "THE DISCOUNT CHEMIST GROUP"))
anz_data$merchant_code <- as.character(anz_data$merchant_code)

# Change the format of date column
anz_data$date <- as.Date(anz_data$date, format = "%d/%m/%Y")

# Derive weekday of each transaction
anz_data$weekday = weekdays(anz_data$date, abbreviate = FALSE)
anz_data$weekday <- factor(anz_data$weekday, 
                           levels = c("Monday", "Tuesday", 
                                      "Wednesday", "Thursday",
                                      "Friday", "Saturday", "Sunday"),
                           labels = c("Monday", "Tuesday", "Wednesday", 
                                      "Thursday", "Friday",
                                      "Saturday", "Sunday"))

# Derive hour of each transaction
anz_data$extraction = as.character(anz_data$extraction)
anz_data$hour = hour(as.POSIXct(substr(anz_data$extraction,12,19),format = "%H:%M:%S"))

# Derive month of each transaction
anz_data$month <- month(anz_data$date)
anz_data$month <- factor(anz_data$month, levels = c("8", "9", "10"), 
                         labels = c("August", "September", "October"))

# Remove the CUS and ACC from the variables
anz_data[,c("cus", "cus_id")] <- str_split_fixed(anz_data$customer_id, "-", 2)
anz_data$customer_id <- NULL 
anz_data$cus <- NULL 

anz_data[,c("acc", "acc_id")] <- str_split_fixed(anz_data$account, "-", 2)
anz_data$account <- NULL 
anz_data$acc <- NULL 

# Split customer & merchant lat_long into individual columns for analysis
anz_data[,c("c_long", "c_lat")] <- str_split_fixed(anz_data$long_lat, " ", 2)
anz_data[,c("m_long", "m_lat")] <- str_split_fixed(anz_data$merchant_long_lat, " ", 2)

anz_data <- anz_data %>% select(-long_lat, -merchant_long_lat) 
anz_data <- anz_data %>%
  mutate(c_long = as.numeric(c_long),
         c_lat = as.numeric(c_lat),
         m_long = as.numeric(m_long),
         m_lat = as.numeric(m_lat))

 

Current overview of data types and missing values

 

The df_status function from the funModeling package helps by showing, for each variable, the number and percentage of missing values (NA in R) and the number of unique values. There is a pattern in the missing values: six variables contain 35.92% NAs (the merchant-related fields plus card_present_flag, which appear to be populated only for card transactions) and another two contain roughly 92.7% NAs. By default, readxl treats blank cells as missing data. At this point we don’t exclude any rows or columns, because listwise deletion would remove nearly the entire dataset. There is no general rule for shrinking data; it depends on each case.

# Checking missing values, data type and unique values
df_status(anz_data, print_results = F) %>%
  select(variable, type, unique, q_na, p_na) %>%
  arrange(type)
##             variable      type unique  q_na  p_na
## 1           currency character      1     0  0.00
## 2        merchant_id character   5725  4326 35.92
## 3      merchant_code character      1 11160 92.67
## 4         first_name character     80     0  0.00
## 5    merchant_suburb character   1609  4326 35.92
## 6     merchant_state character      8  4326 35.92
## 7         extraction character   9442     0  0.00
## 8     transaction_id character  12043     0  0.00
## 9            country character      1     0  0.00
## 10          movement character      2     0  0.00
## 11            cus_id character    100     0  0.00
## 12            acc_id character    100     0  0.00
## 13              date      Date     91     0  0.00
## 14            status    factor      2     0  0.00
## 15 card_present_flag    factor      2  4326 35.92
## 16  bpay_biller_code    factor      2 11159 92.66
## 17   txn_description    factor      6     0  0.00
## 18            gender    factor      2     0  0.00
## 19           weekday    factor      7     0  0.00
## 20             month    factor      3     0  0.00
## 21              hour   integer     24     0  0.00
## 22           balance   numeric  12006     0  0.00
## 23               age   numeric     33     0  0.00
## 24            amount   numeric   4457     0  0.00
## 25            c_long   numeric     87     0  0.00
## 26             c_lat   numeric     85     0  0.00
## 27            m_long   numeric    718  4326 35.92
## 28             m_lat   numeric    669  4326 35.92
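
As a quick, hedged cross-check of that pattern, counting transaction types among the rows with a missing merchant_id suggests the 4,326 NAs line up with the non-POS transaction types (payments, salary, inter-bank and phone-bank transfers), which naturally have no merchant attached:

# Sketch: which transaction types account for the rows with missing merchant details?
anz_data %>%
  filter(is.na(merchant_id)) %>%
  count(txn_description, sort = TRUE)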

 

Which day is missing from the dataset?

 

The dataset contains records for only 91 of the 92 days in the period; one day is missing.

date_range <- seq(min(anz_data$date), max(anz_data$date), by = 1)
date_range[!date_range %in% anz_data$date]
## [1] "2018-08-16"

 

How many customers are there according to their unique account number?

 

# Confirm the one-to-one link of acc_id and cus_id
anz_data %>% select(acc_id,cus_id) %>%
  unique() %>%
  nrow()
## [1] 100

 

Are all customers living in Australia?

 

There is one customer whose recorded location is outside Australia.

# Check the range of customer locations and identify customers who don't reside in Australia
# Reference: http://www.ga.gov.au/scientific-topics/national-location-information/
# dimensions/continental-extremities
anz_data_location_outlier <- anz_data[!(anz_data$c_long > 113 & anz_data$c_long < 154 &
                                          anz_data$c_lat > (-44) & anz_data$c_lat < (-10)),]

# How many customers reside outside Australia?
length(unique(anz_data_location_outlier$cus_id))
## [1] 1

 

Who is the customer who resides outside Australia?

 

# Who is the customer who resides outside Australia?
anz_data_location_outlier %>% distinct(first_name, age, gender, cus_id)
## # A tibble: 1 x 4
##   first_name   age gender cus_id    
##   <chr>      <dbl> <fct>  <chr>     
## 1 Daniel        22 M      1617121891

 

Profiling categorical variables

 

Univariate analysis with categorical variables

The first step in data exploration usually consists of univariate, descriptive analysis of all variables of interest. Categorical variables are assessed by measuring frequencies, percentages and cumulative values, and by generating plots. Frequency (distribution) analysis is made simple by the freq function from the funModeling package. It returns the distribution as a table and, by default, a plot, showing both absolute and relative numbers. The output is ordered by frequency, which makes it easy to see the most frequent categories and the share they represent. By default, NA values are included in both the table and the plot.

Looking at the plots and tables, we can see that the “credit” level of the movement variable has 883 rows, i.e. 7.33% of all transactions. Likewise, the “PAY/SALARY” level of txn_description has 883 rows, or 7.33% of the data. This is worth keeping in mind if we decide to analyse purchase (debit) and credit transactions separately.
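
As a quick cross-check of those shares with plain dplyr (a sketch alongside the funModeling output below):

# Sketch: share of each movement type, cross-checking the freq() output
anz_data %>%
  count(movement) %>%
  mutate(percentage = round(100 * n / sum(n), 2))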

# Select categorical variables for analysis
anz_data_cat = select(anz_data, txn_description, merchant_state, movement)

# Plot the frequencies (distribution) of selected variables
freq(anz_data_cat, input = NA,
     plot = TRUE, na.rm = FALSE) # NA values are included (na.rm = FALSE)

##   txn_description frequency percentage cumulative_perc
## 1       SALES-POS      3934      32.67           32.67
## 2             POS      3783      31.41           64.08
## 3         PAYMENT      2600      21.59           85.67
## 4      PAY/SALARY       883       7.33           93.00
## 5      INTER BANK       742       6.16           99.16
## 6      PHONE BANK       101       0.84          100.00

##   merchant_state frequency percentage cumulative_perc
## 1           <NA>      4326      35.92           35.92
## 2            NSW      2169      18.01           53.93
## 3            VIC      2131      17.69           71.62
## 4            QLD      1556      12.92           84.54
## 5             WA      1100       9.13           93.67
## 6             SA       415       3.45           97.12
## 7             NT       205       1.70           98.82
## 8            ACT        73       0.61           99.43
## 9            TAS        68       0.56          100.00

##   movement frequency percentage cumulative_perc
## 1    debit     11160      92.67           92.67
## 2   credit       883       7.33          100.00
## [1] "Variables processed: txn_description, merchant_state, movement"

Visualize categorical variables
#  Top 10 customers at ANZ bank
# Calculate the number of transactions per customer
transactions_per_customer <- anz_data %>%
  group_by(cus_id) %>%
  dplyr::summarise(number_transactions = n()) %>%
  arrange(desc(number_transactions)) %>%
  top_n(10, number_transactions)

# Plot the number of transactions per customers in descending order
ggplot(transactions_per_customer, aes(x = reorder(cus_id,number_transactions) , y = number_transactions)) +
  geom_bar(stat = "identity", fill = "lightseagreen", alpha = .6, width = .6) +
  coord_flip() + theme_ipsum() +
  theme(
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            legend.title = element_blank(),
            plot.title = element_text(vjust = 2, face = 'bold'),
            axis.ticks.y = element_blank(),
            axis.ticks.x = element_blank(),
            axis.text.x = element_blank(),
            axis.text.y = element_text(),
            axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
            axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) + 
  labs(title = 'Top 10 Customers at ANZ Bank') +
  xlab("Customer number") + ylab("Number of transactions") 

# Top states by number of transactions
# Calculate the number of transactions per state
transactions_per_state <- anz_data %>%
  group_by(merchant_state) %>%
  dplyr::summarise(number_transactions = n()) %>%
  arrange(desc(number_transactions))

# Remove NAs
transactions_per_state_omit <- na.omit(transactions_per_state)

# Plot the top states by number of transactions
transactions_per_state_omit %>%
  arrange(number_transactions) %>%    
  mutate(merchant_state = factor(merchant_state, levels = merchant_state)) %>%
  ggplot(aes(x = merchant_state, y = number_transactions)) +
  geom_segment( aes(xend = merchant_state, yend = 0)) +
  geom_point( size = 4, color = "orange") +
  coord_flip() + theme_ipsum() + 
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.title = element_blank(),
    plot.title = element_text(vjust = 2, face = 'bold'),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(),
    axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
    axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) + 
  labs(title = 'Top States at ANZ Bank') +
  xlab("") + ylab("Number of transactions") 

# Ways to bank at ANZ
# Transform the amount variable into two numeric variables: debit and credit
data_debit_credit <- anz_data %>%
  spread(movement, amount)

# Check the distribution of debit variable for outliers
outliers_debit <- boxplot.stats(data_debit_credit$debit)$out
# length(outliers_debit) # 1182 data points (frequency)

# Find in which rows the outliers are
rows_outliers <- data_debit_credit[which(data_debit_credit$debit %in% outliers_debit),]

# Remove the rows containing the outliers
debit_clean <- data_debit_credit[-which(data_debit_credit$debit %in% outliers_debit),]
# dim(debit_clean)

# Filter out one factor level ("PAY/SALARY") from txn_description 
data_debit <- debit_clean %>% group_by(txn_description) %>% 
                                         filter(txn_description != "PAY/SALARY") 
# dim(data_debit)
data_debit$txn_description <- factor(data_debit$txn_description, 
                                     levels = c("PHONE BANK", "INTER BANK", 
                                                "PAYMENT", "POS", "SALES-POS")) 
# re-defining the factor levels drops the unused "PAY/SALARY" level


# data_debit contains 9978 data points (the 11160 debit rows minus the 1182 outliers)
# Plot the ridgeline plot with color relative to the numeric value of the variable debit
ggplot(data_debit, aes(x = `debit`, y = `txn_description` , fill = ..x..)) +
  geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
  scale_fill_viridis(name = "debit", option = "C") +
  ggtitle('Ways to Bank at ANZ') +
  theme_ipsum() +
  xlab("") + ylab("") +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(vjust = 2, face = 'bold'),
        axis.ticks.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(),
        axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
        axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))

# Gender Purchase Amount by State
# Stacked bar plot of state debit proportions coloured by gender
data_debit_merchant_state <- data_debit %>% select(gender, debit, merchant_state) %>%
  filter(!is.na(merchant_state)) %>%
  group_by(merchant_state, debit, gender) %>% 
  dplyr::summarise(count = n())

ggplot(data_debit_merchant_state, aes(fill = gender, y = debit, x = merchant_state)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_manual(values = c("violet", "blue")) +
  ggtitle("Gender Purchase Amount by State") +
  xlab("") + ylab("") +
  theme_ipsum() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(vjust = 2, face = 'bold'),
        axis.ticks.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(),
        axis.text.y = element_text(),
        axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
        axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))

 

Profiling numeric variables

 

Univariate analysis with numerical variables

 

This analysis is performed with the describe function from the Hmisc package and the profiling_num function from funModeling. The idea is to provide a full set of metrics so that the most relevant ones can be selected. Numerical variables are assessed by analysing:

  • percentiles, as useful metrics for describing the distribution
  • mean: the well-known average
  • lowest and highest values, useful for spotting outliers and data errors
  • range_80, an indicator of where 80% of the values lie (between the 10th and 90th percentiles)
  • plots (histograms) of the distribution

All the values are calculated after removing NAs.
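
The range_80 interval can be reproduced directly from the 10th and 90th percentiles; a quick sketch for the amount variable:

# Sketch: range_80 for amount corresponds to the 10th and 90th percentiles
quantile(anz_data$amount, probs = c(0.10, 0.90), na.rm = TRUE)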

library(Hmisc)
vars_to_profile_1 = c("amount", "balance", "age")
vars_to_profile_2 = c("debit", "credit")
data_subset_1 = select(anz_data, one_of(vars_to_profile_1))
data_subset_2 = select(data_debit_credit, one_of(vars_to_profile_2))
describe(data_subset_1)
## data_subset_1 
## 
##  3  Variables      12043  Observations
## --------------------------------------------------------------------------------
## amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    12043        0     4457        1    187.9    311.6    7.141    9.442 
##      .25      .50      .75      .90      .95 
##   16.000   29.000   53.655  186.000 1158.150 
## 
## lowest :    0.10    0.76    1.52    1.59    1.64
## highest: 5103.51 6024.49 6107.23 7081.09 8835.98
## --------------------------------------------------------------------------------
## balance 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    12043        0    12006        1    14704    19112    958.7   1533.5 
##      .25      .50      .75      .90      .95 
##   3158.6   6432.0  12465.9  29442.4  53362.9 
## 
## lowest :      0.24      0.25      0.98      1.02      2.38
## highest: 266981.30 267028.97 267050.66 267093.66 267128.52
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    12043        0       33    0.998    30.58    10.85       19       20 
##      .25      .50      .75      .90      .95 
##       22       28       38       43       46 
## 
## lowest : 18 19 20 21 22, highest: 52 53 64 69 78
## --------------------------------------------------------------------------------
describe(data_subset_2)
## data_subset_2 
## 
##  2  Variables      12043  Observations
## --------------------------------------------------------------------------------
## debit 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    11160      883     4358        1    52.57     62.5     6.89     9.13 
##      .25      .50      .75      .90      .95 
##    15.19    26.93    45.00    94.23   156.00 
## 
## lowest :    0.10    0.76    1.52    1.59    1.64
## highest: 2885.85 3680.71 4094.33 4233.00 7081.09
## --------------------------------------------------------------------------------
## credit 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      883    11160      100        1     1899     1188    725.3    892.1 
##      .25      .50      .75      .90      .95 
##   1013.7   1626.5   2538.7   3578.7   3977.5 
## 
## lowest :  576.00  596.33  664.43  725.32  761.33
## highest: 4910.90 5103.51 6024.49 6107.23 8835.98
## --------------------------------------------------------------------------------
library(funModeling)
my_profiling_table_1 <- profiling_num(data_subset_1) %>% select(variable, mean, p_01, p_99, range_80)
my_profiling_table_1
##   variable        mean     p_01      p_99              range_80
## 1   amount   187.93359   4.3084   3195.01          [9.442, 186]
## 2  balance 14704.19555 152.7050 201963.45 [1533.454, 29442.384]
## 3      age    30.58233  18.0000     69.00              [20, 43]
my_profiling_table_2 <- profiling_num(data_subset_2) %>% select(variable, mean, p_01, p_99, range_80)
my_profiling_table_2
##   variable       mean     p_01      p_99                 range_80
## 1    debit   52.57234   4.2259  488.8288 [9.13, 94.2340000000001]
## 2   credit 1898.72803 576.0000 5103.5100        [892.09, 3578.65]

 

Profiling numerical variables by plotting

 

# Remove outliers
# Outliers from debit variable have already been removed

# Remove outliers from credit variable
outliers_credit <- boxplot.stats(data_debit_credit$credit)$out
credit_clean <- data_debit_credit[-which(data_debit_credit$credit %in% outliers_credit),]

# Remove outliers from amount variable
outliers_amount <- boxplot.stats(anz_data$amount)$out
amount_clean <- anz_data[-which(anz_data$amount %in% outliers_amount),]

# Remove outliers from balance variable
outliers_balance <- boxplot.stats(anz_data$balance)$out
balance_clean <- anz_data[-which(anz_data$balance %in% outliers_balance),]

# Remove outliers from age variable
outliers_age <- boxplot.stats(anz_data$age)$out
age_clean <- anz_data[-which(anz_data$age %in% outliers_age),]

par(
  mfrow = c(1,1),
  mar = c(8.1,6.1,6.1,2.1)
)

hist(amount_clean$amount, col = rgb(0,0,0,0.5) , xlab = "Amount" , ylab = "Frequency" , 
     main = "Overall Transaction Amount")
hist(debit_clean$debit, col = rgb(0,0,0,0.5) , xlab = "Debit" , ylab = "Frequency" , 
     main = "Overall Purchase Amount")
hist(credit_clean$credit,  col = rgb(0,0,0,0.5) , xlab = "Credit" , ylab = "Frequency" , 
     main = "Overall Credit Amount")
hist(balance_clean$balance, col = rgb(0,0,0,0.5) , xlab = "Balance" , ylab = "Frequency" , 
     main = "Overall Balance Amount")
hist(age_clean$age, col = rgb(0,0,0,0.5) , xlab = "Age" , ylab = "Frequency" , main = "Age Histogram")

 

Demographic analysis

 

Providing the right products, services, and messages to the right customers at the right time is at the core of bank marketing. Identifying the demographics of valuable and potentially valuable customers is useful for targeting campaigns and building audiences for remarketing.

Among the 100 customers in the ANZ customer database, the majority of frequent customers are male and belong to the 18-30 age category. A high proportion of credit and debit transaction activity also comes from males in this age category. Together, the age categories from 18 to 54 account for 97% of all customers in the database. The pie chart shows that customers in the oldest (54+) group, particularly women, transact only sporadically. That group also accounts for a smaller share of transaction volume, although the mean amount per transaction tells a somewhat different story. From ANZ’s perspective, this would be useful for initiating marketing campaigns targeted at these or similar groups.
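
The 97% figure can be cross-checked directly from the customer-level data (a sketch, assuming one recorded age per customer):

# Sketch: share of customers younger than 54 (the 18-30, 30-42 and 42-54 categories combined)
anz_data %>%
  distinct(cus_id, age) %>%
  summarise(share_under_54 = mean(age < 54))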

 

Age

 

# Create new variable age_category (discretization of numeric variables)
anz_data <- mutate(anz_data, age_category = case_when(18 <= age & age < 30 ~ '18-30',
                                                      30 <= age & age < 42 ~ '30-42',
                                                      42 <= age & age < 54 ~ '42-54',
                                                      TRUE ~ '54>'))

# Build a one-row-per-customer data frame (first transaction row per cus_id) for customer-level summaries
anz_data_id <- unique(anz_data$cus_id)
anz_df <- NULL

for (val in 1:length(anz_data_id)) {
  cus_id_info <- anz_data_id[val]
  cus_df <- anz_data[anz_data$cus_id == cus_id_info, ]
  cus_df <- cus_df[1, c('cus_id','gender','age', "age_category", "amount")]
  if (is.null(anz_df)) {
    anz_df <- cus_df
  } else{
    anz_df <- rbind(anz_df, cus_df)
  }
}

# Calculate the number of customers by age_category
anz_data_age_count <- anz_df %>% select(cus_id,age_category) %>%
  group_by(age_category) %>% 
  dplyr::summarise(count = n()) 

# Bar chart of age distribution
ggplot(anz_data_age_count, aes(x = fct_reorder(age_category, count), y = count, fill = age_category)) +
  geom_bar( width = 0.6, stat = "identity") + coord_flip() +
  ggtitle("Distribution of Age: ANZ Customer Database") +
  ylab("No of customers") + xlab("Age category") +
  theme_ipsum() +
  theme(
         panel.grid.minor = element_blank(),
         panel.grid.major = element_blank(),
         legend.title = element_blank(),
         plot.title = element_text(vjust = 2),
         axis.ticks.y = element_blank(),
         axis.ticks.x = element_blank(),
         axis.text.x = element_text(),
         axis.text.y = element_text(size = 12),
         axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
         axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
  guides(fill = FALSE) + 
  geom_text(aes(age_category, count + 2.0, label = count))

 

Gender

 

# Calculate the number of customers by gender
anz_data_gender_count <- anz_df %>% select(cus_id, gender) %>%
                         group_by(gender) %>% dplyr::summarise(count = n())

# Bar chart of gender distribution
ggplot(anz_data_gender_count, aes(x = gender, y = count, fill = gender)) +
  geom_bar(width = 0.6, stat = "identity") + coord_flip() +
  ggtitle("Distribution of Gender: ANZ Customer Database") +
  ylab("No of customers") + xlab("Gender category") +
  theme_ipsum() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.title = element_blank(),
    plot.title = element_text(vjust = 2),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
    axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
  guides(fill = FALSE) + 
  geom_text(aes(gender, count + 2.0, label = count))

 

Age & Gender

 

# Calculate the number of customers by gender and age_category
anz_data_age_gender_count <- anz_df %>% select(cus_id,age_category, gender) %>%
  group_by(age_category, gender) %>% dplyr::summarise(count = n())

# Plot the age distribution by gender as a polar (pie-style) chart
ggplot(anz_data_age_gender_count) +
  facet_wrap(~gender) +
  geom_bar(aes(y = count, x = age_category, fill = count), stat = "identity") +
  scale_fill_viridis(name = "count", option = "D") +
  theme_ipsum() +
  coord_polar("x", start = 0) +  ggtitle("Age Distribution Among Gender", 
                                         subtitle = "ANZ customer database") +
  ylab("Age category") + xlab("Gender category") +
  theme(
    plot.title = element_text(vjust = 2),
    axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
    axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))

 

Transaction summary

 

# Transaction count summary by gender and age category

count_by_customer <- anz_df %>% select(cus_id, age_category, gender) %>%
  group_by(age_category, gender) %>%
  dplyr::summarise(count = n())

count_by_transactions <- anz_data %>% select(age_category, gender, amount) %>%
  group_by(age_category, gender) %>%
  dplyr::summarise(number_tnx = n()) 

count_by_amount <- anz_data %>% select(age_category, gender, amount) %>%
  group_by(age_category, gender) %>%
  dplyr::summarise(amount_vol = sum(amount))

transactions <- cbind(count_by_customer, count_by_transactions, count_by_amount)
transactions <- subset(transactions, select = -c(4,5,7,8)) # drop the duplicated grouping columns from cbind()

# Calculate mean tnx count by customer and mean amount by customer from the table data
transactions <- transactions %>% mutate(tnx_per_cus = round(number_tnx/count))
transactions <- transactions %>% mutate(amount_vol_cus = round(amount_vol/count))

cols <- c("Age Category","Gender","Customers", " Tnx Count", "Amount, AUD", "Mean Tnx Count", "Mean Amount, AUD")
colnames(transactions) <- cols
transactions %>% knitr::kable(align = 'c', format = "markdown")
| Age Category | Gender | Customers | Tnx Count | Amount, AUD | Mean Tnx Count | Mean Amount, AUD |
|:------------:|:------:|:---------:|:---------:|:-----------:|:--------------:|:----------------:|
| 18-30        | F      | 21        | 3114      | 443723.60   | 148            | 21130            |
| 18-30        | M      | 29        | 3358      | 646833.41   | 116            | 22305            |
| 30-42        | F      | 15        | 1933      | 393041.63   | 129            | 26203            |
| 30-42        | M      | 16        | 2092      | 414458.57   | 131            | 25904            |
| 42-54        | F      | 7         | 623       | 122969.98   | 89             | 17567            |
| 42-54        | M      | 9         | 699       | 202144.12   | 78             | 22460            |
| 54>          | F      | 1         | 88        | 10587.42    | 88             | 10587            |
| 54>          | M      | 2         | 136       | 29525.47    | 68             | 14763            |

 

Segment the dataset by transaction date and time

 

Over the 91 days, the debit and credit transaction time series show a very strong seasonal pattern at hourly, daily and weekly scales. The busiest hours are between 8 and 10 am, followed by a sharp decline; activity rises again between 1 and 5 pm, with a noticeable peak around 8 pm. These hourly, daily and weekly variations repeat regularly enough to be treated as recurring events, with roughly the same timing and intensity from week to week. The weekly peak falls on Friday, and for debit transactions the momentum carries into Saturday, when customers are most active (shopping, entertainment, travelling).

By Hour
# Visualize transaction volume over an average hour
average_hour_transactions <- anz_data %>%
  select(date,hour) %>%
  group_by(date,hour) %>%
  dplyr::summarise(trans_vol = n()) %>%
  group_by(hour) %>%
  dplyr::summarise(trans_vol_per_hr = mean(trans_vol,na.rm = TRUE))

# Animation zero: line plot of average number of transactions by an hour
animation_zero <- ggplot(average_hour_transactions, aes(x = hour, y = trans_vol_per_hr)) + 
  geom_line(color = "white") +
  geom_point(color = "white") +
  xlab("") + ylab("") + 
  labs(title = "Average Transaction Volume by Hour at ANZ") +
  theme_ipsum(grid = FALSE, ticks = FALSE) +
  scale_y_continuous(labels = scales::comma) + 
  theme(plot.background = element_rect(fill = "dodgerblue2"),
        axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
        axis.text.y = element_text(color = "white"),
        plot.title = element_text(color = "white",size = 15)) +
  transition_reveal(hour)

animate(animation_zero, renderer = gifski_renderer())
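
To read the peak hours off the numbers rather than the animation, the hourly averages can simply be sorted (a sketch):

# Sketch: hours of the day with the highest average transaction volume
average_hour_transactions %>%
  arrange(desc(trans_vol_per_hr)) %>%
  head(5)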

By Day
# Visualize average transaction volume per day
daily_amount_transactions <- anz_data %>%
  group_by(day = floor_date(date, "day")) %>%
  summarise(avg_daily_trans = sum(amount)) 

# Animation first: line plot of overall transaction amount by day
animation_first <- ggplot(daily_amount_transactions, aes(x = day, y = avg_daily_trans)) + 
  geom_line(color = "white") +
  geom_point(color = "white") +
  xlab("") + ylab("") + 
  labs(title = "Average Daily Transaction Amount at ANZ") +
  theme_ipsum(grid = FALSE, ticks = FALSE) +
  scale_y_continuous(labels = scales::comma) + 
  theme(plot.background = element_rect(fill = "dodgerblue2"),
    axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
    axis.text.y = element_text(color = "white"),
        plot.title = element_text(color = "white",size = 15)) +
  transition_reveal(day)

animate(animation_first, renderer = gifski_renderer())

# Visualize average number of transactions per day
number_transactions_day <- anz_data %>%
  group_by(day = floor_date(date, "day")) %>%
  summarise(avg_num_trans = n())

# Animation second: line plot of average number of transactions per day
 animation_second <- ggplot(number_transactions_day, aes(day, avg_num_trans)) +
   geom_line(color = "white") +
   geom_point(color = "white") +
   xlab("") + ylab("") + 
   labs(title = "Average Daily Number of Transactions at ANZ") +
   theme_ipsum(grid = FALSE, ticks = FALSE) +
   scale_y_continuous(labels = scales::comma) + 
   theme(plot.background = element_rect(fill = "dodgerblue2"),
         axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
         axis.text.y = element_text(color = "white"),
         plot.title = element_text(color = "white",size = 15)) +
   transition_reveal(day)
 
animate(animation_second, renderer = gifski_renderer())

By Week
# Visualise transaction volume over an average week
average_weekly_transactions <- anz_data %>%
  select(date,weekday) %>%
  group_by(date,weekday) %>%
  summarise(daily_avg_vol = n()) %>%
  group_by(weekday) %>%
  summarise(avg_vol = mean(daily_avg_vol,na.rm = TRUE ))

average_weekly_transactions$weekday <- as.numeric(average_weekly_transactions$weekday)

# Animation third: average number of transactions by weekday
animation_third <- ggplot(average_weekly_transactions,aes(x = weekday, y = avg_vol)) + 
  geom_line(aes(group = 1), color = "white") +
  geom_point(color = "white") +
  xlab("") + ylab("") + 
  labs(title = "Average Weekday Number of Transactions at ANZ") +
  theme_ipsum(grid = FALSE) +
  scale_x_continuous(breaks = 1:7, 
                     labels = c("Monday","Tuesday", 
                                "Wednesday","Thursday",
                                "Friday", "Saturday", "Sunday")) +
  theme(plot.background = element_rect(fill = "dodgerblue2"),
        axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
        axis.text.y = element_text(color = "white"),
        plot.title = element_text(color = "white",size = 15)) + 
  transition_reveal(weekday) 
animate(animation_third, renderer = gifski_renderer())

# Visualise debit transaction volume over an average week
average_weekly_debit_transactions <- data_debit %>%
  select(date,weekday) %>%
  group_by(date,weekday) %>%
  summarise(daily_avg_debit_vol = n()) %>%
  group_by(weekday) %>%
  summarise(avg_debit_vol = mean(daily_avg_debit_vol,na.rm = TRUE ))

average_weekly_debit_transactions$weekday <- as.numeric(average_weekly_debit_transactions$weekday)

# Animation fourth: debit volume over an average week
animation_forth <- ggplot(average_weekly_debit_transactions,aes(x = weekday, y = avg_debit_vol)) +
  geom_line(aes(group = 1), color = "white") +
  geom_point(color = "white") +
  xlab("") + ylab("") +
  labs(title = "Average Weekday Number of Debit Transactions at ANZ") +
  theme_ipsum(grid = FALSE) +
  scale_x_continuous(breaks = 1:7, labels = c("Monday",
                                              "Tuesday", "Wednesday", "Thursday",
                                              "Friday", "Saturday", "Sunday")) +
  theme(plot.background = element_rect(fill = "dodgerblue2"),
        axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
        axis.text.y = element_text(color = "white"),
        plot.title = element_text(color = "white",size = 15)) +
  transition_reveal(weekday)
animate(animation_forth, renderer = gifski_renderer())

 

 

Histogram of monthly transaction volume by customer

 

 

# Visualize monthly transaction volume by customer
monthly_average_cus <- amount_clean %>%
  group_by(cus_id) %>%
  summarise(mon_avg_vol = round(n()/3,0))

hist(monthly_average_cus$mon_avg_vol,col = rgb(0,0,0,0.5),
     xlab = 'Monthly transaction volume', ylab = 'Number of customers',
     main = "")

 

Exploring location information

 

Location of customers map

 

# Exclude the single foreign customer whose location information was incorrectly
# stored (i.e. latitude 573)
anz_data_location <- anz_data %>%
  filter(c_long > 113 & c_long < 154 & c_lat > (-44) & c_lat < (-10))

# Select variables and create a dataframe
anz_data_cust_merchant <- anz_data_location %>% select(cus_id, c_long, c_lat, m_long,m_lat) %>%
  group_by(cus_id) %>% na.omit()

# Location of customers map
cus_loc <- unique(anz_data_cust_merchant[, c("cus_id", "c_long", "c_lat")])
df_map <- data.frame(longitude = as.numeric(cus_loc$c_long),
                     latitude = as.numeric(cus_loc$c_lat))
coordinates(df_map) <- ~longitude+latitude
cus_icon <- makeAwesomeIcon(icon = 'home', markerColor = 'green')
location_customer_map <- leaflet(df_map) %>% addMarkers() %>% 
  addTiles(group = "OSM") %>%
  addProviderTiles("CartoDB", group = "CartoDB") %>%
  addProviderTiles("Esri", group = "Esri") %>%
  addLayersControl(baseGroups = c("OSM", "CartoDB", "Esri")) %>%
  addAwesomeMarkers(
    lng = as.numeric(cus_loc$c_long), lat = as.numeric(cus_loc$c_lat),
    icon = cus_icon)

location_customer_map
# We could also look at the distribution of distances between customers and the merchants they trade with

anz_data_loc = anz_data_cust_merchant[,c("c_long", "c_lat","m_long", "m_lat")]
anz_data_loc$dst <- distHaversine(anz_data_loc[, 1:2], anz_data_loc[, 3:4]) / 1000
hist(anz_data_loc$dst[anz_data_loc$dst < 100], col = rgb(0,0,0,0.5),
     main = "Distance Between Customer and Merchants", xlab = "Distance (km)")

Location of merchants map
# To validate, we could further plot the location of a customer and the merchants they trade with on a map. This map shows merchants for a single customer, with customer id = 51506836.


merch_dist <- function(id){
  ### This function takes a customer id and plots the location of that customer and all merchants they have traded with
  cus_icon <- makeAwesomeIcon(icon = 'home', markerColor = 'green')
  l = subset(anz_data_cust_merchant[,c("cus_id","m_long","m_lat")], cus_id == id)
  l <- l[c("m_long","m_lat")]
  cus_loc <- unique(subset(anz_data_cust_merchant[,c("cus_id","c_long", "c_lat")], cus_id == id))
  df_map = data.frame(longitude = as.numeric(l$m_long),
                      latitude = as.numeric(l$m_lat))
  coordinates(df_map) <- ~longitude+latitude
  leaflet(df_map) %>% addMarkers() %>% addTiles(group = "OSM") %>%
    addProviderTiles("CartoDB", group = "CartoDB") %>%
    addProviderTiles("Esri", group = "Esri") %>%
    addLayersControl(baseGroups = c("OSM", "CartoDB", "Esri")) %>%
    addAwesomeMarkers(
      lng = as.numeric(cus_loc$c_long), lat = as.numeric(cus_loc$c_lat),
      icon = cus_icon)
}
merch_dist(id = '51506836')