# Add an image
knitr::include_graphics('/Users/Biljana/R files/ANZ/image.jpg')
This task is based on a synthesised transaction dataset containing 3 months’ worth of transactions for 100 hypothetical customers. It contains purchases, recurring transactions, and salary transactions.The dataset is designed to simulate realistic transaction behaviours that are observed in ANZ’s real transaction data.
The dataset is provided in an excel file and has been imported into R for the analysis.The size of the dataset is 12043 rows. To get started load the packages tidyverse and readxl. The readxl package makes it easy to get data out of Excel and into R. Argument gues_max is set to 3052, a maximum number of data rows to use for guessing column types out of 12043 rows.
knitr::opts_chunk$set(fig.width = 7, fig.height = 7,
warning = FALSE, message = FALSE)
# Load the dataset
anz_data <- read_excel("/Users/Biljana/R files/ANZ/ANZ synthesised transaction dataset.xlsx",
sheet = 1, range = NULL, col_names = TRUE,
col_types = c("guess"), na = "", trim_ws = TRUE,
skip = 0, n_max = Inf, guess_max = min(3052, 12043),
progress = readxl_progress(),
.name_repair = "unique")
# View the structure of the data
str(anz_data)
## tibble [12,043 × 23] (S3: tbl_df/tbl/data.frame)
## $ status : chr [1:12043] "authorized" "authorized" "authorized" "authorized" ...
## $ card_present_flag: num [1:12043] 1 0 1 1 1 NA 1 1 1 NA ...
## $ bpay_biller_code : chr [1:12043] NA NA NA NA ...
## $ account : chr [1:12043] "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
## $ currency : chr [1:12043] "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr [1:12043] "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
## $ txn_description : chr [1:12043] "POS" "SALES-POS" "POS" "SALES-POS" ...
## $ merchant_id : chr [1:12043] "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
## $ merchant_code : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr [1:12043] "Diana" "Diana" "Michael" "Rhonda" ...
## $ balance : num [1:12043] 35.39 21.2 5.71 2117.22 17.95 ...
## $ date : POSIXct[1:12043], format: "2018-08-01" "2018-08-01" ...
## $ gender : chr [1:12043] "F" "F" "M" "F" ...
## $ age : num [1:12043] 26 26 38 40 26 20 43 43 27 40 ...
## $ merchant_suburb : chr [1:12043] "Ashmore" "Sydney" "Sydney" "Buderim" ...
## $ merchant_state : chr [1:12043] "QLD" "NSW" "NSW" "QLD" ...
## $ extraction : chr [1:12043] "2018-08-01T01:01:15.000+0000" "2018-08-01T01:13:45.000+0000" "2018-08-01T01:26:15.000+0000" "2018-08-01T01:38:45.000+0000" ...
## $ amount : num [1:12043] 16.25 14.19 6.42 40.9 3.25 ...
## $ transaction_id : chr [1:12043] "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
## $ country : chr [1:12043] "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr [1:12043] "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
## $ merchant_long_lat: chr [1:12043] "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
## $ movement : chr [1:12043] "debit" "debit" "debit" "debit" ...
This dataset contains the transaction values collected daily and covers the time span between 1 August 2018 through 31 October 2018. We have a single data point for each minute of the day the transactions were processed in this dataset. However we are alo interested in summary values per month.
Below is a summary statistics of selected numeric, categorical and date variables in the dataset.
# Are there any unusual values?
anz_data %>%
select(transaction_id, amount, age, date) %>%
summary()
## transaction_id amount age
## Length:12043 Min. : 0.10 Min. :18.00
## Class :character 1st Qu.: 16.00 1st Qu.:22.00
## Mode :character Median : 29.00 Median :28.00
## Mean : 187.93 Mean :30.58
## 3rd Qu.: 53.66 3rd Qu.:38.00
## Max. :8835.98 Max. :78.00
## date
## Min. :2018-08-01 00:00:00
## 1st Qu.:2018-08-24 00:00:00
## Median :2018-09-16 00:00:00
## Mean :2018-09-15 21:27:39
## 3rd Qu.:2018-10-09 00:00:00
## Max. :2018-10-31 00:00:00
For this analysis, we are only interested in complete and non-duplicated observations.
# Are there any duplicates?
cat("The number of non-duplicate observations within the data set is",
nrow(unique(anz_data)), "out of", "\n",
nrow(anz_data),
"indicating that there are",
nrow(anz_data) - nrow(unique(anz_data)),
"duplicates within the data set.","\n",
"anz_data is our new, duplicate observation free data frame.")
## The number of non-duplicate observations within the data set is 12043 out of
## 12043 indicating that there are 0 duplicates within the data set.
## anz_data is our new, duplicate observation free data frame.
One of the first things to do when we start a data project is to assign the correct data type for each variable. In particular, it important to assign the right type and format of variable to column date as trends over time are key for the type of analysis conducted.
Original dataset contains 23 variables that has been converted into dataset with 28 variables: 7 categorical (ordinal), 12 categorical (nominal), 8 (numerical) and one date variable which format has been changed. New columns were created to derived varibles from date variable: hour, weekday and month. Customer and merchant longitude and latidue were split into individual columns. Prefixes CUS and ACC were removed from customer_id and account data entries.
# Manually define the variables
anz_data$status <- factor(anz_data$status, levels = c("authorized","posted"), labels = c("authorized", "posted"))
anz_data$card_present_flag <- factor(anz_data$card_present_flag, levels = c("0","1"), labels = c("no", "yes"))
anz_data$txn_description <- factor(anz_data$txn_description,
levels = c("INTER BANK","PAY/SALARY",
"PAYMENT", "PHONE BANK",
"POS", "SALES-POS"),
labels = c("INTER BANK","PAY/SALARY",
"PAYMENT", "PHONE BANK",
"POS", "SALES-POS"))
anz_data$gender <- factor(anz_data$gender, levels = c("F", "M"), labels = c("F", "M"))
anz_data$bpay_biller_code <- factor(anz_data$bpay_biller_code,
levels = c("0", "LAND WATER & PLANNING",
"THE DISCOUNT CHEMIST GROUP"),
labels = c("None", "LAND WATER & PLANNING",
"THE DISCOUNT CHEMIST GROUP"))
anz_data$merchant_code <- as.character(anz_data$merchant_code)
# Change the format of date column
anz_data$date <- as.Date(anz_data$date, format = "%d/%m/%Y")
# Derive weekday of each transaction
anz_data$weekday = weekdays(anz_data$date, abbreviate = FALSE)
anz_data$weekday <- factor(anz_data$weekday,
levels = c("Monday", "Tuesday",
"Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"),
labels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday",
"Saturday", "Sunday"))
# Derive hour of each transaction
anz_data$extraction = as.character(anz_data$extraction)
anz_data$hour = hour(as.POSIXct(substr(anz_data$extraction,12,19),format = "%H:%M:%S"))
# Derive month of each transaction
anz_data$month <- month(anz_data$date)
anz_data$month <- factor(anz_data$month, levels = c("8", "9", "10"),
labels = c("August", "September", "October"))
# Remove the CUS and ACC from the variables
anz_data[,c("cus", "cus_id")] <- str_split_fixed(anz_data$customer_id, "-", 2)
anz_data$customer_id <- NULL
anz_data$cus <- NULL
anz_data[,c("acc", "acc_id")] <- str_split_fixed(anz_data$account, "-", 2)
anz_data$account <- NULL
anz_data$acc <- NULL
# Split customer & merchant lat_long into individual columns for analysis
anz_data[,c("c_long", "c_lat")] <- str_split_fixed(anz_data$long_lat, " ", 2)
anz_data[,c("m_long", "m_lat")] <- str_split_fixed(anz_data$merchant_long_lat, " ", 2)
anz_data <- anz_data %>% select(-long_lat, -merchant_long_lat)
anz_data <- anz_data %>%
mutate(c_long = as.numeric(c_long),
c_lat = as.numeric(c_lat),
m_long = as.numeric(m_long),
m_lat = as.numeric(m_lat))
The df_status function coming in package funModeling can help us by showing numbers of missing values (NA in R) in relative and percentage values and numbers of unique values for each variable in the dataset. There is a pattern in missing values: five variables contain 35.92% of missing values or NA’s and another two have 92.66% of NA values. By default, readxl treats blank cells as missing data. At this point, we dont exclude any rows or columns because it will delete the entire dataset.There is no general rule to shrink data, it depends on each case.
# Checking missing values, data type and unique values
df_status(anz_data, print_results = F) %>% select(variable, type, unique, q_na, p_na) %>% arrange(type)
## variable type unique q_na p_na
## 1 currency character 1 0 0.00
## 2 merchant_id character 5725 4326 35.92
## 3 merchant_code character 1 11160 92.67
## 4 first_name character 80 0 0.00
## 5 merchant_suburb character 1609 4326 35.92
## 6 merchant_state character 8 4326 35.92
## 7 extraction character 9442 0 0.00
## 8 transaction_id character 12043 0 0.00
## 9 country character 1 0 0.00
## 10 movement character 2 0 0.00
## 11 cus_id character 100 0 0.00
## 12 acc_id character 100 0 0.00
## 13 date Date 91 0 0.00
## 14 status factor 2 0 0.00
## 15 card_present_flag factor 2 4326 35.92
## 16 bpay_biller_code factor 2 11159 92.66
## 17 txn_description factor 6 0 0.00
## 18 gender factor 2 0 0.00
## 19 weekday factor 7 0 0.00
## 20 month factor 3 0 0.00
## 21 hour integer 24 0 0.00
## 22 balance numeric 12006 0 0.00
## 23 age numeric 33 0 0.00
## 24 amount numeric 4457 0 0.00
## 25 c_long numeric 87 0 0.00
## 26 c_lat numeric 85 0 0.00
## 27 m_long numeric 718 4326 35.92
## 28 m_lat numeric 669 4326 35.92
The dateset only contains records for 91 days, one day is missing.
date_range <- seq(min(anz_data$date), max(anz_data$date), by = 1)
date_range[!date_range %in% anz_data$date]
## [1] "2018-08-16"
# Confirm the one -to -one link of acc_id and cus_id
anz_data %>% select(acc_id,cus_id) %>%
unique() %>%
nrow()
## [1] 100
There is one customer who is living outside Australia.
# Check the range of customer locations and define customers who dont reside in Australia
# Reference: http://www.ga.gov.au/scientific-topics/national-location-information/
# dimensions/continental-extremities
anz_data_location_outlier <- anz_data[!(anz_data$c_long > 113 & anz_data$c_long < 154 & anz_data$c_lat > (-44) & anz_data$c_lat < (-10)),]
# How many customers reside outside Australia?
length(unique(anz_data_location_outlier$cus_id))
## [1] 1
# Who is the customer who resides outside Australia?
anz_data_location_outlier %>% distinct(first_name, age, gender, cus_id)
## # A tibble: 1 x 4
## first_name age gender cus_id
## <chr> <dbl> <fct> <chr>
## 1 Daniel 22 M 1617121891
The first step in data exploration usually consists of univariate, descriptive analysis of all variables of interest. Categorical variables are assessed by measurement of frequencies, percentages, cumulative values, and generating colorful plots. Frequency or distribution analysis is made simple by the freq function coming in package funModeling. It retrieves the distribution in a table and a plot (by default) and shows the distribution of absolute and relative numbers.The output is ordered by the frequency variable, which quickly analyzes the most frequent categories and how many shares they represent. By default, NA values are considered in both the table and the plot.
By analysing the plots and the tables, we can notice that factor level “credit” of movement variable has 883 rows that is 7.33% of all transactions. As well, factor level “PAY/SALARY” of txn_description variable has only 883 rows or 7.33% of the data. It is a useful thought to consider if we decide to analyse purchase (debit) or credit transactions separately.
# Select categorical variables for analysis
anz_data_cat = select(anz_data, txn_description, merchant_state, movement)
# Plot the frequencies (distribution) of selected variables
freq(anz_data_cat, input = NA,
plot = TRUE, na.rm = FALSE) # NA values are excluded
## txn_description frequency percentage cumulative_perc
## 1 SALES-POS 3934 32.67 32.67
## 2 POS 3783 31.41 64.08
## 3 PAYMENT 2600 21.59 85.67
## 4 PAY/SALARY 883 7.33 93.00
## 5 INTER BANK 742 6.16 99.16
## 6 PHONE BANK 101 0.84 100.00
## merchant_state frequency percentage cumulative_perc
## 1 <NA> 4326 35.92 35.92
## 2 NSW 2169 18.01 53.93
## 3 VIC 2131 17.69 71.62
## 4 QLD 1556 12.92 84.54
## 5 WA 1100 9.13 93.67
## 6 SA 415 3.45 97.12
## 7 NT 205 1.70 98.82
## 8 ACT 73 0.61 99.43
## 9 TAS 68 0.56 100.00
## movement frequency percentage cumulative_perc
## 1 debit 11160 92.67 92.67
## 2 credit 883 7.33 100.00
## [1] "Variables processed: txn_description, merchant_state, movement"
# Top 10 customers at ANZ bank
# Calculate the number of transactions per customer
transactions_per_customer <- anz_data %>%
group_by(cus_id) %>%
dplyr::summarise(number_transactions = n()) %>%
arrange(desc(number_transactions)) %>%
top_n(10, number_transactions)
# Plot the number of transactions per customers in descending order
ggplot(transactions_per_customer, aes(x = reorder(cus_id,number_transactions) , y = number_transactions)) +
geom_bar(stat = "identity", fill = "lightseagreen", alpha = .6, width = .6) +
coord_flip() + theme_ipsum() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
labs(title = 'Top 10 Customers at ANZ Bank') +
xlab("Customer number") + ylab("Number of transactions")
# Top states by number of transactions
# Calculate the number of transactions per state
transactions_per_state <- anz_data %>%
group_by(merchant_state) %>%
dplyr::summarise(number_transactions = n()) %>%
arrange(desc(number_transactions))
# Remove Na's
transactions_per_state_omit <- na.omit(transactions_per_state)
# Plot the top states by number of transactions
transactions_per_state_omit %>%
arrange(number_transactions) %>%
mutate(merchant_state = factor(merchant_state, levels = merchant_state)) %>%
ggplot(aes(x = merchant_state, y = number_transactions)) +
geom_segment( aes(xend = merchant_state, yend = 0)) +
geom_point( size = 4, color = "orange") +
coord_flip() + theme_ipsum() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
labs(title = 'Top States at ANZ Bank') +
xlab("") + ylab("Number of transactions")
# Ways to bank at ANZ
# Trasnformation of amount variable into two variables: a debit and a credit numeric variables
data_debit_credit <- anz_data %>%
spread(movement, amount)
# Check the distribution of debit variable for outliers
outliers_debit <- boxplot.stats(data_debit_credit$debit)$out
# length(outliers_debit) # 1182 data points (frequency)
# Find in which rows the outliers are
rows_outliers <- data_debit_credit[which(data_debit_credit$debit %in% outliers_debit),]
# Remove the rows containing the outliers
debit_clean <- data_debit_credit[-which(data_debit_credit$debit %in% outliers_debit),]
# dim(debit_clean)
# Filter out one factor level ("PAY/SALARY") from txn_description
data_debit <- debit_clean %>% group_by(txn_description) %>%
filter(txn_description != "PAY/SALARY")
# dim(data_debit)
data_debit$txn_description <- factor(data_debit$txn_description,
levels = c("PHONE BANK", "INTER BANK",
"PAYMENT", "POS", "SALES-POS"))
# assigning as factor eliminates the level
# Data_debit contains 9978 data points yielding 11160 together with outliers of 1182
# Plot the ridgeline plot with color relative to the numeric value of the variable debit
ggplot(data_debit, aes(x = `debit`, y = `txn_description` , fill = ..x..)) +
geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
scale_fill_viridis(name = "debit", option = "C") +
ggtitle('Ways to Bank at ANZ') +
theme_ipsum() +
xlab("") + ylab("") +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
# Gender Purchase Amount by State
# Stacked bar plot of state debit proportions coloured by gender
data_debit_merchant_state <- data_debit %>% select(gender, debit, merchant_state) %>%
filter(!is.na(merchant_state)) %>%
group_by(merchant_state, debit, gender) %>%
dplyr::summarise(count = n())
ggplot(data_debit_merchant_state, aes(fill = gender, y = debit, x = merchant_state)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_manual(values = c("violet", "blue")) +
ggtitle("Gender Purchase Amount by State") +
xlab("") + ylab("") +
theme_ipsum() +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
This analysis is performed with describe function and profiling_num function as part of packages Hmisc and funModeling respecitevly. The idea is to provide to data scientists a full set of metrics, so they can select the most relevant. Numerical variables are assessed by analysing:
All the values are calculated after removing NA’s.
library(Hmisc)
vars_to_profile_1 = c("amount", "balance", "age")
vars_to_profile_2 = c("debit", "credit")
data_subset_1 = select(anz_data, one_of(vars_to_profile_1))
data_subset_2 = select(data_debit_credit, one_of(vars_to_profile_2))
describe(data_subset_1)
## data_subset_1
##
## 3 Variables 12043 Observations
## --------------------------------------------------------------------------------
## amount
## n missing distinct Info Mean Gmd .05 .10
## 12043 0 4457 1 187.9 311.6 7.141 9.442
## .25 .50 .75 .90 .95
## 16.000 29.000 53.655 186.000 1158.150
##
## lowest : 0.10 0.76 1.52 1.59 1.64
## highest: 5103.51 6024.49 6107.23 7081.09 8835.98
## --------------------------------------------------------------------------------
## balance
## n missing distinct Info Mean Gmd .05 .10
## 12043 0 12006 1 14704 19112 958.7 1533.5
## .25 .50 .75 .90 .95
## 3158.6 6432.0 12465.9 29442.4 53362.9
##
## lowest : 0.24 0.25 0.98 1.02 2.38
## highest: 266981.30 267028.97 267050.66 267093.66 267128.52
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 12043 0 33 0.998 30.58 10.85 19 20
## .25 .50 .75 .90 .95
## 22 28 38 43 46
##
## lowest : 18 19 20 21 22, highest: 52 53 64 69 78
## --------------------------------------------------------------------------------
describe(data_subset_2)
## data_subset_2
##
## 2 Variables 12043 Observations
## --------------------------------------------------------------------------------
## debit
## n missing distinct Info Mean Gmd .05 .10
## 11160 883 4358 1 52.57 62.5 6.89 9.13
## .25 .50 .75 .90 .95
## 15.19 26.93 45.00 94.23 156.00
##
## lowest : 0.10 0.76 1.52 1.59 1.64
## highest: 2885.85 3680.71 4094.33 4233.00 7081.09
## --------------------------------------------------------------------------------
## credit
## n missing distinct Info Mean Gmd .05 .10
## 883 11160 100 1 1899 1188 725.3 892.1
## .25 .50 .75 .90 .95
## 1013.7 1626.5 2538.7 3578.7 3977.5
##
## lowest : 576.00 596.33 664.43 725.32 761.33
## highest: 4910.90 5103.51 6024.49 6107.23 8835.98
## --------------------------------------------------------------------------------
library(funModeling)
my_profiling_table_1 <- profiling_num(data_subset_1) %>% select(variable, mean, p_01, p_99, range_80)
my_profiling_table_1
## variable mean p_01 p_99 range_80
## 1 amount 187.93359 4.3084 3195.01 [9.442, 186]
## 2 balance 14704.19555 152.7050 201963.45 [1533.454, 29442.384]
## 3 age 30.58233 18.0000 69.00 [20, 43]
my_profiling_table_2 <- profiling_num(data_subset_2) %>% select(variable, mean, p_01, p_99, range_80)
my_profiling_table_2
## variable mean p_01 p_99 range_80
## 1 debit 52.57234 4.2259 488.8288 [9.13, 94.2340000000001]
## 2 credit 1898.72803 576.0000 5103.5100 [892.09, 3578.65]
# Remove outliers
# Outliers from debit variable have already been removed
# Remove outliers from credit variable
outliers_credit <- boxplot.stats(data_debit_credit$credit)$out
credit_clean <- data_debit_credit[-which(data_debit_credit$credit %in% outliers_credit),]
# Remove outliers from amount variable
outliers_amount <- boxplot.stats(anz_data$amount)$out
amount_clean <- anz_data[-which(anz_data$amount %in% outliers_amount),]
# Remove outliers from balance variable
outliers_balance <- boxplot.stats(anz_data$balance)$out
balance_clean <- anz_data[-which(anz_data$balance %in% outliers_balance),]
# Remove outliers from age variable
outliers_age <- boxplot.stats(anz_data$age)$out
age_clean <- anz_data[-which(anz_data$age %in% outliers_age),]
par(
mfrow = c(1,1),
mar = c(8.1,6.1,6.1,2.1)
)
hist(amount_clean$amount, col = rgb(0,0,0,0.5) , xlab = "Amount" , ylab = "Frequency" ,
main = "Overall Transaction Amount")
hist(debit_clean$debit, col = rgb(0,0,0,0.5) , xlab = "Debit" , ylab = "Frequency" ,
main = "Overall Purchase Amount")
hist(credit_clean$credit, col = rgb(0,0,0,0.5) , xlab = "Credit" , ylab = "Frequency" ,
main = "Overall Credit Amount")
hist(balance_clean$balance, col = rgb(0,0,0,0.5) , xlab = "Balance" , ylab = "Frequency" , main = "Overall Balance Amount")
hist(age_clean$age, col = rgb(0,0,0,0.5) , xlab = "Age" , ylab = "Frequency" , main = "Age Histogram")
Providing the right products, services, and messages to the right customers at the right time is at the core of bank marketing. Identifying the demographics of valuable and potentially valuable customers is useful for targeting campaigns and building audiences for remarketing.
Among 100 customers in the ANZ customer database, majority of the frequent customers are male individuals who appear to belong to age category from 18 to 30 years old. High proportion of credit and debit transaction activities are also related to male individuals in this age category. In addition, combination of age categories from 18 years old to 54 years old represents 97% of all customers in the database. According to the pie chart, it is evident that individuals above 50 years of age are mostly sporadic, particularly among female group. In terms of transaction volume, the age category above 54 shows lesser number of proportion, but when it comes to the mean value of the amount per transaction, the final conclusion is different. This would be useful from the ANZ perspective to initiate marketing campaigns targeting these or similar specific groups.
# Create new variable age_category (discretization of numeric variables)
anz_data <- mutate(anz_data, age_category = case_when(18 <= age & age < 30 ~ '18-30',
30 <= age & age < 42 ~ '30-42',
42 <= age & age < 54 ~ '42-54',
TRUE ~ '54>'))
# Define dataframe to calculate the number of transactios by customer_id
anz_data_id <- unique(anz_data$cus_id)
anz_df <- NULL
for (val in 1:length(anz_data_id)) {
cus_id_info <- anz_data_id[val]
cus_df <- anz_data[anz_data$cus_id == cus_id_info, ]
cus_df <- cus_df[1, c('cus_id','gender','age', "age_category", "amount")]
if (is.null(anz_df)) {
anz_df <- cus_df
} else{
anz_df <- rbind(anz_df, cus_df)
}
}
# Calculate the number of customers by age_category
anz_data_age_count <- anz_df %>% select(cus_id,age_category) %>%
group_by(age_category) %>%
dplyr::summarise(count = n())
# Bar chart of age distribution
ggplot(anz_data_age_count, aes(x = fct_reorder(age_category, count), y = count, fill = age_category)) +
geom_bar( width = 0.6, stat = "identity") + coord_flip() +
ggtitle("Distribution of Age: ANZ Customer Database") +
ylab("No of customers") + xlab("Age category") +
theme_ipsum() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
guides(fill = FALSE) +
geom_text(aes(age_category, count + 2.0, label = count))
# Calculate the number of customers by gender
anz_data_gender_count <- anz_df %>% select(cus_id, gender) %>%
group_by(gender) %>% dplyr::summarise(count = n())
# Bar chart of gender distribution
ggplot(anz_data_gender_count, aes(x = gender, y = count, fill = gender)) +
geom_bar(width = 0.6, stat = "identity") + coord_flip() +
ggtitle("Distribution of Gender: ANZ Customer Database") +
ylab("No of customers") + xlab("Gender category") +
theme_ipsum() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0))) +
guides(fill = FALSE) +
geom_text(aes(gender, count + 2.0, label = count))
# Calculate the number of customers by gender and age_category
anz_data_age_gender_count <- anz_df %>% select(cus_id,age_category, gender) %>%
group_by(age_category, gender) %>% dplyr::summarise(count = n())
# Plot age distribution by gender with pie chart
ggplot(anz_data_age_gender_count) +
facet_wrap(~gender) +
geom_bar(aes(y = count, x = age_category, fill = count), stat = "identity") +
scale_fill_viridis(name = "count", option = "D") +
theme_ipsum() +
coord_polar("x", start = 0) + ggtitle("Age Distribution Among Gender",
subtitle = "ANZ customer database") +
ylab("Age category") + xlab("Gender category") +
theme(
plot.title = element_text(vjust = 2),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
# Transaction count summary: genderwise and agewise
count_by_customer <- anz_df %>% select(cus_id, age_category, gender) %>%
group_by(age_category, gender) %>%
dplyr::summarise(count = n())
count_by_transactions <- anz_data %>% select(age_category, gender, amount) %>%
group_by(age_category, gender) %>%
dplyr::summarise(number_tnx = n())
count_by_amount <- anz_data %>% select(age_category, gender, amount) %>%
group_by(age_category, gender) %>%
dplyr::summarise(amount_vol = sum(amount))
transactions <- cbind(count_by_customer, count_by_transactions, count_by_amount)
transactions <- subset(transactions, select = -c(4,5,7,8)) # drop columns
# Calculate mean tnx count by customer and mean amount by customer from the table data
transactions <- transactions %>% mutate(tnx_per_cus = round(number_tnx/count))
transactions <- transactions %>% mutate(amount_vol_cus = round(amount_vol/count))
cols <- c("Age Category","Gender","Customers", " Tnx Count", "Amount, AUD", "Mean Tnx Count", "Mean Amount, AUD")
colnames(transactions) <- cols
transactions %>% knitr::kable(align = 'c', format = "markdown")
| Age Category | Gender | Customers | Tnx Count | Amount, AUD | Mean Tnx Count | Mean Amount, AUD |
|---|---|---|---|---|---|---|
| 18-30 | F | 21 | 3114 | 443723.60 | 148 | 21130 |
| 18-30 | M | 29 | 3358 | 646833.41 | 116 | 22305 |
| 30-42 | F | 15 | 1933 | 393041.63 | 129 | 26203 |
| 30-42 | M | 16 | 2092 | 414458.57 | 131 | 25904 |
| 42-54 | F | 7 | 623 | 122969.98 | 89 | 17567 |
| 42-54 | M | 9 | 699 | 202144.12 | 78 | 22460 |
| 54> | F | 1 | 88 | 10587.42 | 88 | 10587 |
| 54> | M | 2 | 136 | 29525.47 | 68 | 14763 |
For the duration of 91 days, we can notice that time series debit and credit transaction activities observed on hourly, weekly and daily scales appear to have a very strong pattern of seasonality. The most engaged hour of the day is between 8-10 am, immediately following a serious decline and then increases again between 1-5 pm with a significant pick around 8 pm. It seems that hourly, weekly and daily seasonal variations are regularly repeating movements in these time series data values that can be easily considered as recurring events. in addition, these regular fluctuations are repeated from week to week with about the same timing and level of intensity. The greatest number of weekly seasonal pattern is on Friday including debit transactions with momentous on Saturday when customers are most active (shopping, entertainment, travelling).
# Visualize transaction volume over an average hour
average_hour_transactions <- anz_data %>%
select(date,hour) %>%
group_by(date,hour) %>%
dplyr::summarise(trans_vol = n()) %>%
group_by(hour) %>%
dplyr::summarise(trans_vol_per_hr = mean(trans_vol,na.rm = TRUE))
# Animation zero: line plot of average number of transactions by an hour
animation_zero <- ggplot(average_hour_transactions, aes(x = hour, y = trans_vol_per_hr)) +
geom_line(color = "white") +
geom_point(color = "white") +
xlab("") + ylab("") +
labs(title = "Average Transaction Volume by Hour at ANZ") +
theme_ipsum(grid = FALSE, ticks = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme(plot.background = element_rect(fill = "dodgerblue2"),
axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
axis.text.y = element_text(color = "white"),
plot.title = element_text(color = "white",size = 15)) +
transition_reveal(hour)
animate(animation_zero, renderer = gifski_renderer())
# Visualize average transaction volume per day
daily_amount_transactions <- anz_data %>%
group_by(day = floor_date(date, "day")) %>%
summarise(avg_daily_trans = sum(amount))
# Animation first: line plot of overall transaction amount by day
animation_first <- ggplot(daily_amount_transactions, aes(x = day, y = avg_daily_trans)) +
geom_line(color = "white") +
geom_point(color = "white") +
xlab("") + ylab("") +
labs(title = "Average Daily Transaction Amount at ANZ") +
theme_ipsum(grid = FALSE, ticks = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme(plot.background = element_rect(fill = "dodgerblue2"),
axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
axis.text.y = element_text(color = "white"),
plot.title = element_text(color = "white",size = 15)) +
transition_reveal(day)
animate(animation_first, renderer = gifski_renderer())
# Visualize average number of transactions per day
number_transactions_day <- anz_data %>%
group_by(day = floor_date(date, "day")) %>%
summarise(avg_num_trans = sum(n()))
# Animation second: line plot of average number of transactions per day
animation_second <- ggplot(number_transactions_day, aes(day, avg_num_trans)) +
geom_line(color = "white") +
geom_point(color = "white") +
xlab("") + ylab("") +
labs(title = "Average Daily Number of Transactions at ANZ") +
theme_ipsum(grid = FALSE, ticks = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme(plot.background = element_rect(fill = "dodgerblue2"),
axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
axis.text.y = element_text(color = "white"),
plot.title = element_text(color = "white",size = 15)) +
transition_reveal(day)
animate(animation_second, renderer = gifski_renderer())
# Visualise transaction volume over an average week
average_weekly_transactions <- anz_data %>%
select(date,weekday) %>%
group_by(date,weekday) %>%
summarise(daily_avg_vol = n()) %>%
group_by(weekday) %>%
summarise(avg_vol = mean(daily_avg_vol,na.rm = TRUE ))
average_weekly_transactions$weekday <- as.numeric(average_weekly_transactions$weekday)
# Animation third: average number of transactions by weekday
animation_third <- ggplot(average_weekly_transactions,aes(x = weekday, y = avg_vol)) +
geom_line(aes(group = 1), color = "white") +
geom_point(color = "white") +
xlab("") + ylab("") +
labs(title = "Average Weekday Number of Transactions at ANZ") +
theme_ipsum(grid = FALSE) +
scale_x_continuous(breaks = 1:7,
labels = c("Monday","Tuesday",
"Wednesday","Thursday",
"Friday", "Saturday", "Sunday")) +
theme(plot.background = element_rect(fill = "dodgerblue2"),
axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
axis.text.y = element_text(color = "white"),
plot.title = element_text(color = "white",size = 15)) +
transition_reveal(weekday)
animate(animation_third, renderer = gifski_renderer())
# Visualise debit transaction volume over an average week
average_weekly_debit_transactions <- data_debit %>%
select(date,weekday) %>%
group_by(date,weekday) %>%
summarise(daily_avg_debit_vol = n()) %>%
group_by(weekday) %>%
summarise(avg_debit_vol = mean(daily_avg_debit_vol,na.rm = TRUE ))
average_weekly_debit_transactions$weekday <- as.numeric(average_weekly_debit_transactions$weekday)
# Animation forth: debit volume over an average week
animation_forth <- ggplot(average_weekly_debit_transactions,aes(x = weekday, y = avg_debit_vol)) +
geom_line(aes(group = 1), color = "white") +
geom_point(color = "white") +
xlab("") + ylab("") +
labs(title = "Average Weekday Number of Debit Transactions at ANZ") +
theme_ipsum(grid = FALSE) +
scale_x_continuous(breaks = 1:7, labels = c("Monday",
"Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday")) +
theme(plot.background = element_rect(fill = "dodgerblue2"),
axis.text.x = element_text(color = "white", angle = 60, hjust = 1),
axis.text.y = element_text(color = "white"),
plot.title = element_text(color = "white",size = 15)) +
transition_reveal(weekday)
animate(animation_forth, renderer = gifski_renderer())
# Visualize monthly transaction volume by customer
monthly_average_cus <- amount_clean %>%
group_by(cus_id) %>%
summarise(mon_avg_vol = round(n()/3,0))
hist(monthly_average_cus$mon_avg_vol,col = rgb(0,0,0,0.5),
xlab = 'Monthly transaction volume', ylab = 'Number of customers',
main = "")
# Exclude the single foreign customer whose location information was incorrectly
# stored (i.e latitude 573)
anz_data_location <- anz_data %>%
filter(c_long > 113 & c_long < 154 & c_lat > (-44) & c_lat < (-10))
# Select variables and create a dataframe
anz_data_cust_merchant <- anz_data_location %>% select(cus_id, c_long, c_lat, m_long,m_lat) %>%
group_by(cus_id) %>% na.omit()
# Location of customers map
cus_loc <- unique(subset(anz_data_cust_merchant[,c("cus_id","c_long", "c_lat")],))
df_map = data.frame(longtitude = as.numeric(cus_loc$c_long),
latitude = cus_loc$c_lat <- as.numeric(cus_loc$c_lat))
coordinates(df_map) <- ~longtitude+latitude
cus_icon <- makeAwesomeIcon(icon = 'home', markerColor = 'green')
location_customer_map <- leaflet(df_map) %>% addMarkers() %>%
addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "CartoDB") %>%
addProviderTiles("Esri", group = "Esri") %>%
addLayersControl(baseGroups = c("OSM", "CartoDB", "Esri")) %>%
addAwesomeMarkers(
lng = as.numeric(cus_loc$c_long), lat = as.numeric(cus_loc$c_lat),
icon = cus_icon)
location_customer_map
# We could also see the distribution of distance between a customer and the merchange he/she trades with
anz_data_loc = anz_data_cust_merchant[,c("c_long", "c_lat","m_long", "m_lat")]
anz_data_loc$dst <- distHaversine(anz_data_loc[, 1:2], anz_data_loc[, 3:4]) / 1000
hist(anz_data_loc$dst[anz_data_loc$dst < 100], col = rgb(0,0,0,0.5), main = "Distance Between Customer and Merchants",xlab = 'Distance (km)' )
# To validate, we could further plot the location of the customer and the merchants he/she trades with on a map. This map shows merchants only for one customer with customer id = 51506836.
merch_dist <- function(id){
### This function takes in a customer id and plot the location of the customer and all merchants he/she have traded with
cus_icon <- makeAwesomeIcon(icon = 'home', markerColor = 'green')
l = subset(anz_data_cust_merchant[,c("cus_id","m_long","m_lat")], cus_id == id)
l <- l[c("m_long","m_lat")]
cus_loc <- unique(subset(anz_data_cust_merchant[,c("cus_id","c_long", "c_lat")], cus_id == id))
df_map = data.frame(longtitude = as.numeric(l$m_long), latitude = l$m_lat <- as.numeric(l$m_lat))
coordinates(df_map) <- ~longtitude+latitude
leaflet(df_map) %>% addMarkers() %>% addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "CartoDB") %>%
addProviderTiles("Esri", group = "Esri") %>%
addLayersControl(baseGroups = c("OSM", "CartoDB", "Esri")) %>%
addAwesomeMarkers(
lng = as.numeric(cus_loc$c_long), lat = as.numeric(cus_loc$c_lat),
icon = cus_icon)
}
merch_dist(id = '51506836')