1. Import Data

head(ANZ_transactions)
## # A tibble: 6 x 23
##   status card_present_fl… bpay_biller_code account currency long_lat
##   <chr>             <dbl>            <dbl> <chr>   <chr>    <chr>   
## 1 autho…                1               NA ACC-15… AUD      153.41 …
## 2 autho…                0               NA ACC-15… AUD      153.41 …
## 3 autho…                1               NA ACC-12… AUD      151.23 …
## 4 autho…                1               NA ACC-10… AUD      153.10 …
## 5 autho…                1               NA ACC-15… AUD      153.41 …
## 6 posted               NA               NA ACC-16… AUD      151.22 …
## # … with 17 more variables: txn_description <chr>, merchant_id <chr>,
## #   merchant_code <dbl>, first_name <chr>, balance <dbl>, date <dttm>,
## #   gender <chr>, age <dbl>, merchant_suburb <chr>, merchant_state <chr>,
## #   extraction <chr>, amount <dbl>, transaction_id <chr>, country <chr>,
## #   customer_id <chr>, merchant_long_lat <chr>, movement <chr>
dim(ANZ_transactions)
## [1] 12043    23

2. Completeness

diagnose(ANZ_transactions)%>%
  filter(missing_percent !=0)%>%
  arrange(desc(missing_percent))
## # A tibble: 7 x 6
##   variables       types   missing_count missing_percent unique_count unique_rate
##   <chr>           <chr>           <int>           <dbl>        <int>       <dbl>
## 1 bpay_biller_co… numeric         11160            92.7            2    0.000166
## 2 merchant_code   numeric         11160            92.7            2    0.000166
## 3 card_present_f… numeric          4326            35.9            3    0.000249
## 4 merchant_id     charac…          4326            35.9         5726    0.475   
## 5 merchant_suburb charac…          4326            35.9         1610    0.134   
## 6 merchant_state  charac…          4326            35.9            9    0.000747
## 7 merchant_long_… charac…          4326            35.9         2704    0.225

3. Overall Insights

3.1 Understanding the variables

summary(ANZ_transactions)
##     status          card_present_flag bpay_biller_code   account         
##  Length:12043       Min.   :0.000     Min.   :0        Length:12043      
##  Class :character   1st Qu.:1.000     1st Qu.:0        Class :character  
##  Mode  :character   Median :1.000     Median :0        Mode  :character  
##                     Mean   :0.803     Mean   :0                          
##                     3rd Qu.:1.000     3rd Qu.:0                          
##                     Max.   :1.000     Max.   :0                          
##                     NA's   :4326      NA's   :11160                      
##    currency           long_lat         txn_description    merchant_id       
##  Length:12043       Length:12043       Length:12043       Length:12043      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  merchant_code    first_name           balance         
##  Min.   :0       Length:12043       Min.   :     0.24  
##  1st Qu.:0       Class :character   1st Qu.:  3158.59  
##  Median :0       Mode  :character   Median :  6432.01  
##  Mean   :0                          Mean   : 14704.20  
##  3rd Qu.:0                          3rd Qu.: 12465.94  
##  Max.   :0                          Max.   :267128.52  
##  NA's   :11160                                         
##       date                        gender               age       
##  Min.   :2018-08-01 00:00:00   Length:12043       Min.   :18.00  
##  1st Qu.:2018-08-24 00:00:00   Class :character   1st Qu.:22.00  
##  Median :2018-09-16 00:00:00   Mode  :character   Median :28.00  
##  Mean   :2018-09-15 21:27:39                      Mean   :30.58  
##  3rd Qu.:2018-10-09 00:00:00                      3rd Qu.:38.00  
##  Max.   :2018-10-31 00:00:00                      Max.   :78.00  
##                                                                  
##  merchant_suburb    merchant_state      extraction            amount       
##  Length:12043       Length:12043       Length:12043       Min.   :   0.10  
##  Class :character   Class :character   Class :character   1st Qu.:  16.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  29.00  
##                                                           Mean   : 187.93  
##                                                           3rd Qu.:  53.66  
##                                                           Max.   :8835.98  
##                                                                            
##  transaction_id       country          customer_id        merchant_long_lat 
##  Length:12043       Length:12043       Length:12043       Length:12043      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    movement        
##  Length:12043      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

3.2 By money spent

3.2.1 Boxplot By Gender

ANZ_transactions %>% 
  ggplot(aes(gender,amount, col=gender))+
  geom_boxplot()+
  geom_point(alpha=0.3)+
  scale_y_log10()

3.2.2 Distribution By Age and Gender

ANZ_transactions%>% 
  ggplot(aes(age,fill= gender))+
  geom_histogram(bins=30, col="black", alpha=0.8)

3.2.3 Spent By Age

ANZ_transactions%>% 
  ggplot(aes(age,amount, col=gender))+
  geom_point()+
  scale_y_log10()

3.2.4 Amount By Description

ANZ_transactions %>% 
  ggplot(aes(txn_description,amount, col=gender))+
  geom_boxplot()+
  scale_y_log10()

3.2.5 Amount By Group of Ages

ANZ_transactions %>% 
  ggplot(aes(factor(age),amount))+
  geom_boxplot()+
  scale_y_log10()

3.2.6 Age segmentation

ANZ_transactions %>%
  mutate(cuts=cut(age, c(0, 20, 25, 30, 35, 40, Inf))) %>%
  group_by(cuts)%>%
  ggplot(aes(cuts,amount))+
  geom_col()

3.2.7 Consumers that spend the most

ANZ_transactions%>%
  group_by(customer_id)%>%
  summarize(gender= first(gender), age= first(age), amount=sum(amount))%>%
  arrange(desc(amount))%>%
  slice(1:10)%>%
  knitr::kable(align = 'c', format = "markdown")
customer_id gender age amount
CUS-2738291516 M 35 45409.16
CUS-3142625864 M 43 42688.30
CUS-1816693151 M 40 40215.54
CUS-2155701614 F 35 37943.79
CUS-261674136 M 29 36786.13
CUS-883482547 F 19 36639.41
CUS-4142663097 F 34 36588.25
CUS-527400765 F 40 36543.61
CUS-1196156254 F 34 36050.72
CUS-2031327464 M 40 35832.97

3.2.8 Consumers with most transactions

ANZ_transactions%>%
  group_by(customer_id)%>%
  summarize(gender= first(gender), age= first(age),n=n() ,amount=sum(amount))%>%
  arrange(desc(n))%>%
  slice(1:10)%>%
  knitr::kable(align = 'c', format = "markdown")
customer_id gender age n amount
CUS-2487424745 F 26 578 26211.59
CUS-2142601169 M 38 303 23696.45
CUS-3026014945 F 27 292 29074.34
CUS-3378712515 F 24 260 22761.96
CUS-1614226872 F 40 259 19286.44
CUS-1196156254 F 34 245 36050.72
CUS-2695611575 M 37 239 21479.97
CUS-51506836 M 24 239 24100.75
CUS-860700529 M 30 233 18099.88
CUS-1669695324 F 25 230 23070.56

3.3 Total Averages per user

ANZ_transactions%>%
  group_by(customer_id)%>%
  summarize(n=n(),amount=sum(amount))%>%
  summary()
##  customer_id              n              amount     
##  Length:100         Min.   : 25.00   Min.   :10386  
##  Class :character   1st Qu.: 77.75   1st Qu.:16210  
##  Mode  :character   Median :109.50   Median :21451  
##                     Mean   :120.43   Mean   :22633  
##                     3rd Qu.:141.75   3rd Qu.:27859  
##                     Max.   :578.00   Max.   :45409

3.4 Timeliness of transactions

3.4.1 Per Day

PD1<-ANZ_transactions%>%
  group_by(date)%>%
  summarize(amount=sum(amount), n=n())%>%
  mutate(weekday= weekdays(date, abbr = TRUE))%>%
  ggplot(aes(date,amount))+
  geom_point(aes(col=factor(weekday)))+
  geom_path()+
  ggtitle("Amount spent per day")
PD2<-ANZ_transactions%>%
  group_by(date)%>%
  summarize(amount=sum(amount), n=n())%>%
  mutate(weekday= weekdays(date, abbr = TRUE))%>%
  ggplot(aes(date,n))+
  geom_point(aes(col=factor(weekday)))+
  geom_path()+
  ggtitle("Number of Transactions per day")
grid.arrange(PD1,PD2,ncol=1)

3.4.2 Per Week

PW1<-ANZ_transactions%>%
  mutate(date=round_date(date, unit="week"))%>%
  group_by(date)%>%
  summarize(amount=sum(amount), n=n())%>%
  ggplot(aes(date,amount))+
  geom_point()+
  geom_path()+
  ggtitle("Amount spent per week")
PW2<-ANZ_transactions%>%
  mutate(date=round_date(date, unit="week"))%>%
  group_by(date)%>%
  summarize(amount=sum(amount), n=n())%>%
  ggplot(aes(date,amount))+
  geom_point()+
  geom_path()+
  ggtitle("Number of Transactions per week")
grid.arrange(PW1,PW2,ncol=1)

Location

Split customer & merchant lat_long into individual columns

dfloc <-ANZ_transactions[,c("long_lat", "merchant_long_lat")]
dfloc <- dfloc %>% separate("long_lat", c("c_long","c_lat"), sep=' ')
dfloc<- dfloc %>% separate("merchant_long_lat", c("m_long", "m_lat"),sep=' ')
dfloc<- data.frame(sapply(dfloc, as.numeric))
ANZ_transactions <- cbind(ANZ_transactions,dfloc)

Checking transactions for those who don’t reside in Australia

df_temp <- ANZ_transactions %>%
  filter (!(c_long >113 & c_long <154 & c_lat > (-44) & c_lat < (-10)))
length(unique(df_temp$customer_id))
## [1] 1

Distance between customer and merchant

#Exclude foreign customer
df_temp <- ANZ_transactions %>%
  filter (c_long >113 & c_long <154 & c_lat > (-44) & c_lat < (-10))
#Transform long and lat to numbers
dfloc = df_temp [,c("c_long", "c_lat","m_long", "m_lat")]
dfloc<- data.frame(sapply(dfloc, as.numeric))

#Calculate the distance
library(geosphere)
dfloc$dst <- distHaversine(dfloc[, 1:2], dfloc[, 3:4]) / 1000

#Histogram of the results
hist(dfloc$dst[dfloc$dst<100], main = "Distance between customer and merchants",xlab= 'Distance
 (km)' )

There is a correlation between the proximity of the customer with the merchant and the number of transactions performed

Plot locations on a map

library(leaflet)
library(sp)

#remove missing values
df_temp <- df_temp %>% filter(!is.na(merchant_long_lat))
  
merch_dist <- function (id ){
### This function takes in a customer Id and plot the location of the customer and all merchants he/she have traded with.
  cus_icon<- makeAwesomeIcon(icon = 'home', markerColor = 'green')
  l = subset (df_temp[,c("customer_id","m_long","m_lat")], customer_id == id)
  l <- l[c("m_long","m_lat")]

#Home location
  cus_loc <- unique(subset (df_temp[,c("customer_id","long_lat")], customer_id == id))
  cus_loc <- cus_loc %>% separate("long_lat", c("long", "lat"),sep=' ')

#Merchants location
  df_t = data.frame(longtitude = as.numeric(l$m_long), latitude = as.numeric(l$m_lat))

#Mapping function
  coordinates(df_t) <- ~longtitude+latitude
  leaflet(df_t) %>%  #provide the dataset to function leaflet
    setView(lng = as.numeric(cus_loc$long), lat=as.numeric(cus_loc$lat) , zoom = 11) %>% #starting point of the map
    addMarkers() %>% #add the markers (position of the data)
    addProviderTiles(providers$CartoDB.Positron, 
                     options = providerTileOptions(opacity = 0.85)) %>% #theme of the map
    addAwesomeMarkers(lng=as.numeric(cus_loc$long), lat=as.numeric(cus_loc$lat), #add the home marker
                      icon = cus_icon)
}

merch_dist(id ='CUS-51506836')