rm(list=ls())
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v readr   1.3.1
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(visdat)
## Warning: package 'visdat' was built under R version 3.6.3
library(ggplot2)
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
ANZ_dataset <- read_excel("D:/ANZ/ANZ synthesised transaction dataset.xlsx")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3052 / R3052C3: got 'THE DISCOUNT CHEMIST GROUP'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C4360 / R4360C3: got 'LAND WATER & PLANNING East Melbourne'
attach(ANZ_dataset)
#View(ANZ_dataset)
ANZ_dataset$date<- as.Date(date)
str(ANZ_dataset)
## tibble [12,043 x 23] (S3: tbl_df/tbl/data.frame)
##  $ status           : chr [1:12043] "authorized" "authorized" "authorized" "authorized" ...
##  $ card_present_flag: num [1:12043] 1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
##  $ account          : chr [1:12043] "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr [1:12043] "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr [1:12043] "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr [1:12043] "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr [1:12043] "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr [1:12043] "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num [1:12043] 35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : Date[1:12043], format: "2018-08-01" "2018-08-01" ...
##  $ gender           : chr [1:12043] "F" "F" "M" "F" ...
##  $ age              : num [1:12043] 26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr [1:12043] "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : chr [1:12043] "QLD" "NSW" "NSW" "QLD" ...
##  $ extraction       : chr [1:12043] "2018-08-01T01:01:15.000+0000" "2018-08-01T01:13:45.000+0000" "2018-08-01T01:26:15.000+0000" "2018-08-01T01:38:45.000+0000" ...
##  $ amount           : num [1:12043] 16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr [1:12043] "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr [1:12043] "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr [1:12043] "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr [1:12043] "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : chr [1:12043] "debit" "debit" "debit" "debit" ...
summary(ANZ_dataset)
##     status          card_present_flag bpay_biller_code   account         
##  Length:12043       Min.   :0.000     Min.   :0        Length:12043      
##  Class :character   1st Qu.:1.000     1st Qu.:0        Class :character  
##  Mode  :character   Median :1.000     Median :0        Mode  :character  
##                     Mean   :0.803     Mean   :0                          
##                     3rd Qu.:1.000     3rd Qu.:0                          
##                     Max.   :1.000     Max.   :0                          
##                     NA's   :4326      NA's   :11160                      
##    currency           long_lat         txn_description    merchant_id       
##  Length:12043       Length:12043       Length:12043       Length:12043      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  merchant_code    first_name           balance               date           
##  Min.   :0       Length:12043       Min.   :     0.24   Min.   :2018-08-01  
##  1st Qu.:0       Class :character   1st Qu.:  3158.59   1st Qu.:2018-08-24  
##  Median :0       Mode  :character   Median :  6432.01   Median :2018-09-16  
##  Mean   :0                          Mean   : 14704.20   Mean   :2018-09-15  
##  3rd Qu.:0                          3rd Qu.: 12465.94   3rd Qu.:2018-10-09  
##  Max.   :0                          Max.   :267128.52   Max.   :2018-10-31  
##  NA's   :11160                                                              
##     gender               age        merchant_suburb    merchant_state    
##  Length:12043       Min.   :18.00   Length:12043       Length:12043      
##  Class :character   1st Qu.:22.00   Class :character   Class :character  
##  Mode  :character   Median :28.00   Mode  :character   Mode  :character  
##                     Mean   :30.58                                        
##                     3rd Qu.:38.00                                        
##                     Max.   :78.00                                        
##                                                                          
##   extraction            amount        transaction_id       country         
##  Length:12043       Min.   :   0.10   Length:12043       Length:12043      
##  Class :character   1st Qu.:  16.00   Class :character   Class :character  
##  Mode  :character   Median :  29.00   Mode  :character   Mode  :character  
##                     Mean   : 187.93                                        
##                     3rd Qu.:  53.66                                        
##                     Max.   :8835.98                                        
##                                                                            
##  customer_id        merchant_long_lat    movement        
##  Length:12043       Length:12043       Length:12043      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 

The error obtained while reading the file is because of unexpected values in biller_code column.

#convert character to factor columns

fact_cols <- names(dplyr::select_if(subset(ANZ_dataset,select = -c(account,long_lat,merchant_id,first_name,extraction,transaction_id,customer_id,merchant_long_lat)), is_character))
ANZ_dataset[,fact_cols] <- lapply(ANZ_dataset[,fact_cols], factor)
str(ANZ_dataset)
## tibble [12,043 x 23] (S3: tbl_df/tbl/data.frame)
##  $ status           : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
##  $ card_present_flag: num [1:12043] 1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
##  $ account          : chr [1:12043] "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : Factor w/ 1 level "AUD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ long_lat         : chr [1:12043] "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : Factor w/ 6 levels "INTER BANK","PAY/SALARY",..: 5 6 5 6 6 3 6 5 5 1 ...
##  $ merchant_id      : chr [1:12043] "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : num [1:12043] NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr [1:12043] "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num [1:12043] 35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : Date[1:12043], format: "2018-08-01" "2018-08-01" ...
##  $ gender           : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
##  $ age              : num [1:12043] 26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : Factor w/ 1609 levels "Abbotsford","Aberdeen",..: 53 1380 1380 215 902 NA 709 893 1602 NA ...
##  $ merchant_state   : Factor w/ 8 levels "ACT","NSW","NT",..: 4 2 2 4 4 NA 7 7 8 NA ...
##  $ extraction       : chr [1:12043] "2018-08-01T01:01:15.000+0000" "2018-08-01T01:13:45.000+0000" "2018-08-01T01:26:15.000+0000" "2018-08-01T01:38:45.000+0000" ...
##  $ amount           : num [1:12043] 16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr [1:12043] "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : Factor w/ 1 level "Australia": 1 1 1 1 1 1 1 1 1 1 ...
##  $ customer_id      : chr [1:12043] "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr [1:12043] "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
vis_dat(ANZ_dataset)

bpay_biller_code and merchant_code has 92.67% missingness in data and hence can be removed From the vismiss plot, we can observe that for the rest of the columns containing missing values, all values missing are from the same rows and all these columns have 35.92% of missing data (4326 rows)

ANZ_dataset <- subset(ANZ_dataset, select = -c(bpay_biller_code,merchant_code))
lapply(ANZ_dataset,function(x) { length(which(is.na(x)))})
## $status
## [1] 0
## 
## $card_present_flag
## [1] 4326
## 
## $account
## [1] 0
## 
## $currency
## [1] 0
## 
## $long_lat
## [1] 0
## 
## $txn_description
## [1] 0
## 
## $merchant_id
## [1] 4326
## 
## $first_name
## [1] 0
## 
## $balance
## [1] 0
## 
## $date
## [1] 0
## 
## $gender
## [1] 0
## 
## $age
## [1] 0
## 
## $merchant_suburb
## [1] 4326
## 
## $merchant_state
## [1] 4326
## 
## $extraction
## [1] 0
## 
## $amount
## [1] 0
## 
## $transaction_id
## [1] 0
## 
## $country
## [1] 0
## 
## $customer_id
## [1] 0
## 
## $merchant_long_lat
## [1] 4326
## 
## $movement
## [1] 0
head(ANZ_dataset)
ANZ_dataset %>% filter(is.na(card_present_flag)) %>% filter(status=="posted") %>% count()
ANZ_dataset  %>% filter(status=="posted") %>% count()

We can verify from above code that the missing values for the 5 columns are when the status is “posted” Data does not need to be cleansed as merchant details will be NA for posted transactions ##Explanatory analysis There are 100 customer out of which 20 has the same customer first name

unique(ANZ_dataset$first_name)
##  [1] "Diana"       "Michael"     "Rhonda"      "Robert"      "Kristin"    
##  [6] "Tonya"       "Fernando"    "Isaiah"      "Ricky"       "Jeffrey"    
## [11] "Patrick"     "Karen"       "Ruth"        "Kimberly"    "Joseph"     
## [16] "Tiffany"     "Emily"       "Christine"   "Ryan"        "Michelle"   
## [21] "Richard"     "Jessica"     "Ronald"      "Kaitlyn"     "Lori"       
## [26] "Virginia"    "Andrew"      "Susan"       "Luis"        "Gregory"    
## [31] "Barry"       "Daniel"      "Renee"       "Amy"         "Christopher"
## [36] "Marissa"     "Eric"        "Natasha"     "Edward"      "Craig"      
## [41] "Sandra"      "Debra"       "Michele"     "Antonio"     "Tyler"      
## [46] "Lucas"       "Jonathan"    "Matthew"     "James"       "Mackenzie"  
## [51] "Linda"       "Dustin"      "Heather"     "Derek"       "Scott"      
## [56] "Charles"     "Tim"         "Melissa"     "Darren"      "Jacqueline" 
## [61] "Cindy"       "Stephanie"   "Rachael"     "Mary"        "Maria"      
## [66] "Timothy"     "Nathaniel"   "Elizabeth"   "Paul"        "Sarah"      
## [71] "Alexander"   "Donald"      "Kenneth"     "Ashley"      "Catherine"  
## [76] "Billy"       "Abigail"     "Brian"       "David"       "Robin"

Michael is the most common first name, and has 6 different account users

ANZ_dataset %>% group_by(first_name)%>% summarise(unique(account)) %>% count() %>% filter(n>1) 
## `summarise()` regrouping output by 'first_name' (override with `.groups` argument)
#Defining Age Group
ANZ_dataset1 <-mutate(ANZ_dataset,Age_group = ifelse(age <= 30 ,'teenage',ifelse(age >30 & age <50, 'midage','old')))
#monthly salary
ANZ_dataset2<-ANZ_dataset1 %>% group_by(account,first_name,format(date, "%m"),format(date, "%Y"),movement)%>% summarise(sum(amount))
## `summarise()` regrouping output by 'account', 'first_name', 'format(date, "%m")', 'format(date, "%Y")' (override with `.groups` argument)
ANZ_dataset2<- ANZ_dataset2 %>% rename( month = "format(date, \"%m\")" ,year = "format(date, \"%Y\")",total_amount="sum(amount)" )

#Total number of Transactions per month excluding credit
AnZ2 <-ANZ_dataset1 %>% filter(movement!='credit')%>% group_by(account,first_name,format(date, "%m/%Y"),Age_group,gender)%>% count()
#outlier analysis
boxplot(AnZ2$n ~ AnZ2$Age_group,xlab = "Age Group",ylab = "Number of Transactions", main = "Monthy Transactions by age group")

out <-boxplot.stats(AnZ2$n)$out
out_ind <- which(AnZ2$n %in% c(out))
AnZ2[out_ind, ]
boxplot(AnZ2$n ~ AnZ2$gender,xlab = "Gender",ylab = "Number of Transactions", main = "Monthy Transactions by Gender")

ANZ_cred<-ANZ_dataset1 %>%filter(movement=='credit')%>% group_by(account,first_name,format(date, "%m/%Y")) %>%summarise(sum(amount)) %>%rename(credit="sum(amount)") 
## `summarise()` regrouping output by 'account', 'first_name' (override with `.groups` argument)
ANZ_debit<-ANZ_dataset1 %>%filter(movement=='debit')%>% group_by(account,first_name,format(date, "%m/%Y")) %>%summarise(sum(amount)) %>%rename(debit="sum(amount)") 
## `summarise()` regrouping output by 'account', 'first_name' (override with `.groups` argument)
ANZ_sal<-merge(ANZ_cred,ANZ_debit)
ANZ_sal2<-ANZ_sal %>% mutate(avg_savings_monthly = credit - debit) %>% rename(date ="format(date, \"%m/%Y\")")
ANZ_sal2 %>% ggplot(mapping = aes(x=date,y=avg_savings_monthly,label=first_name))+geom_point()+ geom_text()

There seems to be a number of outliers in below 30 age group. Further analysis shows that Diana and Michael have large number of transactions consistently (all 3 months) in their age group while Jeffery,Richard,Kimberly,Jessica and Rhonda has large number of transactions only in a particular month. Tonya has large number of transactions for 2 months From the boxplot, we can observe that Females tend to spend higher than males and has greater extent of distribution compared to males.

#########cluster analysis
ANZ_sal3<-ANZ_sal2%>% group_by(first_name,account) %>% summarise(credit=sum(credit),debit = sum(debit),savings=sum(avg_savings_monthly))
## `summarise()` regrouping output by 'first_name' (override with `.groups` argument)
str(ANZ_sal3)
## tibble [100 x 5] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ first_name: chr [1:100] "Abigail" "Alexander" "Amy" "Andrew" ...
##  $ account   : chr [1:100] "ACC-2528867619" "ACC-1890605467" "ACC-414431115" "ACC-721712940" ...
##  $ credit    : num [1:100] 25305 21897 9782 9389 15385 ...
##  $ debit     : num [1:100] 4407 5801 4981 1456 6757 ...
##  $ savings   : num [1:100] 20899 16096 4801 7933 8628 ...
##  - attr(*, "groups")= tibble [80 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ first_name: chr [1:80] "Abigail" "Alexander" "Amy" "Andrew" ...
##   ..$ .rows     : list<int> [1:80] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int [1:2] 13 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int [1:2] 28 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int [1:2] 36 37
##   .. ..$ : int [1:2] 38 39
##   .. ..$ : int 40
##   .. ..$ : int [1:2] 41 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int [1:2] 45 46
##   .. ..$ : int [1:3] 47 48 49
##   .. ..$ : int 50
##   .. ..$ : int [1:2] 51 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..$ : int 59
##   .. ..$ : int 60
##   .. ..$ : int 61
##   .. ..$ : int [1:6] 62 63 64 65 66 67
##   .. ..$ : int 68
##   .. ..$ : int 69
##   .. ..$ : int 70
##   .. ..$ : int 71
##   .. ..$ : int 72
##   .. ..$ : int 73
##   .. ..$ : int 74
##   .. ..$ : int 75
##   .. ..$ : int 76
##   .. ..$ : int [1:3] 77 78 79
##   .. ..$ : int 80
##   .. ..$ : int [1:2] 81 82
##   .. ..$ : int 83
##   .. ..$ : int 84
##   .. ..$ : int 85
##   .. ..$ : int [1:2] 86 87
##   .. ..$ : int 88
##   .. ..$ : int 89
##   .. ..$ : int 90
##   .. ..$ : int 91
##   .. ..$ : int [1:2] 92 93
##   .. ..$ : int 94
##   .. ..$ : int 95
##   .. ..$ : int 96
##   .. ..$ : int 97
##   .. ..$ : int [1:2] 98 99
##   .. ..$ : int 100
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
names(dplyr::select_if(ANZ_sal3,is.numeric))
## [1] "first_name" "credit"     "debit"      "savings"
mydata <- scale(ANZ_sal3[,c("credit","debit","savings")])
rownames(mydata) <- ANZ_sal3$account
library(factoextra)
fit <- kmeans(mydata, 3)
fviz_cluster(fit, data = mydata)

## Map

library(ggmap)
## Warning: package 'ggmap' was built under R version 3.6.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tmap)
## Warning: package 'tmap' was built under R version 3.6.3
library(splitstackshape)
## Warning: package 'splitstackshape' was built under R version 3.6.3
str(ANZ_dataset1)
## tibble [12,043 x 22] (S3: tbl_df/tbl/data.frame)
##  $ status           : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
##  $ card_present_flag: num [1:12043] 1 0 1 1 1 NA 1 1 1 NA ...
##  $ account          : chr [1:12043] "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : Factor w/ 1 level "AUD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ long_lat         : chr [1:12043] "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : Factor w/ 6 levels "INTER BANK","PAY/SALARY",..: 5 6 5 6 6 3 6 5 5 1 ...
##  $ merchant_id      : chr [1:12043] "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ first_name       : chr [1:12043] "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num [1:12043] 35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : Date[1:12043], format: "2018-08-01" "2018-08-01" ...
##  $ gender           : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
##  $ age              : num [1:12043] 26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : Factor w/ 1609 levels "Abbotsford","Aberdeen",..: 53 1380 1380 215 902 NA 709 893 1602 NA ...
##  $ merchant_state   : Factor w/ 8 levels "ACT","NSW","NT",..: 4 2 2 4 4 NA 7 7 8 NA ...
##  $ extraction       : chr [1:12043] "2018-08-01T01:01:15.000+0000" "2018-08-01T01:13:45.000+0000" "2018-08-01T01:26:15.000+0000" "2018-08-01T01:38:45.000+0000" ...
##  $ amount           : num [1:12043] 16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr [1:12043] "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : Factor w/ 1 level "Australia": 1 1 1 1 1 1 1 1 1 1 ...
##  $ customer_id      : chr [1:12043] "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr [1:12043] "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Age_group        : chr [1:12043] "teenage" "teenage" "midage" "midage" ...
ANZ_map<-cSplit(ANZ_dataset1, 5, " ", drop=F) %>% rename(longitude_cust = long_lat_1, latitude_cust=long_lat_2)
ANZ_map<-cSplit(ANZ_map, 20, " ", drop=F) %>% rename(longitude_merch = merchant_long_lat_1, latitude_merch=merchant_long_lat_2)
ANZ_map%>% select("longitude_cust","latitude_cust")
ggmap::register_google(key = "AIzaSyBVRB4Dngod3xK-dXAGCTIBIhIg-Pnpl4Y")
myMap <- get_map(location = "Australia", zoom = 4)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Australia&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-dXAGCTIBIhIg-Pnpl4Y
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Australia&key=xxx-dXAGCTIBIhIg-Pnpl4Y
ggmap(myMap)  + geom_point(data = ANZ_map,aes(x = longitude_merch,y = latitude_merch,col='red')) + geom_point(data = ANZ_map%>% select("longitude_cust","latitude_cust"),aes(x = longitude_cust,y = latitude_cust))
## Warning: Removed 4326 rows containing missing values (geom_point).
## Warning: Removed 123 rows containing missing values (geom_point).

Most of the merchants are scattered throughout the Australia and Tasmania while users are in populated cities. This is as expected.