1. Import Data
head(ANZ_transactions)
## # A tibble: 6 x 23
## status card_present_fl… bpay_biller_code account currency long_lat
## <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 autho… 1 NA ACC-15… AUD 153.41 …
## 2 autho… 0 NA ACC-15… AUD 153.41 …
## 3 autho… 1 NA ACC-12… AUD 151.23 …
## 4 autho… 1 NA ACC-10… AUD 153.10 …
## 5 autho… 1 NA ACC-15… AUD 153.41 …
## 6 posted NA NA ACC-16… AUD 151.22 …
## # … with 17 more variables: txn_description <chr>, merchant_id <chr>,
## # merchant_code <dbl>, first_name <chr>, balance <dbl>, date <dttm>,
## # gender <chr>, age <dbl>, merchant_suburb <chr>, merchant_state <chr>,
## # extraction <chr>, amount <dbl>, transaction_id <chr>, country <chr>,
## # customer_id <chr>, merchant_long_lat <chr>, movement <chr>
dim(ANZ_transactions)
## [1] 12043 23
2. Completeness
diagnose(ANZ_transactions)%>%
filter(missing_percent !=0)%>%
arrange(desc(missing_percent))
## # A tibble: 7 x 6
## variables types missing_count missing_percent unique_count unique_rate
## <chr> <chr> <int> <dbl> <int> <dbl>
## 1 bpay_biller_co… numeric 11160 92.7 2 0.000166
## 2 merchant_code numeric 11160 92.7 2 0.000166
## 3 card_present_f… numeric 4326 35.9 3 0.000249
## 4 merchant_id charac… 4326 35.9 5726 0.475
## 5 merchant_suburb charac… 4326 35.9 1610 0.134
## 6 merchant_state charac… 4326 35.9 9 0.000747
## 7 merchant_long_… charac… 4326 35.9 2704 0.225
3. Overall Insights
3.1 Understanding the variables
summary(ANZ_transactions)
## status card_present_flag bpay_biller_code account
## Length:12043 Min. :0.000 Min. :0 Length:12043
## Class :character 1st Qu.:1.000 1st Qu.:0 Class :character
## Mode :character Median :1.000 Median :0 Mode :character
## Mean :0.803 Mean :0
## 3rd Qu.:1.000 3rd Qu.:0
## Max. :1.000 Max. :0
## NA's :4326 NA's :11160
## currency long_lat txn_description merchant_id
## Length:12043 Length:12043 Length:12043 Length:12043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## merchant_code first_name balance
## Min. :0 Length:12043 Min. : 0.24
## 1st Qu.:0 Class :character 1st Qu.: 3158.59
## Median :0 Mode :character Median : 6432.01
## Mean :0 Mean : 14704.20
## 3rd Qu.:0 3rd Qu.: 12465.94
## Max. :0 Max. :267128.52
## NA's :11160
## date gender age
## Min. :2018-08-01 00:00:00 Length:12043 Min. :18.00
## 1st Qu.:2018-08-24 00:00:00 Class :character 1st Qu.:22.00
## Median :2018-09-16 00:00:00 Mode :character Median :28.00
## Mean :2018-09-15 21:27:39 Mean :30.58
## 3rd Qu.:2018-10-09 00:00:00 3rd Qu.:38.00
## Max. :2018-10-31 00:00:00 Max. :78.00
##
## merchant_suburb merchant_state extraction amount
## Length:12043 Length:12043 Length:12043 Min. : 0.10
## Class :character Class :character Class :character 1st Qu.: 16.00
## Mode :character Mode :character Mode :character Median : 29.00
## Mean : 187.93
## 3rd Qu.: 53.66
## Max. :8835.98
##
## transaction_id country customer_id merchant_long_lat
## Length:12043 Length:12043 Length:12043 Length:12043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## movement
## Length:12043
## Class :character
## Mode :character
##
##
##
##
3.2 By money spent
3.2.1 Boxplot By Gender
ANZ_transactions %>%
ggplot(aes(gender,amount, col=gender))+
geom_boxplot()+
geom_point(alpha=0.3)+
scale_y_log10()

3.2.2 Distribution By Age and Gender
ANZ_transactions%>%
ggplot(aes(age,fill= gender))+
geom_histogram(bins=30, col="black", alpha=0.8)

3.2.3 Spent By Age
ANZ_transactions%>%
ggplot(aes(age,amount, col=gender))+
geom_point()+
scale_y_log10()

3.2.4 Amount By Description
ANZ_transactions %>%
ggplot(aes(txn_description,amount, col=gender))+
geom_boxplot()+
scale_y_log10()

3.2.5 Amount By Group of Ages
ANZ_transactions %>%
ggplot(aes(factor(age),amount))+
geom_boxplot()+
scale_y_log10()

3.2.6 Age segmentation
ANZ_transactions %>%
mutate(cuts=cut(age, c(0, 20, 25, 30, 35, 40, Inf))) %>%
group_by(cuts)%>%
ggplot(aes(cuts,amount))+
geom_col()

3.2.7 Consumers that spend the most
ANZ_transactions%>%
group_by(customer_id)%>%
summarize(gender= first(gender), age= first(age), amount=sum(amount))%>%
arrange(desc(amount))%>%
slice(1:10)%>%
knitr::kable(align = 'c', format = "markdown")
| CUS-2738291516 |
M |
35 |
45409.16 |
| CUS-3142625864 |
M |
43 |
42688.30 |
| CUS-1816693151 |
M |
40 |
40215.54 |
| CUS-2155701614 |
F |
35 |
37943.79 |
| CUS-261674136 |
M |
29 |
36786.13 |
| CUS-883482547 |
F |
19 |
36639.41 |
| CUS-4142663097 |
F |
34 |
36588.25 |
| CUS-527400765 |
F |
40 |
36543.61 |
| CUS-1196156254 |
F |
34 |
36050.72 |
| CUS-2031327464 |
M |
40 |
35832.97 |
3.2.8 Consumers with most transactions
ANZ_transactions%>%
group_by(customer_id)%>%
summarize(gender= first(gender), age= first(age),n=n() ,amount=sum(amount))%>%
arrange(desc(n))%>%
slice(1:10)%>%
knitr::kable(align = 'c', format = "markdown")
| CUS-2487424745 |
F |
26 |
578 |
26211.59 |
| CUS-2142601169 |
M |
38 |
303 |
23696.45 |
| CUS-3026014945 |
F |
27 |
292 |
29074.34 |
| CUS-3378712515 |
F |
24 |
260 |
22761.96 |
| CUS-1614226872 |
F |
40 |
259 |
19286.44 |
| CUS-1196156254 |
F |
34 |
245 |
36050.72 |
| CUS-2695611575 |
M |
37 |
239 |
21479.97 |
| CUS-51506836 |
M |
24 |
239 |
24100.75 |
| CUS-860700529 |
M |
30 |
233 |
18099.88 |
| CUS-1669695324 |
F |
25 |
230 |
23070.56 |
3.3 Total Averages per user
ANZ_transactions%>%
group_by(customer_id)%>%
summarize(n=n(),amount=sum(amount))%>%
summary()
## customer_id n amount
## Length:100 Min. : 25.00 Min. :10386
## Class :character 1st Qu.: 77.75 1st Qu.:16210
## Mode :character Median :109.50 Median :21451
## Mean :120.43 Mean :22633
## 3rd Qu.:141.75 3rd Qu.:27859
## Max. :578.00 Max. :45409
3.4 Timeliness of transactions
3.4.1 Per Day
PD1<-ANZ_transactions%>%
group_by(date)%>%
summarize(amount=sum(amount), n=n())%>%
mutate(weekday= weekdays(date, abbr = TRUE))%>%
ggplot(aes(date,amount))+
geom_point(aes(col=factor(weekday)))+
geom_path()+
ggtitle("Amount spent per day")
PD2<-ANZ_transactions%>%
group_by(date)%>%
summarize(amount=sum(amount), n=n())%>%
mutate(weekday= weekdays(date, abbr = TRUE))%>%
ggplot(aes(date,n))+
geom_point(aes(col=factor(weekday)))+
geom_path()+
ggtitle("Number of Transactions per day")
grid.arrange(PD1,PD2,ncol=1)

3.4.2 Per Week
PW1<-ANZ_transactions%>%
mutate(date=round_date(date, unit="week"))%>%
group_by(date)%>%
summarize(amount=sum(amount), n=n())%>%
ggplot(aes(date,amount))+
geom_point()+
geom_path()+
ggtitle("Amount spent per week")
PW2<-ANZ_transactions%>%
mutate(date=round_date(date, unit="week"))%>%
group_by(date)%>%
summarize(amount=sum(amount), n=n())%>%
ggplot(aes(date,amount))+
geom_point()+
geom_path()+
ggtitle("Number of Transactions per week")
grid.arrange(PW1,PW2,ncol=1)

Location
Split customer & merchant lat_long into individual columns
dfloc <-ANZ_transactions[,c("long_lat", "merchant_long_lat")]
dfloc <- dfloc %>% separate("long_lat", c("c_long","c_lat"), sep=' ')
dfloc<- dfloc %>% separate("merchant_long_lat", c("m_long", "m_lat"),sep=' ')
dfloc<- data.frame(sapply(dfloc, as.numeric))
ANZ_transactions <- cbind(ANZ_transactions,dfloc)
Checking transactions for those who don’t reside in Australia
df_temp <- ANZ_transactions %>%
filter (!(c_long >113 & c_long <154 & c_lat > (-44) & c_lat < (-10)))
length(unique(df_temp$customer_id))
## [1] 1
Distance between customer and merchant
#Exclude foreign customer
df_temp <- ANZ_transactions %>%
filter (c_long >113 & c_long <154 & c_lat > (-44) & c_lat < (-10))
#Transform long and lat to numbers
dfloc = df_temp [,c("c_long", "c_lat","m_long", "m_lat")]
dfloc<- data.frame(sapply(dfloc, as.numeric))
#Calculate the distance
library(geosphere)
dfloc$dst <- distHaversine(dfloc[, 1:2], dfloc[, 3:4]) / 1000
#Histogram of the results
hist(dfloc$dst[dfloc$dst<100], main = "Distance between customer and merchants",xlab= 'Distance
(km)' )

There is a correlation between the proximity of the customer with the merchant and the number of transactions performed
Plot locations on a map
library(leaflet)
library(sp)
#remove missing values
df_temp <- df_temp %>% filter(!is.na(merchant_long_lat))
merch_dist <- function (id ){
### This function takes in a customer Id and plot the location of the customer and all merchants he/she have traded with.
cus_icon<- makeAwesomeIcon(icon = 'home', markerColor = 'green')
l = subset (df_temp[,c("customer_id","m_long","m_lat")], customer_id == id)
l <- l[c("m_long","m_lat")]
#Home location
cus_loc <- unique(subset (df_temp[,c("customer_id","long_lat")], customer_id == id))
cus_loc <- cus_loc %>% separate("long_lat", c("long", "lat"),sep=' ')
#Merchants location
df_t = data.frame(longtitude = as.numeric(l$m_long), latitude = as.numeric(l$m_lat))
#Mapping function
coordinates(df_t) <- ~longtitude+latitude
leaflet(df_t) %>% #provide the dataset to function leaflet
setView(lng = as.numeric(cus_loc$long), lat=as.numeric(cus_loc$lat) , zoom = 11) %>% #starting point of the map
addMarkers() %>% #add the markers (position of the data)
addProviderTiles(providers$CartoDB.Positron,
options = providerTileOptions(opacity = 0.85)) %>% #theme of the map
addAwesomeMarkers(lng=as.numeric(cus_loc$long), lat=as.numeric(cus_loc$lat), #add the home marker
icon = cus_icon)
}
merch_dist(id ='CUS-51506836')